## 1. Import All Packages

In [17]:
%%time
import numpy as np
import math
import pandas as pd
import pickle, os, math
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from random import randint

Wall time: 0 ns


## 2. Read data

In [111]:
%%time
data_dir = os.path.join(os.curdir, 'Data', 'Score', 'data_pics.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
print(data_pics.shape)

(400001, 162)
Wall time: 1.06 s


As in the case of survival training, score training also has unbalanced class, the ratio of class 0 to 1 is almost 14:1.

In [154]:
X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=152)
print(X_train.shape)
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=15545)
print('class 0 has ' + str(len(y_train.index[y_train[0] == 0].tolist())) + ' points')
print('class 1 has ' + str(len(y_train.index[y_train[0] == 1].tolist())) + ' points')

(360000, 162)
class 0 has 335342 points
class 1 has 24658 points


To create a balanced dataset, use all class 1 and randomly select the same of number class 0 data.

In [113]:
index_0 = y_train.index[y_train[0] == 0].tolist()
index_1 = y_train.index[y_train[0] != 0].tolist()
index_0_comparable_to_1 = np.random.choice(index_0, math.floor(len(index_1) * 1))
samples = np.concatenate([index_1, index_0_comparable_to_1])
print(str(len(index_1)) + ' + ' + str(len(index_0_comparable_to_1)) + ' = ' + str(len(samples)))

24658 + 24658 = 49316


In [114]:
small_data = new_data_df.iloc[samples, :]
small_target = target_pics.iloc[samples, :]

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=1152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=8155)

Ravelling the target data.

In [115]:
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_train_small_m = np.ravel(y_train_train_small)
y_vali_small_m = np.ravel(y_vali_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)
y_train_train_m = np.ravel(y_train_train)
y_vali_m = np.ravel(y_vali)

## 3. Training

### Support Vector Classification

In [116]:
clf_sc = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_sc.fit(X_train_small, y_train_small_m)
clf_sc.score(X_test_small, y_test_small_m)

[LibSVM]

0.5237580263602568

In [117]:
ypred = clf_sc.predict(X_test_small)
print(sum(ypred) / len(ypred))
test = [idd for idd, value in enumerate(ypred) if value == 0 ]

0.9539033457249071


### Multi Layer Perceptor Classifier

MLPC apparently failed at generalizing the inner trend, all the predictions turne out to be all 0 or all 1.

In [127]:
mlpc_sc = MLPClassifier(hidden_layer_sizes=(100, 50, 40, 20),
                        alpha=0.15, max_iter=1000, batch_size=1000,
                        verbose=True, learning_rate_init=0.01, tol=1e-5,
                        learning_rate='adaptive')

mlpc_sc.fit(X_train_small, y_train_small_m)
mlpc_sc.score(X_test_small, y_test_small_m)

Iteration 1, loss = 5.89562739
Iteration 2, loss = 0.75283252
Iteration 3, loss = 0.71505717
Iteration 4, loss = 0.71154901
Iteration 5, loss = 0.71145618
Iteration 6, loss = 0.71114702
Iteration 7, loss = 0.71182472
Iteration 8, loss = 0.71251703
Iteration 9, loss = 0.71233927
Training loss did not improve more than tol=0.000010 for two consecutive epochs. Stopping.


0.5042244001351808

In [128]:
ypred = mlpc_sc.predict(X_test_small)
sum(ypred) / len(ypred)

0.6161541061169314

## 4. Training Results

Apparently the score is really low.

Try to clip the data to reduce the number of freedom to avoid over fitting

In [129]:
small_data.shape

(49316, 162)

In [134]:
np.max(small_data.iloc[:, 161])

72

In [145]:
%%time
new_data = []
window_size = 5
for i in tqdm(range(small_data.shape[0])):
    left = int(small_data.iloc[i, 161] - window_size)
    right = int(small_data.iloc[i, 161] + window_size)
    new_data.append(small_data.iloc[i, (2 * left):(2 * right + 2)].values.tolist() + small_data.iloc[i, 160:].values.tolist())
new_data_df = pd.DataFrame(new_data)

100%|███████████████████████████████████| 49316/49316 [01:29<00:00, 551.73it/s]


Wall time: 1min 29s


In [146]:
new_data_df.shape

(49316, 24)

In [152]:
left_ind = 2 * (small_data.iloc[:, 161] - window_size)
right_ind = 2 * (small_data.iloc[:, 161] + window_size) + 2

In [155]:
small_data.shape

(49316, 162)

In [157]:
new_data_df

X_train_small_short, X_test_small_short, y_train_small_short, y_test_small_short = train_test_split(
    new_data_df, small_target, test_size=0.3, random_state=1152)
X_train_train_small_short, X_vali_small_short, y_train_train_small_short, y_vali_small_short = train_test_split(
    X_train_small_short, y_train_small_short, test_size=0.3, random_state=8155)

y_train_small_short_m = np.ravel(y_train_small_short)
y_test_small_short_m = np.ravel(y_test_small_short)
y_train_train_small_short_m = np.ravel(y_train_train_small_short)
y_vali_small_short_m = np.ravel(y_vali_small_short)

In [158]:
clf_sc_short = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_sc_short.fit(X_train_small_short, y_train_small_short_m)
clf_sc_short.score(X_test_small_short, y_test_small_short_m)

[LibSVM]

0.5891855356539372

## 5. Basic

Including three datasets, the average accuracy can be 68% ~ 69%

Including only 68%, the avearge accuracy is 68% ~ 69%

Including 68% (2 freq) and 8 freq, acc: array([0.68019124, 0.7070681 , 0.70078842, 0.70259791, 0.7028564 ])

Including 68% and 4 freq, acc: array([0.67455247, 0.65475605, 0.67181467, 0.65637066, 0.67401531])

Including 8 freq and 4 freq: array([0.57640449, 0.59867211, 0.60010215, 0.58804903, 0.63019716])

Including 1 freq (fires1), and 68% and 8 freq: array([0.74909654, 0.75905688, 0.76051975, 0.77342742, 0.73924269]) 

In [103]:
%%time

data_dir = os.path.join(os.curdir, 'Data', 'Score', '400000_68per', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics_0 = ot['data']
target_pics_0 = ot['target']
print(data_pics_0.shape)

data_dir = os.path.join(os.curdir, 'Data', 'Score', '800000_4freq', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics_1 = ot['data']
target_pics_1 = ot['target']

data_dir = os.path.join(os.curdir, 'Data', 'Score', '400000_1freq_fires4', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics_2 = ot['data']
target_pics_2 = ot['target']

data_dir = os.path.join(os.curdir, 'Data', 'Score', '400000_1freq_fires1', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics_3 = ot['data']
target_pics_3 = ot['target']
print(data_pics_3.shape)


data_dir = os.path.join(os.curdir, 'Data', 'Score', '800000_8freq', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
print(data_pics.shape)

'''
data_pics = data_pics.append(data_pics_0)
target_pics = target_pics.append(target_pics_0)
'''

'''
data_pics = data_pics.append(data_pics_1)
target_pics = target_pics.append(target_pics_1)

data_pics = data_pics.append(data_pics_1)
target_pics = target_pics.append(target_pics_1)
'''

data_pics = data_pics.append(data_pics_0).append(data_pics_3)
target_pics = target_pics.append(target_pics_0).append(target_pics_3)


data_pics.reset_index(inplace=True, drop=True)
target_pics.reset_index(inplace=True, drop=True)

print(data_pics.shape)

(90029, 221)
(41569, 221)
(599671, 221)
(731269, 221)
Wall time: 13.1 s


In [104]:
X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=randint(100, 10000))
print(X_train.shape)
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=randint(100, 10000))
print('class 0 has ' + str(len(y_train.index[y_train[0] == 0].tolist())) + ' points')
print('class 1 has ' + str(len(y_train.index[y_train[0] == 1].tolist())) + ' points')

index_0 = y_train.index[y_train[0] == 0].tolist()
index_1 = y_train.index[y_train[0] != 0].tolist()
index_0_comparable_to_1 = np.random.choice(index_0, math.floor(len(index_1) * 1))
samples = np.concatenate([index_1, index_0_comparable_to_1])
print(str(len(index_1)) + ' + ' + str(len(index_0_comparable_to_1)) + ' = ' + str(len(samples)))


small_data = data_pics.iloc[samples, :]
small_target = target_pics.iloc[samples, :]

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=randint(100, 10000))
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=randint(100, 10000))

y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_train_small_m = np.ravel(y_train_train_small)
y_vali_small_m = np.ravel(y_vali_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)
y_train_train_m = np.ravel(y_train_train)
y_vali_m = np.ravel(y_vali)

(658142, 221)
class 0 has 616562 points
class 1 has 41580 points
41580 + 41580 = 83160


In [105]:
x = small_target.values.tolist()
x = np.ravel(x)
print(x)
np.bincount(x)

[1 1 1 ... 0 0 0]


array([41580, 41580], dtype=int64)

In [106]:
mlpc_st = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32, 16, 8),
                        alpha=0.15, max_iter=1000, batch_size=2000,
                        verbose=True, learning_rate_init=0.01, tol=1e-5,
                        learning_rate='adaptive')

mlpc_st.fit(X_train_small, y_train_small_m)
mlpc_st.score(X_test_small, y_test_small_m)

Iteration 1, loss = 0.73381342
Iteration 2, loss = 0.55798219
Iteration 3, loss = 0.55066806
Iteration 4, loss = 0.54355051
Iteration 5, loss = 0.54223324
Iteration 6, loss = 0.53048772
Iteration 7, loss = 0.52053622
Iteration 8, loss = 0.51047660
Iteration 9, loss = 0.51031375
Iteration 10, loss = 0.49080060
Iteration 11, loss = 0.48963562
Iteration 12, loss = 0.47814060
Iteration 13, loss = 0.47915779
Iteration 14, loss = 0.47466764
Iteration 15, loss = 0.47171432
Iteration 16, loss = 0.46654419
Iteration 17, loss = 0.46268823
Iteration 18, loss = 0.46066742
Iteration 19, loss = 0.46012901
Iteration 20, loss = 0.45986168
Iteration 21, loss = 0.45422075
Iteration 22, loss = 0.44955791
Iteration 23, loss = 0.44995637
Iteration 24, loss = 0.44651431
Iteration 25, loss = 0.45177304
Iteration 26, loss = 0.44812825
Iteration 27, loss = 0.44392855
Iteration 28, loss = 0.44494537
Iteration 29, loss = 0.44041646
Iteration 30, loss = 0.43742769
Iteration 31, loss = 0.43657494
Iteration 32, los

0.7557319223985891

In [107]:
cross_val_score(mlpc_st, X_train_small, y_train_small_m, cv=5)

Iteration 1, loss = 0.72594199
Iteration 2, loss = 0.57145892
Iteration 3, loss = 0.54991985
Iteration 4, loss = 0.54795765
Iteration 5, loss = 0.54219803
Iteration 6, loss = 0.53751368
Iteration 7, loss = 0.53595661
Iteration 8, loss = 0.52083960
Iteration 9, loss = 0.51265405
Iteration 10, loss = 0.49986759
Iteration 11, loss = 0.48591920
Iteration 12, loss = 0.48048876
Iteration 13, loss = 0.47745275
Iteration 14, loss = 0.46717925
Iteration 15, loss = 0.46286448
Iteration 16, loss = 0.46270855
Iteration 17, loss = 0.45352533
Iteration 18, loss = 0.44976491
Iteration 19, loss = 0.44582241
Iteration 20, loss = 0.44402968
Iteration 21, loss = 0.43428263
Iteration 22, loss = 0.42888618
Iteration 23, loss = 0.42682679
Iteration 24, loss = 0.42161322
Iteration 25, loss = 0.42311242
Iteration 26, loss = 0.41624012
Iteration 27, loss = 0.41394818
Iteration 28, loss = 0.41469066
Iteration 29, loss = 0.41073494
Iteration 30, loss = 0.40880544
Iteration 31, loss = 0.41429679
Iteration 32, los

array([0.7367517 , 0.75736494, 0.75229752, 0.77100155, 0.77725281])

In [109]:
import joblib
joblib.dump(mlpc_st, os.path.join(os.curdir,'Models', 'model_score_75_method_1.joblib'))
mlpc_st = joblib.load(os.path.join(os.curdir,'Models', 'model_score_75_method_1.joblib'))

In [22]:
ypred = mlpc_st.predict(X_test_small)
sum(ypred) / len(ypred)

0.16390279201591992

In [23]:
np.bincount(ypred - y_test_small_m + 1)

array([ 2442, 12649,  1492], dtype=int64)

In [9]:
np.histogram(ypred - y_test_small_m + 1)

(array([ 807,    0,    0,    0,    0, 1700,    0,    0,    0,  321],
       dtype=int64),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]))

SVC is working too slow

In [3]:
clf_st = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_st.fit(X_train_small, y_train_small_m)
x = clf_st.score(X_test_small, y_test_small_m)
print(x)
ypred = clf_st.predict(X_test_small)
print(sum(ypred) / len(ypred))
print(np.bincount(ypred - y_test_small_m + 1))

[LibSVM]

0.6135077793493635