## 1. Import All Packages

In [1]:
%%time
import numpy as np
import math
import pandas as pd
import pickle, os, math
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

Using TensorFlow backend.


Wall time: 7.49 s


## 2. Read data

In [111]:
%%time
data_dir = os.path.join(os.curdir, 'Data', 'Score', 'data_pics.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
print(data_pics.shape)

(400001, 162)
Wall time: 1.06 s


As in the case of survival training, score training also has unbalanced class, the ratio of class 0 to 1 is almost 14:1.

In [154]:
X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=152)
print(X_train.shape)
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=15545)
print('class 0 has ' + str(len(y_train.index[y_train[0] == 0].tolist())) + ' points')
print('class 1 has ' + str(len(y_train.index[y_train[0] == 1].tolist())) + ' points')

(360000, 162)
class 0 has 335342 points
class 1 has 24658 points


To create a balanced dataset, use all class 1 and randomly select the same of number class 0 data.

In [113]:
index_0 = y_train.index[y_train[0] == 0].tolist()
index_1 = y_train.index[y_train[0] != 0].tolist()
index_0_comparable_to_1 = np.random.choice(index_0, math.floor(len(index_1) * 1))
samples = np.concatenate([index_1, index_0_comparable_to_1])
print(str(len(index_1)) + ' + ' + str(len(index_0_comparable_to_1)) + ' = ' + str(len(samples)))

24658 + 24658 = 49316


In [114]:
small_data = new_data_df.iloc[samples, :]
small_target = target_pics.iloc[samples, :]

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=1152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=8155)

Ravelling the target data.

In [115]:
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_train_small_m = np.ravel(y_train_train_small)
y_vali_small_m = np.ravel(y_vali_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)
y_train_train_m = np.ravel(y_train_train)
y_vali_m = np.ravel(y_vali)

## 3. Training

### Support Vector Classification

In [116]:
clf_sc = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_sc.fit(X_train_small, y_train_small_m)
clf_sc.score(X_test_small, y_test_small_m)

[LibSVM]

0.5237580263602568

In [117]:
ypred = clf_sc.predict(X_test_small)
print(sum(ypred) / len(ypred))
test = [idd for idd, value in enumerate(ypred) if value == 0 ]

0.9539033457249071


### Multi Layer Perceptor Classifier

MLPC apparently failed at generalizing the inner trend, all the predictions turne out to be all 0 or all 1.

In [127]:
mlpc_sc = MLPClassifier(hidden_layer_sizes=(100, 50, 40, 20),
                        alpha=0.15, max_iter=1000, batch_size=1000,
                        verbose=True, learning_rate_init=0.01, tol=1e-5,
                        learning_rate='adaptive')

mlpc_sc.fit(X_train_small, y_train_small_m)
mlpc_sc.score(X_test_small, y_test_small_m)

Iteration 1, loss = 5.89562739
Iteration 2, loss = 0.75283252
Iteration 3, loss = 0.71505717
Iteration 4, loss = 0.71154901
Iteration 5, loss = 0.71145618
Iteration 6, loss = 0.71114702
Iteration 7, loss = 0.71182472
Iteration 8, loss = 0.71251703
Iteration 9, loss = 0.71233927
Training loss did not improve more than tol=0.000010 for two consecutive epochs. Stopping.


0.5042244001351808

In [128]:
ypred = mlpc_sc.predict(X_test_small)
sum(ypred) / len(ypred)

0.6161541061169314

## 4. Training Results

Apparently the score is really low.

Try to clip the data to reduce the number of freedom to avoid over fitting

In [129]:
small_data.shape

(49316, 162)

In [134]:
np.max(small_data.iloc[:, 161])

72

In [145]:
%%time
new_data = []
window_size = 5
for i in tqdm(range(small_data.shape[0])):
    left = int(small_data.iloc[i, 161] - window_size)
    right = int(small_data.iloc[i, 161] + window_size)
    new_data.append(small_data.iloc[i, (2 * left):(2 * right + 2)].values.tolist() + small_data.iloc[i, 160:].values.tolist())
new_data_df = pd.DataFrame(new_data)

100%|███████████████████████████████████| 49316/49316 [01:29<00:00, 551.73it/s]


Wall time: 1min 29s


In [146]:
new_data_df.shape

(49316, 24)

In [152]:
left_ind = 2 * (small_data.iloc[:, 161] - window_size)
right_ind = 2 * (small_data.iloc[:, 161] + window_size) + 2

In [155]:
small_data.shape

(49316, 162)

In [157]:
new_data_df

X_train_small_short, X_test_small_short, y_train_small_short, y_test_small_short = train_test_split(
    new_data_df, small_target, test_size=0.3, random_state=1152)
X_train_train_small_short, X_vali_small_short, y_train_train_small_short, y_vali_small_short = train_test_split(
    X_train_small_short, y_train_small_short, test_size=0.3, random_state=8155)

y_train_small_short_m = np.ravel(y_train_small_short)
y_test_small_short_m = np.ravel(y_test_small_short)
y_train_train_small_short_m = np.ravel(y_train_train_small_short)
y_vali_small_short_m = np.ravel(y_vali_small_short)

In [158]:
clf_sc_short = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_sc_short.fit(X_train_small_short, y_train_small_short_m)
clf_sc_short.score(X_test_small_short, y_test_small_short_m)

[LibSVM]

0.5891855356539372

## 5. Basic

In [2]:
%%time
data_dir = os.path.join(os.curdir, 'Data', 'Score', 'data_basic.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
print(data_pics.shape)

X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=152)
print(X_train.shape)
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=15545)
print('class 0 has ' + str(len(y_train.index[y_train[0] == 0].tolist())) + ' points')
print('class 1 has ' + str(len(y_train.index[y_train[0] == 1].tolist())) + ' points')

index_0 = y_train.index[y_train[0] == 0].tolist()
index_1 = y_train.index[y_train[0] != 0].tolist()
index_0_comparable_to_1 = np.random.choice(index_0, math.floor(len(index_1) * 1))
samples = np.concatenate([index_1, index_0_comparable_to_1])
print(str(len(index_1)) + ' + ' + str(len(index_0_comparable_to_1)) + ' = ' + str(len(samples)))


small_data = data_pics.iloc[samples, :]
small_target = target_pics.iloc[samples, :]

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=1152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=8155)

y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_train_small_m = np.ravel(y_train_train_small)
y_vali_small_m = np.ravel(y_vali_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)
y_train_train_m = np.ravel(y_train_train)
y_vali_m = np.ravel(y_vali)

(99812, 221)
(89830, 221)
class 0 has 85117 points
class 1 has 4713 points
4713 + 4713 = 9426
Wall time: 756 ms


In [3]:
clf_st = SVC(C=10.0, gamma='auto', kernel='rbf', verbose=True)
clf_st.fit(X_train_small, y_train_small_m)
clf_st.score(X_test_small, y_test_small_m)

[LibSVM]

0.6135077793493635

In [4]:
ypred = clf_st.predict(X_test_small)
sum(ypred) / len(ypred)

0.5144978783592645

In [5]:
np.bincount(ypred - y_test_small_m + 1)

array([ 529, 1735,  564], dtype=int64)

In [6]:
mlpc_st = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32, 16, 8),
                        alpha=0.15, max_iter=1000, batch_size=2000,
                        verbose=True, learning_rate_init=0.01, tol=1e-5,
                        learning_rate='adaptive')

mlpc_st.fit(X_train_small, y_train_small_m)
mlpc_st.score(X_test_small, y_test_small_m)

Iteration 1, loss = 0.75222415
Iteration 2, loss = 0.70612536
Iteration 3, loss = 0.69621845
Iteration 4, loss = 0.69624756
Iteration 5, loss = 0.69016925
Iteration 6, loss = 0.68494090
Iteration 7, loss = 0.67667688
Iteration 8, loss = 0.66605519
Iteration 9, loss = 0.65784695
Iteration 10, loss = 0.66255118
Iteration 11, loss = 0.66646234
Iteration 12, loss = 0.66303735
Training loss did not improve more than tol=0.000010 for two consecutive epochs. Stopping.


0.6011315417256011

In [7]:
ypred = mlpc_st.predict(X_test_small)
sum(ypred) / len(ypred)

0.3302687411598303

In [8]:
np.bincount(ypred - y_test_small_m + 1)

array([ 807, 1700,  321], dtype=int64)

In [9]:
np.histogram(ypred - y_test_small_m + 1)

(array([ 807,    0,    0,    0,    0, 1700,    0,    0,    0,  321],
       dtype=int64),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]))