### 1. Import all the packages

In [77]:
import numpy as np
import math
import pandas as pd
import pickle, os, math
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

### 2.1 Import the data first

The original data generation contained a serious mistake: the x and y positions of the fighter were swapped. With that original data, the accuracy on the class-balanced dataset stays constant at around 50% (no better than chance for two balanced classes), which shows that the data is not good enough.

In [78]:
# Load the pickled dataset from ./Data and pull out the feature table (`data`)
# and the labels (`target`) that the later train/test splits consume.
# Note: despite its name, `data_dir` holds the path of the pickle file itself.
data_dir = os.path.join(os.curdir, 'Data', 'basic_data_short.pkl')
with open(data_dir, 'rb') as in_file:
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    ot = pickle.load(in_file)
data = ot['data']
target = ot['target']
# Last expression of the cell: displayed as the cell output.
data.shape

(10036, 20)

In [79]:
# Column totals of the feature table, one entry per column, in column order.
sums = [np.sum(data.iloc[:, col]) for col in range(data.shape[1])]

The per-epoch F1 / precision / recall callback below was written with help from [Thong Nguyen](https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2).

In [80]:
class Metrics(Callback):
    """Keras callback that records validation F1, recall and precision per epoch.

    After training, the per-epoch scores are available in ``val_f1s``,
    ``val_recalls`` and ``val_precisions`` (one entry per finished epoch).

    The callback reads ``self.validation_data`` (features at index 0, targets
    at index 1).
    NOTE(review): this relies on Keras populating ``validation_data`` on the
    callback; newer Keras releases may no longer do so -- confirm against the
    installed version.
    """

    def on_train_begin(self, logs=None):
        """Reset the score histories at the start of every training run."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data and append to the histories."""
        val_scores = np.asarray(
            self.model.predict(self.validation_data[0])).ravel()
        # Threshold at 0.5 instead of rounding: identical for outputs in
        # [0, 1], but outputs outside that range (e.g. from a linear output
        # unit) still map to the labels {0, 1} instead of -1, 2, ...
        val_predict = (val_scores > 0.5).astype(int)
        val_targ = np.asarray(self.validation_data[1]).ravel()
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" % (_val_f1, _val_precision, _val_recall))

# Instance to pass via `callbacks=[...]` when fitting a Keras model.
metrics_usrdefined = Metrics()

### 2.2 Preprocess data

The raw data has imbalanced classes.

In [81]:
# Hold out 10% of the rows as the test set (fixed random_state for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.1, random_state=152)
# Bare expression in the middle of a cell: evaluated but not displayed.
X_train.shape
# Split the remaining rows 70/30 into the actual training set and a validation set.
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=152)

Create a class-balanced sample of the data.

In [82]:
# Build a class-balanced set of row labels from the training split:
# every class-0 row plus an equally sized random draw of the other rows.
index_0 = y_train.index[y_train[0] == 0].tolist()
print(len(index_0))
index_1 = y_train.index[y_train[0] != 0].tolist()
# Seeded generator so that Restart & Run All reproduces the same sample.
balance_rng = np.random.RandomState(152)
# Draw without replacement whenever enough non-zero rows exist: duplicated
# rows could otherwise end up on both sides of the later train/test split.
# Falls back to sampling with replacement only if class 0 is the larger class.
index_1_comparable_to_0 = balance_rng.choice(
    index_1, size=len(index_0), replace=len(index_1) < len(index_0))
print(len(index_1_comparable_to_0))
samples = np.concatenate([index_0, index_1_comparable_to_0])
print(len(samples))

751
751
1502


In [83]:
# `samples` holds index *labels* taken from y_train.index, so rows are selected
# by label with .loc. Positional .iloc only gives the same rows when the frames
# carry a default 0..n-1 index.
small_data = data.loc[samples, :]
small_target = target.loc[samples, :]

In [84]:
# Split the balanced sample 70/30 into train and test ...
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=152)
# ... and split that training part 70/30 again into train and validation.
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=152)

### 3. Try running neural network first with raw data

In [85]:
# Baseline network for the raw (imbalanced) data: 20 inputs -> 12 -> 6 -> 1,
# ReLU hidden layers and a linear output unit, trained on mean squared error.
model = Sequential([
    Dense(units=12, activation='relu', input_dim=20),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='linear'),
])
model.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

In [86]:
# Train the baseline network on the imbalanced training split for 10 epochs,
# reporting validation metrics on (X_vali, y_vali) after each epoch.
model.fit(X_train_train, y_train_train,
          validation_data=(X_vali, y_vali), 
          epochs=10, batch_size=256, verbose=1)

Train on 6322 samples, validate on 2710 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bfe8ba8>

In [87]:
# Evaluate on the held-out test split. The printed list is [loss, accuracy],
# following the loss and metrics given to model.compile.
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)
print(loss_and_metrics)

[0.1111068560545188, 0.8964143397798576]


### The result is 89.6% accurate???

This is false hope. The model predicts (almost) everything to be class 1, and class 1 makes up most of the data, so the high accuracy is misleading and the model is actually really bad.

In [88]:
# Predict on the held-out test split and average the network outputs.
# NOTE(review): the output layer of `model` is linear, so this is the mean of
# the raw, unthresholded scores rather than the share of rows predicted as
# class 1; (y_predict > 0.5).mean() would give the latter -- confirm intent.
y_predict = model.predict(X_test, batch_size=None, verbose=0)
np.sum(y_predict) / len(y_predict)

0.895336379093003

### 4. Dealing with imbalanced classes

Try training on the class-balanced sample instead.

In [89]:
# Network for the class-balanced sample: 20 inputs -> 12 -> 12 -> 6 -> 1,
# ReLU hidden layers and a sigmoid output unit, trained on mean squared error.
model_small = Sequential([
    Dense(units=12, activation='relu', input_dim=20),
    Dense(units=12, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_small.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['accuracy'],
)

In [90]:
# Train on the balanced training split for 30 epochs, reporting validation
# metrics on the balanced validation split after each epoch.
model_small.fit(X_train_train_small, y_train_train_small,
          validation_data=(X_vali_small, y_vali_small), 
          epochs=30, batch_size=256, verbose=1)

Train on 735 samples, validate on 316 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1bfe8a58>

In [91]:
# Evaluate on the balanced test split; the displayed list is [loss, accuracy],
# following the loss and metrics given to model_small.compile.
loss_and_metrics_small = model_small.evaluate(X_test_small, y_test_small, batch_size=128)
loss_and_metrics_small



[0.24085947527589396, 0.6141906911940902]

In [92]:
# Outputs of the sigmoid unit of model_small for every row of the balanced test split.
y_predict_small = model_small.predict(X_test_small, batch_size=None, verbose=0)

In [93]:
# Display the predictions. This shows the whole array; a slice such as
# y_predict_small[:10] would keep the output short.
y_predict_small

array([[0.44867575],
       [0.68468565],
       [0.45711303],
       [0.7691097 ],
       [0.9617938 ],
       [0.50410706],
       [0.67709327],
       [0.2604741 ],
       [0.32972294],
       [0.81773615],
       [0.4209635 ],
       [0.6498014 ],
       [0.7183083 ],
       [0.664088  ],
       [0.72290903],
       [0.2890113 ],
       [0.6830206 ],
       [0.27220523],
       [0.7227996 ],
       [0.5953881 ],
       [0.68468565],
       [0.29619032],
       [0.62030965],
       [0.7224319 ],
       [0.6261387 ],
       [0.30462673],
       [0.4693328 ],
       [0.6802505 ],
       [0.32444766],
       [0.03274462],
       [0.33386666],
       [0.43028706],
       [0.67709327],
       [0.01346624],
       [0.6916057 ],
       [0.5148759 ],
       [0.22392902],
       [0.32607248],
       [0.6432943 ],
       [0.40073988],
       [0.68262357],
       [0.52356106],
       [0.94577885],
       [0.5201209 ],
       [0.3330629 ],
       [0.68849635],
       [0.49028417],
       [0.279

### 5. test the neural network with a small dataset from sklearn

Using the breast cancer dataset that ships with scikit-learn.

In [94]:
# Load scikit-learn's bundled breast-cancer dataset as a sanity-check problem.
from sklearn.datasets import load_breast_cancer
res = load_breast_cancer()

In [95]:
# Feature matrix and target vector of the breast-cancer dataset.
X = res['data']
y = res['target']

In [96]:
# 70/30 train/test split of the breast-cancer data (fixed random_state).
X_ctr, X_cte, y_ctr, y_cte = train_test_split(
    X, y, test_size=0.3, random_state=152)

In [97]:
# Same architecture as model_small, but with 30 input features:
# 30 -> 12 -> 12 -> 6 -> 1, ReLU hidden layers, sigmoid output, MSE loss.
model_cancer = Sequential([
    Dense(units=12, activation='relu', input_dim=30),
    Dense(units=12, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model_cancer.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['accuracy'],
)

In [98]:
# Train on the breast-cancer training split for 20 epochs (no validation data passed).
model_cancer.fit(X_ctr, y_ctr, 
          epochs=20, batch_size=128, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c391b38>

In [99]:
# Evaluate on the breast-cancer test split; the displayed list is [loss, accuracy].
# Note: this rebinds `loss_and_metrics`, replacing the earlier result of `model`.
loss_and_metrics = model_cancer.evaluate(X_cte, y_cte, batch_size=128)
y_predict_cancer = model_cancer.predict(X_cte, batch_size=None, verbose=0)
loss_and_metrics



[0.20131064872992666, 0.8947368375739159]

**It can be seen that the neural network written in keras works fine.**

### 6. test with a small mlp from scikit learn

In [100]:
from sklearn.neural_network import MLPClassifier

In [101]:
# scikit-learn MLP with three hidden layers of 100 units each, to compare
# against the Keras network on the same breast-cancer split.
# NOTE(review): the features are passed to this model unscaled; that may be
# why the training loss oscillates -- confirm, e.g. by standardising X first.
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 100), alpha=0.001, max_iter=1000, verbose=True, learning_rate_init=0.01)

In [102]:
# Fit the scikit-learn MLP on the breast-cancer training split.
mlpc.fit(X_ctr, y_ctr)

Iteration 1, loss = 8.42484907
Iteration 2, loss = 13.65406408
Iteration 3, loss = 5.54811978
Iteration 4, loss = 9.37276279
Iteration 5, loss = 8.37660426
Iteration 6, loss = 13.55436156
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [103]:
# Accuracy of the scikit-learn MLP on the breast-cancer test split.
mlpc.score(X_cte, y_cte)

0.29239766081871343

### 7.  test with the data that I have using sklearn MLP

In [104]:
# Larger scikit-learn MLP (hidden layers 200-100-100-100) for the project's
# own data, with a stronger alpha and a higher initial learning rate than `mlpc`.
mlpc_1 = MLPClassifier(hidden_layer_sizes=(200, 100, 100, 100),
                       alpha=0.05, max_iter=1000, 
                       verbose=True, learning_rate_init=0.02, tol=1e-4)

In [105]:
# Flatten the label frames into 1-D arrays (suffix `_m`) for the scikit-learn estimators.
# `column_or_1d` is imported here but not used in this cell.
from sklearn.utils.validation import column_or_1d
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
# Bare expression in the middle of a cell: evaluated but not displayed.
len(y_train_small_m)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)

In [106]:
# Reduced feature sets: keep columns 3-12 and 16 onward (i.e. drop columns 0-2
# and 13-15) of the balanced train and test features.
# The fit/score cells visible in this notebook use the full X_train_small /
# X_test_small, not these reduced frames.
X_train_small_less_1 = X_train_small.iloc[:, 3:13]
X_train_small_less_2 = X_train_small.iloc[:, 16:]
X_train_small_less = pd.concat([X_train_small_less_1, X_train_small_less_2], axis=1)

X_test_small_less_1 = X_test_small.iloc[:, 3:13]
X_test_small_less_2 = X_test_small.iloc[:, 16:]
X_test_small_less = pd.concat([X_test_small_less_1, X_test_small_less_2], axis=1)

In [107]:
# Peek at the first rows of the balanced training features.
X_train_small.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
9089,-19,-19,-19,-19,-19,-19,-19,-19,0,-19,-19,-19,-19,-19,-19,-19,-19,0,0,-1
717,-30,-30,-30,-30,-30,-30,-4,-30,0,-30,-30,-30,-30,-30,-30,-30,-30,0,0,-1
5926,-22,-22,-22,-11,-22,-22,-22,-22,0,-22,-15,-22,-22,-22,-22,-22,-22,0,0,-1
1587,-14,-14,-14,-14,-14,-14,-14,-14,0,-14,-14,-14,-14,-14,-14,-14,-14,0,1,0
2492,-28,-28,-28,-28,-28,-28,-28,-28,0,-28,-28,-28,-28,-28,-28,-28,-28,0,-1,0


In [108]:
# Fit the larger MLP on the full balanced training features and flattened labels.
mlpc_1.fit(X_train_small, y_train_small_m)

Iteration 1, loss = 5.17459388
Iteration 2, loss = 0.77276659
Iteration 3, loss = 0.77883965
Iteration 4, loss = 0.78696057
Iteration 5, loss = 0.77285415
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 100, 100), learning_rate='constant',
       learning_rate_init=0.02, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [109]:
# Accuracy of the larger MLP on the balanced test split.
mlpc_1.score(X_test_small, y_test_small_m)

0.4722838137472284

In [110]:
# Predicted class labels of the larger MLP for the balanced test split.
test_111 = mlpc_1.predict(X_test_small)

In [111]:
# Mean of the predicted labels, i.e. the share of predictions equal to 1
# when the labels are 0/1.
np.mean(test_111)

0.5898004434589801

The accuracy is still very low, so the neural network itself is not the problem.

test

## write the input and training

In [165]:
# Load the second pickled dataset ('basic_data_pics.pkl') from ./Data and pull
# out its feature table and labels. This rebinds `data_dir` and `ot`.
data_dir = os.path.join(os.curdir, 'Data', 'basic_data_pics.pkl')
with open(data_dir, 'rb') as in_file:
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    ot = pickle.load(in_file)
data_pics = ot['data']
target_pics = ot['target']
# Last expression of the cell: displayed as the cell output.
data_pics.shape

(200036, 24)

In [200]:
# End-to-end run on the 'pics' data: split, class-balance, fit an MLP, score.
# This cell rebinds the split/sample/model names used earlier in the notebook.

# --- 1. Hold out 10% as test set, then split the rest 70/30 train/validation ---
X_train, X_test, y_train, y_test = train_test_split(
    data_pics, target_pics, test_size=0.1, random_state=152)
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=152)

# --- 2. Class-balanced row labels: all class-0 rows + as many other rows ---
index_0 = y_train.index[y_train[0] == 0].tolist()
print(len(index_0))
index_1 = y_train.index[y_train[0] != 0].tolist()
# Seeded generator so that Restart & Run All reproduces the same sample.
balance_rng = np.random.RandomState(152)
# Draw without replacement whenever enough non-zero rows exist: duplicated
# rows could otherwise end up on both sides of the train/test split below
# and inflate the test and cross-validation scores. Falls back to sampling
# with replacement only if class 0 is the larger class.
index_1_comparable_to_0 = balance_rng.choice(
    index_1, size=len(index_0), replace=len(index_1) < len(index_0))
print(len(index_1_comparable_to_0))
samples = np.concatenate([index_0, index_1_comparable_to_0])
print(len(samples))

# `samples` holds index *labels* from y_train.index, so select by label (.loc);
# positional .iloc only matches when the frames carry a default 0..n-1 index.
small_data = data_pics.loc[samples, :]
small_target = target_pics.loc[samples, :]

# --- 3. Split the balanced sample 70/30, then the train part 70/30 again ---
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=152)

# --- 4. Model and 1-D label arrays for scikit-learn ---
mlpc_1 = MLPClassifier(hidden_layer_sizes=(50, 20),
                       alpha=0.15, max_iter=1000, batch_size=5000,
                       verbose=True, learning_rate_init=0.01, tol=1e-5,
                       learning_rate='adaptive')
# Import kept from the original cell; `column_or_1d` is not used here.
from sklearn.utils.validation import column_or_1d
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)

# --- 5. Reduced feature sets (columns 3-12 and 16 onward) ---
# Built for experimentation; the fit below uses the full feature frames.
X_train_small_less_1 = X_train_small.iloc[:, 3:13]
X_train_small_less_2 = X_train_small.iloc[:, 16:]
X_train_small_less = pd.concat([X_train_small_less_1, X_train_small_less_2], axis=1)

X_test_small_less_1 = X_test_small.iloc[:, 3:13]
X_test_small_less_2 = X_test_small.iloc[:, 16:]
X_test_small_less = pd.concat([X_test_small_less_1, X_test_small_less_2], axis=1)

# --- 6. Fit on the balanced training split, report balanced test accuracy ---
mlpc_1.fit(X_train_small, y_train_small_m)

mlpc_1.score(X_test_small, y_test_small_m)

15061
15061
30122
Iteration 1, loss = 1.62310334
Iteration 2, loss = 0.81915544
Iteration 3, loss = 0.71384423
Iteration 4, loss = 0.68024405
Iteration 5, loss = 0.66975287
Iteration 6, loss = 0.65317108
Iteration 7, loss = 0.63248675
Iteration 8, loss = 0.61385616
Iteration 9, loss = 0.57677913
Iteration 10, loss = 0.58830787
Iteration 11, loss = 0.56820350
Iteration 12, loss = 0.53298257
Iteration 13, loss = 0.50769968
Iteration 14, loss = 0.49246782
Iteration 15, loss = 0.47991118
Iteration 16, loss = 0.46620214
Iteration 17, loss = 0.45355448
Iteration 18, loss = 0.44869671
Iteration 19, loss = 0.42900878
Iteration 20, loss = 0.41890564
Iteration 21, loss = 0.41205634
Iteration 22, loss = 0.40913743
Iteration 23, loss = 0.39784794
Iteration 24, loss = 0.38503140
Iteration 25, loss = 0.37611784
Iteration 26, loss = 0.36697884
Iteration 27, loss = 0.36234619
Iteration 28, loss = 0.36068061
Iteration 29, loss = 0.35368966
Iteration 30, loss = 0.34553209
Iteration 31, loss = 0.36140740

0.8558149828482904

In [170]:
# Predicted labels for the balanced test split.
test_111 = mlpc_1.predict(X_test_small)
# The slice starts at index 1, so the first prediction is not shown (99 values).
test_111[1:100]

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1], dtype=int64)

In [201]:
# Mean label value in the full (imbalanced) test split and in the balanced
# test split; with 0/1 labels this is the share of class 1 in each.
print(sum(y_test_m) / len(y_test_m))
print(sum(y_test_small_m) / len(y_test_small_m))

0.9136672665466906
0.5047028881265907


In [181]:
# Accuracy of the MLP (trained on the balanced sample) on the full, imbalanced test split.
mlpc_1.score(X_test, y_test_m)

0.958258348330334

In [183]:
import joblib

In [184]:
# Persist the fitted MLP to 'model1.joblib' in the working directory and read
# it straight back as a round-trip check.
joblib.dump(mlpc_1, 'model1.joblib')
mlpc_loaded = joblib.load('model1.joblib') 

In [203]:
# Support-vector classifier (C=10, gamma='auto') on the same balanced split,
# for comparison with the MLP; the last line reports balanced test accuracy.
from sklearn.svm import SVC
clf_1 = SVC(C=10.0, gamma='auto', verbose=True)
clf_1.fit(X_train_small, y_train_small_m)
clf_1.score(X_test_small, y_test_small_m)

[LibSVM]

0.9399136881708532

In [207]:
# 6-fold cross-validation of the SVC on the balanced training split.
from sklearn.model_selection import cross_val_score
cross_val_score(clf_1, X_train_small, y_train_small_m, cv=6)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

array([0.94366999, 0.93767786, 0.9359704 , 0.93312464, 0.93938532,
       0.93881616])

In [206]:
# 5-fold cross-validation of the MLP on the balanced training split
# (the SVC above was run with cv=6, so the fold counts differ).
cross_val_score(mlpc_1, X_train_small, y_train_small_m, cv=5)

Iteration 1, loss = 0.88284952
Iteration 2, loss = 0.66759262
Iteration 3, loss = 0.62895208
Iteration 4, loss = 0.61218429
Iteration 5, loss = 0.57720381
Iteration 6, loss = 0.54981722
Iteration 7, loss = 0.51765892
Iteration 8, loss = 0.50013930
Iteration 9, loss = 0.48909796
Iteration 10, loss = 0.47935739
Iteration 11, loss = 0.45658515
Iteration 12, loss = 0.45089568
Iteration 13, loss = 0.45674121
Iteration 14, loss = 0.43945241
Iteration 15, loss = 0.42206926
Iteration 16, loss = 0.41105896
Iteration 17, loss = 0.39946057
Iteration 18, loss = 0.39004794
Iteration 19, loss = 0.37715327
Iteration 20, loss = 0.38680214
Iteration 21, loss = 0.39684353
Iteration 22, loss = 0.38501631
Training loss did not improve more than tol=0.000010 for two consecutive epochs. Stopping.
Iteration 1, loss = 1.64552734
Iteration 2, loss = 0.71926146
Iteration 3, loss = 0.70405667
Iteration 4, loss = 0.58109583
Iteration 5, loss = 0.52792530
Iteration 6, loss = 0.51817925
Iteration 7, loss = 0.476518

array([0.8513161 , 0.88261797, 0.8359023 , 0.85605881, 0.84372777])