### 1. Import all the packages

In [520]:
import numpy as np
import math
import pandas as pd
import pickle, os, math
from keras.models import Sequential
from keras.callbacks import Callback
from keras.layers import Dense
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

### 2.1 Import the data first

There has been a huge mistake in the data generation. The mistake lies in the swap of x and y positions of the fighter. With the original data the accuracy stays constant around 50% in the balanced classes dataset, which shows that the data is not good enough.

In [450]:
data_dir = os.path.join(os.curdir, 'Data', 'basic_data_short.pkl')
with open(data_dir, 'rb') as in_file:
    ot = pickle.load(in_file)
data = ot['data']
target = ot['target']
data.shape

(200036, 17)

In [451]:
sums = [None for i in range(data.shape[1])]
for i in range(data.shape[1]):
    sums[i] = np.sum(data.iloc[:, i])

with help to [Thong Nguyen](https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2)

In [452]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" % (_val_f1, _val_precision, _val_recall))
        return

metrics_usrdefined = Metrics()

### 2.2 Preprocess data

The first raw data is unbalanced with classes

In [453]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.1, random_state=152)
X_train.shape
X_train_train, X_vali, y_train_train, y_vali = train_test_split(
    X_train, y_train, test_size=0.3, random_state=152)

Create balanced classes sample data

In [531]:
index_0 = y_train.index[y_train[0] == 0].tolist()
print(len(index_0))
index_1 = y_train.index[y_train[0] != 0].tolist()
index_1_comparable_to_0 = np.random.choice(index_1, math.floor(len(index_0) * 1))
print(len(index_1_comparable_to_0))
samples = np.concatenate([index_0, index_1_comparable_to_0])
print(len(samples))

13746
13746
27492


In [532]:
small_data = data.iloc[samples, :]
small_target = target.iloc[samples, :]

In [533]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    small_data, small_target, test_size=0.3, random_state=152)
X_train_train_small, X_vali_small, y_train_train_small, y_vali_small = train_test_split(
    X_train_small, y_train_small, test_size=0.3, random_state=152)

### 3. Try running neural network first with raw data

In [457]:
model = Sequential()
model.add(Dense(units=12, activation='relu', input_dim=17))
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=1, activation='linear'))
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [458]:
model.fit(X_train_train, y_train_train,
          validation_data=(X_vali, y_vali), 
          epochs=10, batch_size=256, verbose=1)

Train on 126022 samples, validate on 54010 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3426b978>

In [459]:
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)
print(loss_and_metrics)

[0.06779023905863311, 0.9247150570005208]


### The result is 98.4 accurate???

It is a false hope. The model predicts everything to be one, and one takes up most of the data, so the result is really bad.

In [460]:
y_predict = model.predict(X_test, batch_size=None, verbose=0)
np.sum(y_predict) / len(y_predict)

0.944332813124875

### 4. Dealing with imbalanced classes

try to make the balanced classes

In [466]:
model_small = Sequential()
model_small.add(Dense(units=12, activation='relu', input_dim=17))
model_small.add(Dense(units=12, activation='relu'))
model_small.add(Dense(units=6, activation='relu'))
model_small.add(Dense(units=1, activation='sigmoid'))
model_small.compile(loss='mse',
                    optimizer='rmsprop',
                    metrics=['accuracy'])

In [472]:
model_small.fit(X_train_train_small, y_train_train_small,
          validation_data=(X_vali_small, y_vali_small), 
          epochs=30, batch_size=256, verbose=1)

Train on 13470 samples, validate on 5774 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x38f99278>

In [468]:
loss_and_metrics_small = model_small.evaluate(X_test_small, y_test_small, batch_size=128)
loss_and_metrics_small



[0.23251390897973317, 0.6043889427161934]

In [469]:
y_predict_small = model_small.predict(X_test_small, batch_size=None, verbose=0)

In [470]:
y_predict_small

array([[0.55761814],
       [0.49108914],
       [0.6186435 ],
       ...,
       [0.24693784],
       [0.5278685 ],
       [0.5298917 ]], dtype=float32)

### 5. test the neural network with a small dataset from sklearn

using the cancer dataset

In [333]:
from sklearn.datasets import load_breast_cancer
res = load_breast_cancer()

In [335]:
X = res['data']
y = res['target']

In [336]:
X_ctr, X_cte, y_ctr, y_cte = train_test_split(
    X, y, test_size=0.3, random_state=152)

In [397]:
model_cancer = Sequential()
model_cancer.add(Dense(units=12, activation='relu', input_dim=30))
model_cancer.add(Dense(units=12, activation='relu'))
model_cancer.add(Dense(units=6, activation='relu'))
model_cancer.add(Dense(units=1, activation='sigmoid'))
model_cancer.compile(loss='mse',
                    optimizer='rmsprop',
                    metrics=['accuracy'])

In [398]:
model_cancer.fit(X_ctr, y_ctr, 
          epochs=50, batch_size=256, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x33f47208>

In [399]:
loss_and_metrics = model_cancer.evaluate(X_cte, y_cte, batch_size=128)
y_predict_cancer = model_cancer.predict(X_cte, batch_size=None, verbose=0)
loss_and_metrics



[0.05982166929551733, 0.9298245624492043]

**It can be seen that the neural network written in keras works fine.**

### 6. test with a small mlp from scikit learn

In [345]:
from sklearn.neural_network import MLPClassifier

In [500]:
mlpc = MLPClassifier(hidden_layer_sizes=(100, 100, 100), alpha=0.001, max_iter=1000, verbose=True, learning_rate_init=0.01)

In [501]:
mlpc.fit(X_ctr, y_ctr)

Iteration 1, loss = 11.14467975
Iteration 2, loss = 13.65405190
Iteration 3, loss = 9.37279862
Iteration 4, loss = 10.56695409
Iteration 5, loss = 12.32709532
Iteration 6, loss = 4.00297492
Iteration 7, loss = 7.70804793
Iteration 8, loss = 2.49859031
Iteration 9, loss = 2.23042657
Iteration 10, loss = 1.20085262
Iteration 11, loss = 0.60518680
Iteration 12, loss = 0.53854086
Iteration 13, loss = 0.51404514
Iteration 14, loss = 0.41708459
Iteration 15, loss = 0.35831535
Iteration 16, loss = 0.35552345
Iteration 17, loss = 0.35761314
Iteration 18, loss = 0.31602525
Iteration 19, loss = 0.31249138
Iteration 20, loss = 0.28199130
Iteration 21, loss = 0.24483918
Iteration 22, loss = 0.23275928
Iteration 23, loss = 0.20916527
Iteration 24, loss = 0.21796147
Iteration 25, loss = 0.21152831
Iteration 26, loss = 0.24042567
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [502]:
mlpc.score(X_cte, y_cte)

0.9239766081871345

### 7.  test with the data that I have using sklearn MLP

In [595]:
mlpc_1 = MLPClassifier(hidden_layer_sizes=(200, 100, 100, 100),
                       alpha=0.05, max_iter=1000, 
                       verbose=True, learning_rate_init=0.02, tol=1e-4)

In [596]:
from sklearn.utils.validation import column_or_1d
y_train_small_m = np.ravel(y_train_small)
y_test_small_m = np.ravel(y_test_small)
len(y_train_small_m)
y_train_m = np.ravel(y_train)
y_test_m = np.ravel(y_test)

In [597]:
X_train_small_less_1 = X_train_small.iloc[:, 3:13]
X_train_small_less_2 = X_train_small.iloc[:, 16:]
X_train_small_less = pd.concat([X_train_small_less_1, X_train_small_less_2], axis=1)

X_test_small_less_1 = X_test_small.iloc[:, 3:13]
X_test_small_less_2 = X_test_small.iloc[:, 16:]
X_test_small_less = pd.concat([X_test_small_less_1, X_test_small_less_2], axis=1)

In [598]:
X_train_small.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
130421,-22,-22,-22,-22,-22,-22,-22,-22,-22,-22,-22,-22,-22,-22,0,0,-1
101101,-14,-14,-14,-14,-14,-14,-14,-14,-14,-14,-14,-14,-14,-14,0,0,0
4388,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,0,0,-1
160831,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,0,0,-1
31436,-16,-16,-16,-16,-16,-16,-16,-16,-16,-16,0,-16,-16,-16,0,0,0


In [599]:
mlpc_1.fit(X_train_small, y_train_small_m)

Iteration 1, loss = 0.90895367
Iteration 2, loss = 0.73532659
Iteration 3, loss = 0.72164651
Iteration 4, loss = 0.69735924
Iteration 5, loss = 0.69112344
Iteration 6, loss = 0.68176014
Iteration 7, loss = 0.68027649
Iteration 8, loss = 0.67842622
Iteration 9, loss = 0.67473790
Iteration 10, loss = 0.67555945
Iteration 11, loss = 0.67022124
Iteration 12, loss = 0.67161136
Iteration 13, loss = 0.67219869
Iteration 14, loss = 0.67895905
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100, 100, 100), learning_rate='constant',
       learning_rate_init=0.02, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [600]:
mlpc_1.score(X_test_small, y_test_small_m)

0.5982056256062076

In [493]:
test_111 = mlpc_1.predict(X_test_small)

In [494]:
sum(test_111) / len(test_111)

0.6064500484966052

The accuracy is still very low, so it's not the problem of the neural network

test