# Neural network

### Try to use our own preprocessing

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import sqlite3
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA
import warnings
from pathlib import Path

from utils.data_processing import create_feables
from utils.data_processing import confusion_matrix

warnings.simplefilter("ignore")

# You should run the ../../data_preprocessing.ipynb notebook to generate the data before running this
data = pd.read_csv("../../datasets/data.csv")

labels = data.loc[:,'label']
features = data.drop('label', axis = 1)

In [2]:
from sklearn.model_selection import train_test_split


#Splitting the data into train, test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.25, random_state = 0, stratify = labels)

# Show some rows of the data
X_train.head()

Unnamed: 0.1,Unnamed: 0,season,month,stage,home_p_1_overall_rating,away_p_1_overall_rating,home_p_2_overall_rating,away_p_2_overall_rating,home_p_3_overall_rating,away_p_3_overall_rating,...,home_win_percentage,away_defencePressure,away_defenceAggression,away_win_percentage,B365H,B365D,B365A,BWH,BWD,BWA
6688,7106,2013,3,28,81.0,77.0,75.0,66.0,73.0,74.0,...,0.213816,70.0,70.0,0.334586,2.0,3.2,4.0,1.91,3.2,4.33
17359,18398,2011,10,10,72.0,74.0,76.0,69.0,74.0,69.0,...,0.412281,65.0,30.0,0.447368,2.05,3.2,3.75,2.0,3.3,3.75
9277,9829,2014,2,19,62.0,74.0,62.0,73.0,65.0,73.0,...,0.421053,60.0,70.0,0.118421,8.5,6.0,1.29,8.5,4.75,1.3
6151,6260,2013,12,16,79.0,72.0,76.0,70.0,72.0,77.0,...,0.172794,30.0,30.0,0.352941,1.91,3.75,3.6,1.9,3.8,3.25
18371,19619,2012,11,14,66.0,64.0,57.0,59.0,62.0,61.0,...,0.348684,60.0,70.0,0.467105,1.83,3.5,4.33,1.72,3.5,4.33


In [3]:
# Match outcome
y_train.head()

# Number of possible outcomes
# 1 = win
# 0 = draw
# -1 = lose
print('Number of possible outcomes', np.unique(y_train.values).shape[0])


Number of possible outcomes 3


In [4]:
# Get number of columns by tacking the number of columns in the X_train
columns = X_train.shape[1]
print('Number of columns:', columns)

Number of columns: 38


In [5]:
from keras.models import Model, Sequential
from keras import regularizers
from keras.layers import Input, Activation, Flatten, Dense, Concatenate, Dropout
from keras.optimizers import Adam
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import keras
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

def build_model():
    model = Sequential()
    model.add(Dense(32, input_dim=columns, activation='relu', kernel_regularizer=regularizers.l2(0.0002)))
    model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.0002)))
    model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.0002)))
    model.add(Dense(3, activation='softmax', kernel_regularizer=regularizers.l2(0.0002)))
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0001), metrics=['accuracy']) # categorical_crossentropy
    return model

model = build_model()
model.summary()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                1248      
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 27        
Total params: 1,939
Trainable params: 1,939
Non-trainable params: 0
_________________________________________________________________


In [None]:
from keras.utils.np_utils import to_categorical

categorical = to_categorical(y_train, num_classes=3)
history = model.fit(X_train, categorical, batch_size=32, epochs=100, validation_split=0.1)

Train on 13223 samples, validate on 1470 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
 1888/13223 [===>..........................] - ETA: 0s - loss: 0.1986 - accuracy: 0.5101

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()

In [None]:
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.legend()
plt.show()

In [None]:
# Take the most probable prediction
predictions = np.argmax(model.predict(X_test), axis=1)
# Calculate accuracy
accuracy = (predictions == y_test).mean()
print("Test set accuracy:", accuracy)

# Let us plot the confusion matrix

In [None]:
# from utils.data_processing import build_confusion_matrix

# TODO: Remove this and use the commented import instead
def build_confusion_matrix(y_true, y_pred):
    return pd.DataFrame(confusion_matrix(y_true, y_pred, labels=[2, 1, 0]),
                        index=['Home wins (true)', 'Draw (true)', 'Away wins (true)'],
                        columns=['Home wins (pred)', 'Draw (pred)', 'Away wins (pred)'])


print("Neural network confusion matrix: \n", build_confusion_matrix(y_test, predictions))

# Comparing Neural network to Random forest

In [None]:
#Splitting the data into Train, Calibrate, and Test data sets
X_train_calibrate, X_test, y_train_calibrate, y_test = train_test_split(features, labels, test_size = 0.25, random_state = 0, stratify = labels)
X_train, X_calibrate, y_train, y_calibrate = train_test_split(X_train_calibrate, y_train_calibrate, test_size = 0.25, random_state = 0, stratify = y_train_calibrate)


In [None]:
from utils.data_processing import train_calibrate_predict

# Creating cross validation data splits
cv_sets = model_selection.StratifiedShuffleSplit(n_splits = 5, test_size = 0.20, random_state = 5)
cv_sets.get_n_splits(X_train, y_train)

# Init Random Forest
RF_clf = RandomForestClassifier(n_estimators = 200, random_state = 1, class_weight = 'balanced')

#Specficying scorer and parameters for grid search
feature_len = features.shape[1]
scorer = make_scorer(accuracy_score)
parameters_RF = {'clf__max_features': ['auto', 'log2'], 'dm_reduce__n_components': np.arange(5, feature_len, int(np.around(feature_len/5)))}

#Initializing dimensionality reductions
pca = PCA()
RF_clf.fit(X_train, y_train)

print("Random forest accuracy for train set.".format(RF_clf.__class__.__name__, accuracy_score(y_train, RF_clf.predict(X_train))))
print("Random forest accuracy for test set".format(RF_clf.__class__.__name__, accuracy_score(y_test, RF_clf.predict(X_test))))

#Grid search, calibrate, and test the classifier
calibrated_RF_clf, dm_reduce, train_score, test_score = train_calibrate_predict(clf = RF_clf, dm_reduction = pca, X_train = X_train, y_train = y_train,
                                                                  X_calibrate = X_calibrate, y_calibrate = y_calibrate,
                                                                  X_test = X_test, y_test = y_test, cv_sets = cv_sets,
                                                                  params = parameters_RF, scorer = scorer, jobs = 1, use_grid_search = True)

In [None]:
rf_predictions = RF_clf.predict(X_test)

# Calculate accuracy
accuracy = (rf_predictions == y_test).mean()
print("Random forest test set accuracy:", accuracy)

In [None]:
print("Random forest confusion matrix: \n", build_confusion_matrix(y_test, rf_predictions))

# Conclusion