# Match_Predictor_DNN_serieA_extended

This notebook loads the previously trained classifier and uses it to predict the outcome of the most recent matches in the dataset.

In [1]:
# Import the necessary libraries.
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import numpy as np
from time import time
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import itertools
from IPython.display import display

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'



%matplotlib inline

In [2]:
# Number of fixtures to predict; they are taken from the last rows of the dataset.
N_MATCHES_TO_PREDICT = 10

# Checkpoint prefix of the trained network ('.meta' is appended when the graph is imported).
classifierFilename = './models/football_classifier_serieA_DNN_extended'

# Full match dataset, including the fixtures that are going to be predicted.
data = pd.read_csv('../Datasets/serie_A/final_dataset_serieA_extended.csv')

## Assign penalty or boosting to the teams' scores
This is done to account for the fact that the players might be tired or highly motivated.

In [3]:
# Preview the raw rows of the fixtures that are going to be predicted.
display(data.iloc[-N_MATCHES_TO_PREDICT:])

Unnamed: 0.1,Unnamed: 0,AC,AF,AM1,AM2,AM3,AM4,AR,AS,AST,...,HTLossStreak5,ATWinStreak3,ATWinStreak5,ATLossStreak3,ATLossStreak5,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
3168,3168,,,W,W,W,L,,,,...,0,0,0,0,0,-0.307692,0.923077,-0.923077,-0.692308,1
3169,3169,,,L,L,L,L,,,,...,0,0,0,1,0,-1.153846,-1.384615,0.153846,0.230769,-3
3170,3170,,,D,D,W,L,,,,...,0,0,0,0,0,-0.357143,-0.714286,0.5,-0.214286,-6
3171,3171,,,W,D,W,W,,,,...,0,0,0,0,0,-0.642857,1.142857,-1.285714,-0.071429,11
3172,3172,,,D,D,W,L,,,,...,0,0,0,0,0,0.071429,-0.142857,0.071429,0.071429,-1
3173,3173,,,D,W,W,W,,,,...,0,1,0,0,0,-0.571429,1.071429,-1.571429,-0.5,14
3174,3174,,,W,D,W,W,,,,...,0,0,0,0,0,-0.357143,1.785714,-1.571429,-0.214286,13
3175,3175,,,D,L,L,W,,,,...,0,0,0,0,0,1.285714,0.285714,0.857143,0.214286,-5
3176,3176,,,L,W,W,L,,,,...,0,0,0,0,0,1.642857,-0.928571,1.357143,0.214286,-12
3177,3177,,,L,L,L,L,,,,...,0,0,0,1,0,0.071429,-1.928571,1.142857,0.285714,-8


## Preparing the Data

In [4]:
# Remove first 3 matchweeks
data = data[data.MW > 3]

# Keep the fixture identifiers aside (they are dropped from `data` below) so the
# predictions can be reported per match. The explicit copy keeps `teams` independent of `data`.
teams = data[['Date', 'HomeTeam', 'AwayTeam']].copy()

# Drop every column that is not used as a model input. `axis` is passed by keyword:
# pandas >= 2.0 no longer accepts it positionally. A new frame is bound to `data`
# instead of mutating the filtered frame in place.
data = data.drop(['Unnamed: 0','HomeTeam', 'AwayTeam','Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
           'HM4','AM4','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3', 'ENTROPY_B365', 'ENTROPY_BW',
           'ENTROPY_IW', 'ENTROPY_LB', 'ENTROPY_WH', 'ENTROPY_H', 'ENTROPY_D', 'ENTROPY_A',
           'B365H', 'BWH', 'IWH', 'LBH', 'WHH', 'B365D', 'BWD', 'IWD', 'LBD', 'WHD', 'B365A', 'BWA', 'IWA',
           'LBA', 'WHA','PSA', 'PSCA', 'PSCD', 'PSCH', 'PSD', 'PSH', 'SBA', 'SBD', 'SBH', 'SJA', 'SJD',
           'SJH', 'VCA', 'VCD', 'VCH', 'GBA', 'GBD', 'GBH', 'HC', 'HF', 'AC', 'AF', 'BbAv<2.5', 'BbAv>2.5',
           'BbAvA', 'BbAvAHA', 'BbAvAHH', 'BbAvD', 'BbAvH', 'BbMx<2.5', 'BbMx>2.5', 'BbMxA', 'BbMxAHA', 'AY',
           'BbMxAHH', 'BbMxD', 'BbMxH', 'BbOU', 'Div', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'AR', 'AS', 'AST',
           'BSA', 'BSD', 'BSH', 'Bb1X2', 'BbAH', 'BbAHh', 'HTR', 'HY',
           'HomeAttack', 'HomeMedium', 'HomeDefense', 'HomeGK', 'HomeAwayDifference',
           'AwayAttack', 'AwayMedium', 'AwayDefense', 'AwayGK','HomeScore_PrevW1', 'AwayScore_PrevW1'], axis=1)


In [5]:
# Separate into feature set and target variable.
# `axis` is passed by keyword because pandas >= 2.0 rejects it as a positional argument.
X_all_raw = data.drop(['FTR'], axis=1)
y_all_raw = data['FTR']

In [6]:
# Cast the HM*/AM* result columns (values such as W/D/L in the preview above) to strings,
# so that preprocess_features sees them as categorical and one-hot encodes them.
for form_col in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
    X_all_raw[form_col] = X_all_raw[form_col].astype('str')

def _is_text_column(col_data):
    ''' Returns True when the column holds text (object dtype or a pandas string dtype). '''
    dtype = col_data.dtype
    if dtype == object:
        return True
    # pd.StringDtype does not exist in very old pandas releases, hence the guarded lookup.
    string_dtype = getattr(pd, 'StringDtype', None)
    return string_dtype is not None and isinstance(dtype, string_dtype)


def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Text columns are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as X. Every text column is replaced by its
        dummy columns (named "<column>_<value>"); all other columns are copied
        unchanged. The original column order is preserved.
    '''

    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data.
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent spelling.
    for col, col_data in X.items():

        # If data type is categorical, convert to dummy variables
        if _is_text_column(col_data):
            col_data = pd.get_dummies(col_data, prefix = col)

        # Collect the revised columns
        output = output.join(col_data)

    return output

def preprocess_labels(labels):
    ''' One-hot encodes the match results, one indicator column per distinct label. '''
    # Resulting layout: A [1, 0, 0] || D [0, 1, 0] || H [0, 0, 1]
    return pd.get_dummies(labels)


# One-hot encode the categorical features and the result labels.
y_all_raw = preprocess_labels(y_all_raw)
X_all_raw = preprocess_features(X_all_raw)

# Report which columns are fed to the network.
feature_columns = list(X_all_raw.columns)
print( "Processed feature columns ({} total features):\n{}".format(len(feature_columns), feature_columns) )

Processed feature columns (26 total features):
['AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'ATP', 'AwayScores', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'HTP', 'HomeScores', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [7]:
# Standardising the data.
from sklearn.preprocessing import scale

cols = X_all_raw.keys().tolist()
for col in cols:
    raw_column = X_all_raw[col]
    # The standardised values are divided by the maximum of the *unscaled* column.
    # NOTE(review): presumably this mirrors the preprocessing used when the saved
    # classifier was trained -- confirm before changing the formula.
    raw_max = max(raw_column)
    X_all_raw[col] = scale(raw_column) / raw_max

In [8]:
# Show the first five rows of the processed features and of the one-hot labels.
for caption, frame in (( "\nFeature values:", X_all_raw), ( "\nLabels values:", y_all_raw)):
    print( caption )
    display(frame.head())


Feature values:


Unnamed: 0,AM1_D,AM1_L,AM1_W,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W,ATP,...,HM2_W,HM3_D,HM3_L,HM3_W,HTP,HomeScores,HTGD,ATGD,DiffFormPts,DiffLP
30,1.66632,-0.671638,-0.858578,-0.586351,1.182937,-0.697644,1.66632,-0.709661,-0.816965,-0.599703,...,1.195578,-0.581586,-0.825771,1.387643,0.760241,0.020919,0.776879,-0.294557,2.167974,-0.072602
31,-0.600125,-0.671638,1.164716,-0.586351,1.182937,-0.697644,1.66632,-0.709661,-0.816965,-0.215978,...,1.195578,-0.581586,1.21099,-0.720646,-0.400014,0.004662,-0.303215,-0.005713,-0.289448,-0.050237
32,-0.600125,1.488897,-0.858578,-0.586351,1.182937,-0.697644,-0.600125,1.409123,-0.816965,-0.983427,...,-0.836415,1.719437,-0.825771,-0.720646,-0.400014,0.011629,0.005384,-0.438979,0.939263,-0.044645
33,-0.600125,-0.671638,1.164716,-0.586351,-0.845354,1.433395,1.66632,-0.709661,-0.816965,0.359608,...,-0.836415,1.719437,-0.825771,-0.720646,-0.786766,-0.00997,-0.457514,0.860818,-1.825337,0.083958
34,-0.600125,1.488897,-0.858578,-0.586351,-0.845354,1.433395,-0.600125,-0.709661,1.224043,0.167746,...,-0.836415,-0.581586,1.21099,-0.720646,-0.980142,0.001642,-0.611813,-0.005713,-1.825337,0.005678



Labels values:


Unnamed: 0,A,D,H
30,0,0,1
31,0,1,0
32,0,0,1
33,1,0,0
34,0,1,0


# Separate training from test (i.e. to be predicted) data

In [9]:
# The fixtures to predict are the last N_MATCHES_TO_PREDICT rows of features and labels.
X_test = X_all_raw.iloc[-N_MATCHES_TO_PREDICT:]
y_test = y_all_raw.iloc[-N_MATCHES_TO_PREDICT:]
ntest = X_test.shape[0]

print("Matches to predict: last %d" % ntest)

# Plain nested lists: one feature row / one one-hot label row per fixture.
test_labels   = y_test.values.tolist()
test_dataset  = X_test.values.tolist()

Matches to predict: last 10


## Aux functions

In [10]:
def decode_probabilities(y):
    ''' Converts rows of class scores into result labels.

    Parameters
    ----------
    y : iterable of sequences
        Each row holds the scores in the order [A, D, H].

    Returns
    -------
    list
        One entry per input row: 'A', 'D' or 'H' for the first position that
        holds the row maximum. When no position among the first three matches
        the maximum (e.g. the row contains NaN), the entry is None, so the
        result always has the same length as the input and stays aligned with it.
    '''
    class_names = ('A', 'D', 'H')
    labels = []
    for pred in y:
        max_val = max(pred)
        # First class whose score equals the maximum; ties resolve in A, D, H order.
        label = None
        for name, score in zip(class_names, pred):
            if score == max_val:
                label = name
                break
        labels.append(label)
    return labels

# Load the Deep Neural Network

In [11]:
# Predicted class scores, one row per fixture, in the same order as test_dataset.
y_pred = []

with tf.Session() as session:
    # Rebuild the saved graph from its '.meta' file, then load the stored parameters into it.
    saver = tf.train.import_meta_graph(classifierFilename + '.meta')
    saver.restore(session, classifierFilename)

    # The returned graph definition is not used.
    tf.get_default_graph().as_graph_def()

    # Look up the input and output tensors of the restored graph by name.
    restored_graph = session.graph
    tf_test_dataset = restored_graph.get_tensor_by_name("predict_dataset:0")
    test_prediction = restored_graph.get_tensor_by_name("predict:0")

    # Feed one fixture at a time (as a batch containing a single row).
    for match_features in test_dataset:
        (batch_scores,) = session.run([test_prediction],
                                      feed_dict={tf_test_dataset: [match_features]})
        y_pred.append(list(batch_scores[0]))

INFO:tensorflow:Restoring parameters from ./models/football_classifier_serieA_DNN_extended


In [12]:
# Raw network output: one row of scores per fixture, in [A, D, H] order
# (the order decode_probabilities assumes), listed in test_dataset order.
print(y_pred)

[[0.29689962, 0.703098, 2.4089636e-06], [0.22080231, 0.77910131, 9.6419804e-05], [0.10033247, 0.50589132, 0.39377624], [0.53648412, 0.46351588, 7.0491678e-27], [0.3507109, 0.64928907, 1.8599557e-10], [0.75150144, 0.24849857, 0.0], [0.99884427, 0.0011556894, 0.0], [0.0, 0.023091029, 0.97690898], [1.4719096e-21, 0.096339695, 0.9036603], [0.31105107, 0.68270278, 0.006246204]]


In [13]:
# Work on an explicit copy: `teams.iloc[...]` alone is a slice of `teams`, and adding
# columns to a slice is chained assignment (silent here only because that warning is disabled).
teams_pred = teams.iloc[-N_MATCHES_TO_PREDICT:, :].copy()
teams_pred.index = teams_pred.index.tolist()

# Per-class scores rounded to 3 decimals; each row of y_pred is ordered [A, D, H].
Pred_prob_A = [np.round(scores[0], 3) for scores in y_pred]
Pred_prob_D = [np.round(scores[1], 3) for scores in y_pred]
Pred_prob_H = [np.round(scores[2], 3) for scores in y_pred]

teams_pred['Predictions'] = decode_probabilities(y_pred)
teams_pred['Pred_prob_A'] = Pred_prob_A
teams_pred['Pred_prob_D'] = Pred_prob_D
teams_pred['Pred_prob_H'] = Pred_prob_H

# NOTE(review): for fixtures without a recorded result the one-hot label row is presumably
# all zeros, which decode_probabilities maps to 'A' -- confirm before relying on Exact_res.
teams_pred['Exact_res'] = decode_probabilities(test_labels)

In [14]:
# Final report: fixture identifiers, the decoded prediction and the per-class scores.
report_columns = ['Date', 'HomeTeam','AwayTeam', 'Predictions', 'Pred_prob_H', 'Pred_prob_D', 'Pred_prob_A']
display(teams_pred[report_columns])

Unnamed: 0,Date,HomeTeam,AwayTeam,Predictions,Pred_prob_H,Pred_prob_D,Pred_prob_A
3168,2017-11-25,Bologna,Sampdoria,D,0.0,0.703,0.297
3169,2017-11-25,Sassuolo,Verona,D,0.0,0.779,0.221
3170,2017-11-25,Chievo,Spal,D,0.394,0.506,0.1
3171,2017-11-25,Cagliari,Inter,A,0.0,0.464,0.536
3172,2017-11-26,Milan,Torino,D,0.0,0.649,0.351
3173,2017-11-26,Genoa,Roma,A,0.0,0.248,0.752
3174,2017-11-26,Udinese,Napoli,A,0.0,0.001,0.999
3175,2017-11-26,Lazio,Fiorentina,H,0.977,0.023,0.0
3176,2017-11-26,Juventus,Crotone,H,0.904,0.096,0.0
3177,2017-11-26,Atalanta,Benevento,D,0.006,0.683,0.311
