The goal here (pun intended) is to design a prediction system that can accurately predict whether or not the home team will win. We will use the final dataset produced by our earlier "Scraping and Cleaning" notebook as the basis for our prediction model.

In [1]:
# Import the necessary libraries.
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import numpy as np
from time import time
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import itertools
from IPython.display import display

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'



%matplotlib inline

In [2]:
# --- Configuration -----------------------------------------------------------
# Number of most recent matches (last rows of the dataset) to predict.
N_MATCHES_TO_PREDICT = 10

# Base name of the saved TensorFlow checkpoint; '.meta' is appended when the graph is loaded.
classifierFilename = 'football_classifier_serieA_DNN_b'

# --- Load the cleaned dataset ------------------------------------------------
dataset_path = '../Datasets/serie_A/final_dataset_serieA.csv'
data = pd.read_csv(dataset_path)

## Preparing the Data

In [3]:
# Remove first 3 matchweeks
data = data[data.MW > 3]

# Keep the last N rows (with every column still present) for the matches to predict.
# .copy() makes this an independent frame, so the drop below is not a chained
# assignment on a slice of `data`.
data_pred = data[-N_MATCHES_TO_PREDICT:].copy()

# Match identifiers are set aside here because they are removed from `data` below.
teams = pd.concat([data['Date'], data['HomeTeam'], data['AwayTeam']], axis=1, keys=['Date', 'HomeTeam', 'AwayTeam'])

# Columns excluded from the model input. `columns=` is used instead of a positional
# axis argument, which pandas 2.0 no longer accepts.
data = data.drop(columns=[
    'Unnamed: 0','HomeTeam', 'AwayTeam','Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
    'HTGS', 'ATGS', 'HTGC', 'ATGC','HomeTeamLP', 'AwayTeamLP','DiffPts','HTFormPts','ATFormPts',
    'HM4','AM4','HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
    'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3', 'ENTROPY_B365', 'ENTROPY_BW',
    'ENTROPY_IW', 'ENTROPY_LB', 'ENTROPY_WH', 'ENTROPY_H', 'ENTROPY_D', 'ENTROPY_A',
    'B365H', 'BWH', 'IWH', 'LBH', 'WHH', 'B365D', 'BWD', 'IWD', 'LBD', 'WHD', 'B365A', 'BWA', 'IWA',
    'LBA', 'WHA',
])

data_pred = data_pred.drop(columns=['Unnamed: 0'])


In [4]:
# Standardising the data.
from sklearn.preprocessing import scale

# Separate into feature set and target variable.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X_all_raw = data.drop(columns=['FTR'])
y_all_raw = data['FTR']

# Standardise the continuous features in one call; scale() works column by column,
# so this is the same operation the single-iteration loop over a nested list performed.
cols = ['HTGD','ATGD','HTP','ATP','DiffLP']
X_all_raw[cols] = scale(X_all_raw[cols])

In [5]:
# Cast the HM*/AM* columns to strings so that preprocess_features treats them
# as categorical and expands them into dummy variables.
for form_col in ('HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3'):
    X_all_raw[form_col] = X_all_raw[form_col].astype('str')

def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns with object (or string) dtype are treated as categorical.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``X``. Categorical columns are replaced, in place of the
        original column, by one dummy column per category named ``<column>_<category>``;
        all other columns are passed through unchanged.
    '''

    pieces = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, col_data in X.items():

        # If data type is categorical, convert to dummy variables.
        # is_string_dtype also covers the dedicated string dtypes of newer pandas versions.
        if col_data.dtype == object or pd.api.types.is_string_dtype(col_data):
            col_data = pd.get_dummies(col_data, prefix = col)

        pieces.append(col_data)

    # Nothing to assemble: keep the original contract of an empty frame on X's index.
    if not pieces:
        return pd.DataFrame(index = X.index)

    # Every piece shares X's index, so a single column-wise concat aligns them row for row
    # (repeated joins would duplicate rows if the index contained repeated labels).
    return pd.concat(pieces, axis = 1)

def preprocess_labels(labels):
    ''' One-hot encodes the match results.

    With the labels 'A', 'D' and 'H' the resulting columns are ordered A, D, H:
    A -> [1, 0, 0], D -> [0, 1, 0], H -> [0, 0, 1].
    '''
    return pd.get_dummies(labels)


# One-hot encode the categorical features and the result labels.
X_all_raw, y_all_raw = preprocess_features(X_all_raw), preprocess_labels(y_all_raw)

# Report the final feature layout.
feature_names = list(X_all_raw.columns)
print( "Processed feature columns ({} total features):\n{}".format(len(feature_names), feature_names) )

Processed feature columns (24 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [6]:
# Show the feature information by printing the first five rows
# Show the first five rows of the features, then of the labels, each under its own caption.
for caption, frame in (("\nFeature values:", X_all_raw), ("\nLabels values:", y_all_raw)):
    print( caption )
    display(frame.head())


Feature values:


Unnamed: 0,HTP,ATP,HM1_D,HM1_L,HM1_W,HM2_D,HM2_L,HM2_W,HM3_D,HM3_L,...,AM2_D,AM2_L,AM2_W,AM3_D,AM3_L,AM3_W,HTGD,ATGD,DiffFormPts,DiffLP
30,-0.554517,0.999379,0,1,0,1,0,0,0,0,...,1,0,0,0,0,1,-0.384897,1.178491,-0.75,0.527008
31,-0.554517,2.056671,1,0,0,0,0,1,0,1,...,0,0,1,0,0,1,-0.384897,1.576807,-1.25,0.316194
32,-2.13524,-1.643853,0,1,0,1,0,0,0,1,...,1,0,0,0,1,0,-1.180857,-0.414775,-0.25,-0.210841
33,-0.02761,2.056671,1,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0.411064,1.576807,-1.0,0.316194
34,1.026205,-2.172499,0,0,1,1,0,0,0,0,...,1,0,0,0,1,0,0.809044,-0.813092,1.5,-0.316248



Labels values:


Unnamed: 0,A,D,H
30,0,1,0
31,0,1,0
32,0,1,0
33,0,0,1
34,0,0,1


# Separate training from test (i.e. to be predicted) data

In [7]:
# The matches to predict are the last N_MATCHES_TO_PREDICT rows of features and labels.
last_matches = slice(-N_MATCHES_TO_PREDICT, None)
X_test = X_all_raw.iloc[last_matches, :]
y_test = y_all_raw.iloc[last_matches]
ntest, _  = X_test.shape

print("Matches to predict: last %d" % ntest)

# Plain nested lists: one inner list per match.
test_dataset, test_labels = X_test.values.tolist(), y_test.values.tolist()

Matches to predict: last 10


## Aux functions

In [8]:
def decode_probabilities(y):
    ''' Converts rows of class scores into result labels.

    Each row is read in the order [A, D, H] (away win, draw, home win), matching the
    column order of the one-hot encoded labels. The label of the highest score is
    chosen; on ties the earliest class in that order wins.

    Parameters
    ----------
    y : iterable of sequences of numbers
        One sequence of scores (or a one-hot row) per match.

    Returns
    -------
    list of str
        Exactly one label ('A', 'D' or 'H') per input row, in input order.

    Raises
    ------
    ValueError
        If a row's maximum cannot be matched to one of the three classes (for example
        when a score is NaN). Silently skipping such a row would shift every following
        label onto the wrong match.
    '''
    class_labels = ('A', 'D', 'H')
    labels = []
    for row_number, pred in enumerate(y):
        max_val = max(pred)
        for label, score in zip(class_labels, pred):
            if score == max_val:
                labels.append(label)
                break
        else:
            # No class matched the maximum: fail loudly instead of dropping the row.
            raise ValueError(
                "Cannot decode prediction at position {}: {!r}".format(row_number, pred))
    return labels

# Load the Deep Neural Network

In [9]:
# Predictions are collected one match at a time, in the same order as test_dataset.
y_pred = []

with tf.Session() as session:
    # Rebuild the saved graph from the '.meta' file, then restore the saved parameters into this session.
    saver = tf.train.import_meta_graph(classifierFilename+'.meta')
    saver.restore(session, classifierFilename)

    # NOTE(review): the returned graph definition is discarded, so this call looks like a leftover — confirm before removing.
    tf.get_default_graph().as_graph_def()
    
    # Look up the input and output tensors by name; these names must match the ones used when the graph was saved.
    tf_test_dataset = session.graph.get_tensor_by_name("predict_dataset:0")
    test_prediction = session.graph.get_tensor_by_name("predict:0")

    for y in test_dataset:
        # Feed a single match wrapped in a list (presumably a batch of one — TODO confirm the placeholder shape).
        feed_dict = {tf_test_dataset : [y]}
        # The first [0] picks the only fetched tensor; the second [0] picks the first row of that result.
        y_pred.append( list( session.run( [test_prediction], feed_dict=feed_dict)[0][0] ) )

INFO:tensorflow:Restoring parameters from football_classifier_serieA_DNN_b


In [10]:
# Raw model output: one list of three scores per match (presumably ordered [A, D, H] like the label columns — verify against the training notebook).
print(y_pred)

[[0.011376034, 0.54730374, 0.44132015], [0.80121994, 0.049815413, 0.14896464], [0.0039909012, 0.046243183, 0.94976592], [0.79608691, 0.012723695, 0.19118942], [0.23348156, 0.76485217, 0.0016662397], [0.53082842, 0.11170167, 0.35746998], [6.2796704e-07, 0.92763019, 0.072369106], [1.7453565e-05, 0.001137447, 0.9988451], [0.12612683, 0.0091325399, 0.86474061], [0.1259529, 0.72790992, 0.14613721]]


In [11]:
# Identifiers of the matches being predicted. .copy() gives an independent frame, so the
# column assignments below do not write into a slice of `teams`.
teams_pred  = teams.iloc[-N_MATCHES_TO_PREDICT:, :].copy()
teams_pred.index=teams_pred.index.tolist()

# Per-class probabilities rounded to 3 decimals; each prediction row is read as [A, D, H],
# the same order decode_probabilities uses.
Pred_prob_A = [np.round(probs[0], 3) for probs in y_pred]
Pred_prob_D = [np.round(probs[1], 3) for probs in y_pred]
Pred_prob_H = [np.round(probs[2], 3) for probs in y_pred]

teams_pred['Predictions'] = decode_probabilities(y_pred)
teams_pred['Pred_prob_A'] = Pred_prob_A
teams_pred['Pred_prob_D'] = Pred_prob_D
teams_pred['Pred_prob_H'] = Pred_prob_H

# The one-hot test labels are decoded with the same helper to recover the actual results.
teams_pred['Exact_res'] = decode_probabilities(test_labels)

In [12]:
# Final table: match identifiers, the predicted result and the three class probabilities.
summary_columns = ['Date', 'HomeTeam','AwayTeam', 'Predictions', 'Pred_prob_H', 'Pred_prob_D', 'Pred_prob_A']
display(teams_pred[summary_columns])

Unnamed: 0,Date,HomeTeam,AwayTeam,Predictions,Pred_prob_H,Pred_prob_D,Pred_prob_A
4289,2017-11-04,Bologna,Crotone,D,0.441,0.547,0.011
4290,2017-11-04,Genoa,Sampdoria,A,0.149,0.05,0.801
4291,2017-11-05,Inter,Torino,H,0.95,0.046,0.004
4292,2017-11-05,Cagliari,Verona,A,0.191,0.013,0.796
4293,2017-11-05,Chievo,Napoli,D,0.002,0.765,0.233
4294,2017-11-05,Fiorentina,Roma,A,0.357,0.112,0.531
4295,2017-11-05,Juventus,Benevento,D,0.072,0.928,0.0
4296,2017-11-05,Lazio,Udinese,H,0.999,0.001,0.0
4297,2017-11-05,Atalanta,Spal,H,0.865,0.009,0.126
4298,2017-11-05,Sassuolo,Milan,D,0.146,0.728,0.126
