In [381]:
import pandas as pd                 # pandas is a dataframe library
import matplotlib.pyplot as plt      # matplotlib.pyplot plots data

%matplotlib inline
# Load the match data into a DataFrame. The "utf-8-sig" codec drops a leading
# byte-order mark if the file starts with one.
df =pd.read_csv('DataModded2.csv', encoding="utf-8-sig")

In [382]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(380, 24)

In [383]:
# Preview the first five rows.
df.head(5)


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HF,AF,HC,AC,HY,AY,HR,AR,HW,AW
0,10/08/2018,Man United,Leicester,2,1,H,1,0,H,A Marriner,...,11,8,2,5,2,1,0,0,697.54,244.94
1,11/08/2018,Bournemouth,Cardiff,2,0,H,1,0,H,K Friend,...,11,9,7,4,1,1,0,0,130.56,52.4
2,11/08/2018,Fulham,Crystal Palace,0,2,A,0,1,A,M Dean,...,9,11,5,5,1,2,0,0,148.32,201.61
3,11/08/2018,Huddersfield,Chelsea,0,3,A,0,2,A,C Kavanagh,...,9,8,2,5,2,1,0,0,97.69,597.26
4,11/08/2018,Newcastle,Tottenham,1,2,A,1,2,A,M Atkinson,...,11,12,3,5,2,2,0,0,119.9,348.77


In [384]:
# Preview the last rows (pandas defaults to five).
df.tail()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HF,AF,HC,AC,HY,AY,HR,AR,HW,AW
375,12/05/2019,Liverpool,Wolves,2,0,H,1,0,H,M Atkinson,...,3,11,4,1,0,2,0,0,625.66,111.91
376,12/05/2019,Man United,Cardiff,0,2,A,0,1,A,J Moss,...,9,6,11,2,3,3,0,0,697.54,52.4
377,12/05/2019,Southampton,Huddersfield,1,1,D,1,0,H,L Probert,...,8,6,4,3,0,1,0,0,222.92,97.69
378,12/05/2019,Tottenham,Everton,2,2,D,1,0,H,A Marriner,...,10,13,7,4,0,2,0,0,348.77,341.67
379,12/05/2019,Watford,West Ham,1,4,A,0,2,A,C Kavanagh,...,10,10,7,2,1,0,1,0,149.21,213.15


### Definition of features
From the metadata of the data source, we have the following definitions of the features.

| Feature  | Description | Comments |
|--------------|-------------|--------|
|HS | Home Shots|
|AS | Away Shots|
| FTHG     | Full Time HomeTeam Goal         |
| FTAG | Full Time AwayTeam Goal       |
| FTR | Full Time Result | It's what we want to predict |
| HTHG | Half Time HomeTeam Goal|
| HTAG | Half Time AwayTeam Goal |
| HST |  HomeTeam Shots on Target |
| AST |  AwayTeam Shots on Target |
| HF | HomeTeam Fouls|
| AF | AwayTeam Fouls | 
| HC | HomeTeam Corners |  
| AC | AwayTeam Corners|
| HY | HomeTeam YellowCards|
| AY | AwayTeam YellowCards|
| HR | HomeTeam RedCards|
| AR | AwayTeam RedCards|
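| HW | HomeTeam Worth | price in millions |
| AW | AwayTeam Worth | price in millions |
| HR | HomeTeam Ratio | value described in %; note: the label `HR` is already used for HomeTeam RedCards above |
| AR | AwayTeam Ratio | value described in %; note: the label `AR` is already used for AwayTeam RedCards above |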


In [385]:
# True if at least one cell anywhere in the frame is missing.
df.isnull().values.any()
 

False

## Data standardization


In [386]:
from sklearn import preprocessing

def mean(results):
    """Return the arithmetic mean of ``results``.

    ``results`` must support both iteration and ``len()``. An empty
    sequence raises ``ZeroDivisionError``.
    """
    total = sum(results)
    count = len(results)
    return total / count
def Variance(results, mean):
    """Return the population variance of ``results`` around ``mean``.

    The sum of squared deviations is divided by ``len(results)`` (not
    ``len(results) - 1``), so this is the population form of the variance.
    ``mean`` is supplied by the caller and is not recomputed here.
    """
    squared_deviations = [(value - mean) ** 2 for value in results]
    return sum(squared_deviations) / len(results)

def standardize(column):
    """Return the z-scores of ``column`` as a plain list.

    Each value is centred on the column mean and divided by the population
    standard deviation (square root of ``Variance``).

    A column whose values are all equal has zero standard deviation. Dividing
    by it would produce NaN/inf (numpy scalars) or raise ``ZeroDivisionError``
    (plain Python numbers), so such a column is mapped to all zeros instead:
    every value equals the mean.
    """
    center = mean(column)
    # The standard deviation does not depend on the element, so compute it once.
    spread = Variance(column, center) ** (1 / 2.0)
    if spread == 0:
        return [0.0 for _ in column]
    return [(value - center) / spread for value in column]
    
# Numeric columns that are replaced by their z-scores, in the original order.
numeric_columns = [
    'AS', 'HS',
    'FTHG', 'FTAG',
    'HTHG', 'HTAG',
    'HST', 'AST',
    'HF', 'AF',
    'HC', 'AC',
    'HY', 'AY',
    'HR', 'AR',
    'HW', 'AW',
]

# Assign through a single column label (df[name]) rather than a one-element
# list of labels (df[[name]]): recent pandas versions reject a flat list of
# values for the list-of-labels form with
# "Columns must be same length as key".
for column_name in numeric_columns:
    df[column_name] = standardize(df[column_name])


df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HF,AF,HC,AC,HY,AY,HR,AR,HW,AW
0,10/08/2018,Man United,Leicester,0.329171,-0.214371,H,0.373461,-0.748986,H,A Marriner,...,0.257622,-0.658817,-1.248486,0.16405,0.387873,-0.566612,-0.222988,-0.277274,1.810249,-0.19992
1,11/08/2018,Bournemouth,Cardiff,0.329171,-1.062924,H,0.373461,-0.748986,H,K Friend,...,0.257622,-0.373029,0.436261,-0.202649,-0.43097,-0.566612,-0.222988,-0.277274,-0.707925,-1.055063
2,11/08/2018,Fulham,Crystal Palace,-1.196257,0.634181,A,-0.789778,0.556586,A,M Dean,...,-0.35043,0.198548,-0.237638,0.16405,-0.43097,0.261513,-0.222988,-0.277274,-0.629046,-0.392365
3,11/08/2018,Huddersfield,Chelsea,-1.196257,1.482734,A,-0.789778,1.862158,A,C Kavanagh,...,-0.35043,-0.658817,-1.248486,0.16405,0.387873,-0.566612,-0.222988,-0.277274,-0.853913,1.364867
4,11/08/2018,Newcastle,Tottenham,-0.433543,0.634181,A,0.373461,1.862158,A,M Atkinson,...,0.257622,0.484336,-0.911537,0.16405,0.387873,0.261513,-0.222988,-0.277274,-0.75527,0.261229
5,11/08/2018,Watford,Brighton,0.329171,-1.062924,H,0.373461,-0.748986,H,J Moss,...,-0.046404,1.627489,0.77321,-0.936048,0.387873,0.261513,-0.222988,-0.277274,-0.625093,-0.676391
6,11/08/2018,Wolves,Everton,0.329171,0.634181,D,0.373461,0.556586,D,C Pawson,...,-0.654455,-0.944606,-0.911537,0.530749,-1.249814,-0.566612,-0.222988,3.355972,-0.790757,0.229695
7,12/08/2018,Arsenal,Man City,-1.196257,0.634181,A,-0.789778,0.556586,A,M Oliver,...,0.257622,1.055913,-1.248486,1.630846,0.387873,0.261513,-0.222988,-0.277274,0.399178,2.559154
8,12/08/2018,Liverpool,West Ham,1.854599,-1.062924,H,1.536699,-0.748986,H,A Taylor,...,1.169699,-0.373029,-0.237638,-0.202649,-0.43097,0.261513,-0.222988,-0.277274,1.491002,-0.341111
9,12/08/2018,Southampton,Burnley,-1.196257,-1.062924,D,-0.789778,-0.748986,D,G Scott,...,-0.046404,-0.373029,0.77321,0.16405,-1.249814,-0.566612,-0.222988,-0.277274,-0.297719,-0.790801


## One-hot Encoding 


In [387]:
import numpy as np
def oneHot(column):
    """One-hot encode ``column`` into a DataFrame of 0/1 integers.

    Parameters
    ----------
    column : sized iterable of hashable values (e.g. a pandas Series or list)

    Returns
    -------
    pandas.DataFrame
        One row per element of ``column`` and one column per distinct value.
        Columns are ordered by first appearance in ``column`` and labelled
        with the value itself. The result carries a fresh 0..n-1 index.
        An empty ``column`` yields an empty (0, 0) frame.
    """
    # Map each distinct value to its column position in first-seen order.
    # A dict gives constant-time lookup instead of scanning a list per row.
    positions = {}
    for value in column:
        positions.setdefault(value, len(positions))

    # Pre-sizing as a 2-D array keeps the shape well-defined for empty input.
    encoded = np.zeros((len(column), len(positions)), dtype=int)
    for row, value in enumerate(column):
        encoded[row, positions[value]] = 1

    return pd.DataFrame(encoded, columns=list(positions))


# One-hot encode the three categorical columns.
data1 = oneHot(df['HomeTeam'])
# The same clubs appear as home and away sides, so away columns get an
# "Away_" prefix to keep their labels distinct from the home columns.
# A single rename with a callable replaces one frame copy per column.
data2 = oneHot(df['AwayTeam']).rename(columns=lambda team: "Away_" + team)
data3 = oneHot(df['Referee'])

# The raw categorical columns are replaced by their encoded versions.
df = df.drop(['HomeTeam', 'AwayTeam', 'Referee'], axis=1)
frames = [data1, data2, data3, df]

data = pd.concat(frames, axis=1, sort=False)

data
            

Unnamed: 0,Man United,Bournemouth,Fulham,Huddersfield,Newcastle,Watford,Wolves,Arsenal,Liverpool,Southampton,...,HF,AF,HC,AC,HY,AY,HR,AR,HW,AW
0,1,0,0,0,0,0,0,0,0,0,...,0.257622,-0.658817,-1.248486,0.164050,0.387873,-0.566612,-0.222988,-0.277274,1.810249,-0.199920
1,0,1,0,0,0,0,0,0,0,0,...,0.257622,-0.373029,0.436261,-0.202649,-0.430970,-0.566612,-0.222988,-0.277274,-0.707925,-1.055063
2,0,0,1,0,0,0,0,0,0,0,...,-0.350430,0.198548,-0.237638,0.164050,-0.430970,0.261513,-0.222988,-0.277274,-0.629046,-0.392365
3,0,0,0,1,0,0,0,0,0,0,...,-0.350430,-0.658817,-1.248486,0.164050,0.387873,-0.566612,-0.222988,-0.277274,-0.853913,1.364867
4,0,0,0,0,1,0,0,0,0,0,...,0.257622,0.484336,-0.911537,0.164050,0.387873,0.261513,-0.222988,-0.277274,-0.755270,0.261229
5,0,0,0,0,0,1,0,0,0,0,...,-0.046404,1.627489,0.773210,-0.936048,0.387873,0.261513,-0.222988,-0.277274,-0.625093,-0.676391
6,0,0,0,0,0,0,1,0,0,0,...,-0.654455,-0.944606,-0.911537,0.530749,-1.249814,-0.566612,-0.222988,3.355972,-0.790757,0.229695
7,0,0,0,0,0,0,0,1,0,0,...,0.257622,1.055913,-1.248486,1.630846,0.387873,0.261513,-0.222988,-0.277274,0.399178,2.559154
8,0,0,0,0,0,0,0,0,1,0,...,1.169699,-0.373029,-0.237638,-0.202649,-0.430970,0.261513,-0.222988,-0.277274,1.491002,-0.341111
9,0,0,0,0,0,0,0,0,0,1,...,-0.046404,-0.373029,0.773210,0.164050,-1.249814,-0.566612,-0.222988,-0.277274,-0.297719,-0.790801


In [388]:
# Remove the date, the half-time result and the goal counts from the frame.
# NOTE(review): presumably dropped because the goal columns directly
# determine FTR — confirm the intent.
data = data.drop(columns=['Date', 'HTR', 'FTHG', 'FTAG', 'HTAG', 'HTHG'])

# Move the FTR column to the last position, keeping the others in order.
cols = list(data.columns)
cols.append(cols.pop(cols.index('FTR')))
data = data[cols]
data.head()

Unnamed: 0,Man United,Bournemouth,Fulham,Huddersfield,Newcastle,Watford,Wolves,Arsenal,Liverpool,Southampton,...,AF,HC,AC,HY,AY,HR,AR,HW,AW,FTR
0,1,0,0,0,0,0,0,0,0,0,...,-0.658817,-1.248486,0.16405,0.387873,-0.566612,-0.222988,-0.277274,1.810249,-0.19992,H
1,0,1,0,0,0,0,0,0,0,0,...,-0.373029,0.436261,-0.202649,-0.43097,-0.566612,-0.222988,-0.277274,-0.707925,-1.055063,H
2,0,0,1,0,0,0,0,0,0,0,...,0.198548,-0.237638,0.16405,-0.43097,0.261513,-0.222988,-0.277274,-0.629046,-0.392365,A
3,0,0,0,1,0,0,0,0,0,0,...,-0.658817,-1.248486,0.16405,0.387873,-0.566612,-0.222988,-0.277274,-0.853913,1.364867,A
4,0,0,0,0,1,0,0,0,0,0,...,0.484336,-0.911537,0.16405,0.387873,0.261513,-0.222988,-0.277274,-0.75527,0.261229,A


In [389]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Select the target by name instead of a hard-coded column position, so the
# feature/target split stays correct if the number of one-hot columns
# (teams, referees) changes.
y = data['FTR'].values
X = data.drop(columns='FTR').values

split_test_size = 0.20

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42)

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(60, 2), random_state=1)

clf.fit(X_train, y_train)
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(60, 2),
              learning_rate='constant', learning_rate_init=0.001,
              max_iter=200, momentum=0.9, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5, random_state=1,
              shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
# Imported in this cell so that it runs on a fresh kernel without depending
# on an import made by any later cell.
from sklearn import metrics

# Accuracy of the fitted MLP on the held-out test split.
predictions = clf.predict(X_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predictions)))

Accuracy: 0.6974


In [390]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Fixed random_state keeps the forest reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train.ravel())

# Predictions are made on the held-out test split, so the score printed
# below is a test-set accuracy, not a training metric.
rf_predict_test = rf_model.predict(X_test)
# Alias under the earlier name, kept for any code that still refers to it.
rf_predict_train = rf_predict_test

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))
print(rf_predict_test)



Accuracy: 0.6184
['H' 'D' 'A' 'H' 'H' 'D' 'H' 'A' 'A' 'A' 'A' 'H' 'H' 'A' 'H' 'A' 'H' 'H'
 'A' 'A' 'A' 'H' 'A' 'H' 'A' 'H' 'A' 'A' 'A' 'A' 'H' 'H' 'H' 'A' 'H' 'H'
 'A' 'A' 'H' 'D' 'A' 'A' 'H' 'H' 'H' 'A' 'H' 'H' 'H' 'H' 'H' 'A' 'H' 'H'
 'H' 'H' 'A' 'D' 'H' 'H' 'A' 'H' 'H' 'D' 'H' 'H' 'A' 'A' 'H' 'H' 'H' 'H'
 'A' 'A' 'H' 'H']


