In [1]:
import os
import glob
import numpy as np
import pandas as pd
from functions import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
os.chdir("C:/Users/mycoo/OneDrive/Documents/SportsAnalytics/Machine-Learning-with-the-NFL/Predicting-PremLeague-Outcomes-With-ML/files/Datasets")

Data Taken from http://www.football-data.co.uk/englandm.php

In [2]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [3]:
#combine all files in the list
master_sheet = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
master_sheet.to_csv( "master_sheet.csv", index=False, encoding='utf-8-sig')

Reduce table to using columns needed for the model

Columns Required:

Date, Time HomeTeam, AwayTeam, FullTimeHG, FullTimeAG, FullTimeRresults, HalfTimeHG, HalfTimeAG, HomeShots, AwayShots, HomeShotsTarger, AwayShotsTarger, HomeCorners, AwayCorners, HomeFouls, AwayFouls, HomeRed, AwayRed

Columns To Consider:

Attendance, Referee

In [4]:
subset_master = master_sheet[["Date", "Time", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", "HTHG", "HTAG", "HS", "AS", "HST", "AST", "HC", "AC", "HF", "AF", "HR", "AR"]]
subset_master = subset_master.fillna(0)
# 0 = Home Wins, 1 = Draw, 2 = Away Wins
subset_master.FTR.replace(['H', 'D', 'A'], [0, 1, 2], inplace = True)

In [5]:
subset_master.head()

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HS,AS,HST,AST,HC,AC,HF,AF,HR,AR
0,14/08/10,0,Aston Villa,West Ham,3.0,0.0,0,2.0,0.0,23.0,12.0,11.0,2.0,16.0,7.0,15.0,15.0,0.0,0.0
1,14/08/10,0,Blackburn,Everton,1.0,0.0,0,1.0,0.0,7.0,17.0,2.0,12.0,1.0,3.0,19.0,14.0,0.0,0.0
2,14/08/10,0,Bolton,Fulham,0.0,0.0,1,0.0,0.0,13.0,12.0,9.0,7.0,4.0,8.0,12.0,13.0,0.0,0.0
3,14/08/10,0,Chelsea,West Brom,6.0,0.0,0,2.0,0.0,18.0,10.0,13.0,4.0,3.0,1.0,10.0,10.0,0.0,0.0
4,14/08/10,0,Sunderland,Birmingham,2.0,2.0,1,1.0,0.0,6.0,13.0,2.0,7.0,3.0,6.0,13.0,10.0,1.0,0.0


In [6]:
msk = np.random.rand(len(subset_master)) < 0.75

train_df = subset_master[msk]
test_df = subset_master[~msk]

X_train = train_df.drop(columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR'])
y_train = train_df[['FTR']] 
X_test = test_df.drop(columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR'])
y_test = test_df[['FTR']]

In [7]:
clf = MLPClassifier(solver='lbfgs', alpha=.01,random_state=1)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
y_pred = pd.DataFrame(y_pred)

Identifying how often Predicted Results decided the outcome (win, draw, away) against the actual results)

In [8]:
metrics(subset_master, y_pred)

False    64.185538
True     35.814462
Name: Comparison, dtype: float64
0    45.281899
2    30.182727
1    24.535374
Name: FTR, dtype: float64
0    45.566141
2    29.742308
1    24.691551
Name: PredResult, dtype: float64


In [9]:
subset_master

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HS,...,AST,HC,AC,HF,AF,HR,AR,HomeWinPerc,DrawPerc,AwayPerc
0,14/08/10,0,Aston Villa,West Ham,3.0,0.0,0,2.0,0.0,23.0,...,2.0,16.0,7.0,15.0,15.0,0.0,0.0,3.547928e-11,1.000000e+00,3.866349e-08
1,14/08/10,0,Blackburn,Everton,1.0,0.0,0,1.0,0.0,7.0,...,12.0,1.0,3.0,19.0,14.0,0.0,0.0,5.714877e-12,9.999996e-01,3.900786e-07
2,14/08/10,0,Bolton,Fulham,0.0,0.0,1,0.0,0.0,13.0,...,7.0,4.0,8.0,12.0,13.0,0.0,0.0,2.985821e-41,2.334081e-09,1.000000e+00
3,14/08/10,0,Chelsea,West Brom,6.0,0.0,0,2.0,0.0,18.0,...,4.0,3.0,1.0,10.0,10.0,0.0,0.0,1.000000e+00,6.331211e-11,3.298107e-31
4,14/08/10,0,Sunderland,Birmingham,2.0,2.0,1,1.0,0.0,6.0,...,7.0,3.0,6.0,13.0,10.0,1.0,0.0,1.000000e+00,6.591558e-98,7.130591e-181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108697,07/02/2021,12:00,Tottenham,West Brom,2.0,0.0,0,0.0,0.0,13.0,...,1.0,8.0,0.0,16.0,12.0,0.0,0.0,,,
108698,07/02/2021,14:00,Wolves,Leicester,0.0,0.0,1,0.0,0.0,13.0,...,3.0,8.0,2.0,11.0,15.0,0.0,0.0,,,
108699,07/02/2021,16:30,Liverpool,Man City,1.0,4.0,2,0.0,0.0,8.0,...,5.0,6.0,1.0,13.0,8.0,0.0,0.0,,,
108700,07/02/2021,19:15,Sheffield United,Chelsea,1.0,2.0,2,0.0,1.0,8.0,...,3.0,1.0,8.0,8.0,8.0,0.0,0.0,,,
