In [1]:
import os
import glob
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
os.chdir("C:/Users/mycoo/OneDrive/Documents/SportsAnalytics/Machine-Learning-with-the-NFL/Predicting-PremLeague-Outcomes-With-ML/files/Datasets")

Data Taken from http://www.football-data.co.uk/englandm.php

In [2]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [3]:
#combine all files in the list
master_sheet = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
master_sheet.to_csv( "master_sheet.csv", index=False, encoding='utf-8-sig')

Reduce table to using columns needed for the model

Columns Required:

Date, Time HomeTeam, AwayTeam, FTHG, FTAG, FTR, HTHG, HTAG, HS, AS, HST, AST, HC, AC, HF, AF, HR, AR

Columns To Consider:

Attendance, Referee

In [4]:
subset_master = master_sheet[["Date", "Time", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", "HTHG", "HTAG", "HS", "AS", "HST", "AST", "HC", "AC", "HF", "AF", "HR", "AR"]]
subset_master = subset_master.fillna(0)
# 0 = Home Wins, 1 = Draw, 2 = Away Wins
subset_master.FTR.replace(['H', 'D', 'A'], [0, 1, 2], inplace = True)

In [5]:
subset_master.head()

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HS,AS,HST,AST,HC,AC,HF,AF,HR,AR
0,14/08/10,0,Aston Villa,West Ham,3.0,0.0,0,2.0,0.0,23.0,12.0,11.0,2.0,16.0,7.0,15.0,15.0,0.0,0.0
1,14/08/10,0,Blackburn,Everton,1.0,0.0,0,1.0,0.0,7.0,17.0,2.0,12.0,1.0,3.0,19.0,14.0,0.0,0.0
2,14/08/10,0,Bolton,Fulham,0.0,0.0,1,0.0,0.0,13.0,12.0,9.0,7.0,4.0,8.0,12.0,13.0,0.0,0.0
3,14/08/10,0,Chelsea,West Brom,6.0,0.0,0,2.0,0.0,18.0,10.0,13.0,4.0,3.0,1.0,10.0,10.0,0.0,0.0
4,14/08/10,0,Sunderland,Birmingham,2.0,2.0,1,1.0,0.0,6.0,13.0,2.0,7.0,3.0,6.0,13.0,10.0,1.0,0.0


In [6]:
msk = np.random.rand(len(subset_master)) < 0.70

train_df = subset_master[msk]
test_df = subset_master[~msk]

X_train = train_df.drop(columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR'])
y_train = train_df[['FTR']] 
X_test = test_df.drop(columns = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR'])
y_test = test_df[['FTR']]

In [7]:
clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=10000, multi_class='ovr', verbose=0)

clf.fit(X_train, np.ravel(y_train.values))

y_pred = clf.predict_proba(X_test)
y_pred = pd.DataFrame(y_pred)

In [8]:
subset_master.loc[:, 'HomeWinPerc'] = y_pred[0]
subset_master.loc[:, 'DrawPerc'] = y_pred[1]
subset_master.loc[:, 'AwayPerc'] = y_pred[2]

conditions = [
    (subset_master['HomeWinPerc'] >= subset_master['DrawPerc']) & (subset_master['HomeWinPerc'] >= subset_master['AwayPerc']), #Home Condition
    (subset_master['HomeWinPerc'] <= subset_master['DrawPerc']) & (subset_master['DrawPerc'] >= subset_master['AwayPerc']), #Draw Condition
    (subset_master['HomeWinPerc'] <= subset_master['DrawPerc']) & (subset_master['DrawPerc'] <= subset_master['AwayPerc']) #Away Condition
]

values = [0, 1, 2]

subset_master["PredResult"] = np.select(conditions, values)

subset_master.head()

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HS,...,HC,AC,HF,AF,HR,AR,HomeWinPerc,DrawPerc,AwayPerc,PredResult
0,14/08/10,0,Aston Villa,West Ham,3.0,0.0,0,2.0,0.0,23.0,...,16.0,7.0,15.0,15.0,0.0,0.0,0.6702709,0.329729,9.715962e-09,0
1,14/08/10,0,Blackburn,Everton,1.0,0.0,0,1.0,0.0,7.0,...,1.0,3.0,19.0,14.0,0.0,0.0,0.03668428,0.947204,0.01611158,1
2,14/08/10,0,Bolton,Fulham,0.0,0.0,1,0.0,0.0,13.0,...,4.0,8.0,12.0,13.0,0.0,0.0,5.199766e-21,0.238048,0.7619516,2
3,14/08/10,0,Chelsea,West Brom,6.0,0.0,0,2.0,0.0,18.0,...,3.0,1.0,10.0,10.0,0.0,0.0,0.8335284,0.166472,6.968158e-19,0
4,14/08/10,0,Sunderland,Birmingham,2.0,2.0,1,1.0,0.0,6.0,...,3.0,6.0,13.0,10.0,1.0,0.0,0.8484048,0.151595,4.823794999999999e-19,0
