In [159]:
import pandas as pd                 # pandas is a dataframe library
import matplotlib.pyplot as plt      # matplotlib.pyplot plots data

%matplotlib inline
# Load the season-1819 match data; the "utf-8-sig" codec drops a leading BOM if the file has one.
df = pd.read_csv("season-1819.csv", encoding="utf-8-sig")

In [160]:
# (number of rows, number of columns) of the loaded frame
df.shape

(290, 22)

In [161]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,10/8/2018,Man United,Leicester,2,1,H,1,0,H,A Marriner,...,6,4,11,8,2,5,2,1,0,0
1,11/8/2018,Bournemouth,Cardiff,2,0,H,1,0,H,K Friend,...,4,1,11,9,7,4,1,1,0,0
2,11/8/2018,Fulham,Crystal Palace,0,2,A,0,1,A,M Dean,...,6,9,9,11,5,5,1,2,0,0
3,11/8/2018,Huddersfield,Chelsea,0,3,A,0,2,A,C Kavanagh,...,1,4,9,8,2,5,2,1,0,0
4,11/8/2018,Newcastle,Tottenham,1,2,A,1,2,A,M Atkinson,...,2,5,11,12,3,5,2,2,0,0


In [162]:
# Preview the last five rows of the frame.
df.tail(5)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
285,2/3/2019,Wolves,Cardiff,2,0,H,2,0,H,A Marriner,...,6,4,11,6,7,8,1,2,0,0
286,3/3/2019,Everton,Liverpool,0,0,D,0,0,D,M Atkinson,...,3,3,12,10,3,7,1,2,0,0
287,3/3/2019,Fulham,Chelsea,1,2,A,1,2,A,G Scott,...,5,7,11,10,5,4,2,1,0,0
288,3/3/2019,Watford,Leicester,2,1,H,1,0,H,J Moss,...,5,2,15,12,1,5,5,1,0,0
289,30/01/19,Arsenal,Tottenham,5,0,H,3,0,H,J Moss,...,12,3,3,10,6,2,1,1,0,0


### Definition of features
From the metadata on the data source we have the following definition of the features.

| Feature  | Description | Comments |
|--------------|-------------|--------|
|HS | Home Shots|
|AS | Away Shots|
| FTHG | Full Time HomeTeam Goals |
| FTAG | Full Time AwayTeam Goals |
| FTR | Full Time Result | It's what we want to predict |
| HTHG | Half Time HomeTeam Goals |
| HTAG | Half Time AwayTeam Goals |
| HST | HomeTeam Shots on Target |
| AST | AwayTeam Shots on Target |
| HF | HomeTeam Fouls|
| AF | AwayTeam Fouls | 
| HC | HomeTeam Corners |  
| AC | AwayTeam Corners|
| HY | HomeTeam YellowCards|
| AY | AwayTeam YellowCards|
| HR | HomeTeam RedCards|
| AR | AwayTeam RedCards|


In [163]:
# True if any cell anywhere in the frame is missing; False means there are no nulls at all.
df.isnull().values.any()

False

## Data standardization


In [165]:
from sklearn import preprocessing

# Standardize every numeric match statistic, one column per call.
# preprocessing.scale centers a column on its mean and scales it to unit variance.
for stat in ('AS', 'HS', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HST', 'AST',
             'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR'):
    df[[stat]] = preprocessing.scale(df[[stat]])

df.head(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,10/8/2018,Man United,Leicester,0.314085,-0.213512,H,0.32906,-0.742201,H,A Marriner,...,0.42624,0.032346,0.163848,-0.663716,-1.261698,0.157677,0.371912,-0.595761,-0.23355,-0.300376
1,11/8/2018,Bournemouth,Cardiff,0.314085,-1.073489,H,0.32906,-0.742201,H,K Friend,...,-0.311728,-1.307688,0.163848,-0.382727,0.454495,-0.208134,-0.451404,-0.595761,-0.23355,-0.300376
2,11/8/2018,Fulham,Crystal Palace,-1.166967,0.646466,A,-0.793614,0.538979,A,M Dean,...,0.42624,2.265736,-0.453242,0.179252,-0.231982,0.157677,-0.451404,0.230893,-0.23355,-0.300376
3,11/8/2018,Huddersfield,Chelsea,-1.166967,1.506444,A,-0.793614,1.82016,A,C Kavanagh,...,-1.41868,0.032346,-0.453242,-0.663716,-1.261698,0.157677,0.371912,-0.595761,-0.23355,-0.300376
4,11/8/2018,Newcastle,Tottenham,-0.426441,0.646466,A,0.32906,1.82016,A,M Atkinson,...,-1.049696,0.479024,0.163848,0.460241,-0.91846,0.157677,0.371912,0.230893,-0.23355,-0.300376
5,11/8/2018,Watford,Brighton,0.314085,-1.073489,H,0.32906,-0.742201,H,J Moss,...,0.057256,-1.754366,-0.144697,1.584197,0.797734,-0.939757,0.371912,0.230893,-0.23355,-0.300376
6,11/8/2018,Wolves,Everton,0.314085,0.646466,D,0.32906,0.538979,D,C Pawson,...,-0.311728,0.479024,-0.761787,-0.944705,-0.91846,0.523489,-1.27472,-0.595761,-0.23355,3.329164
7,12/8/2018,Arsenal,Man City,-1.166967,0.646466,A,-0.793614,0.538979,A,M Oliver,...,-0.680712,1.819058,0.163848,1.022219,-1.261698,1.620924,0.371912,0.230893,-0.23355,-0.300376
8,12/8/2018,Liverpool,West Ham,1.795138,-1.073489,H,1.451734,-0.742201,H,A Taylor,...,1.164208,-0.86101,1.089483,-0.382727,-0.231982,-0.208134,-0.451404,0.230893,-0.23355,-0.300376
9,12/8/2018,Southampton,Burnley,-1.166967,-1.073489,D,-0.793614,-0.742201,D,G Scott,...,-0.680712,0.925702,-0.144697,-0.382727,0.797734,0.157677,-1.27472,-0.595761,-0.23355,-0.300376


## One-hot Encoding 


In [166]:
# Replace each categorical column with one indicator column per distinct value.
df = pd.get_dummies(
    df,
    columns=['HomeTeam', 'AwayTeam', 'Referee'],
)
df.head()

Unnamed: 0,Date,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,...,Referee_K Friend,Referee_L Mason,Referee_L Probert,Referee_M Atkinson,Referee_M Dean,Referee_M Oliver,Referee_P Tierney,Referee_R East,Referee_S Attwell,Referee_S Hooper
0,10/8/2018,0.314085,-0.213512,H,0.32906,-0.742201,H,-1.032374,0.483825,0.42624,...,0,0,0,0,0,0,0,0,0,0
1,11/8/2018,0.314085,-1.073489,H,0.32906,-0.742201,H,-0.355024,-0.213075,-0.311728,...,1,0,0,0,0,0,0,0,0,0
2,11/8/2018,-1.166967,0.646466,A,-0.793614,0.538979,A,0.152987,-0.213075,0.42624,...,0,0,0,0,1,0,0,0,0,0
3,11/8/2018,-1.166967,1.506444,A,-0.793614,1.82016,A,-1.371048,0.483825,-1.41868,...,0,0,0,0,0,0,0,0,0,0
4,11/8/2018,-0.426441,0.646466,A,0.32906,1.82016,A,0.152987,0.948426,-1.049696,...,0,0,0,1,0,0,0,0,0,0


In [167]:
# Drop the match date, the half-time result, and the half-/full-time goal counts
# in a single call.
df = df.drop(columns=['Date', 'HTR', 'FTHG', 'FTAG', 'HTHG', 'HTAG'])

# Move the label column FTR to the last position so that every column before it
# is a feature.
cols = list(df.columns)
cols.append(cols.pop(cols.index('FTR')))
df = df[cols]
df

Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,...,Referee_L Mason,Referee_L Probert,Referee_M Atkinson,Referee_M Dean,Referee_M Oliver,Referee_P Tierney,Referee_R East,Referee_S Attwell,Referee_S Hooper,FTR
0,-1.032374,0.483825,0.426240,0.032346,0.163848,-0.663716,-1.261698,0.157677,0.371912,-0.595761,...,0,0,0,0,0,0,0,0,0,H
1,-0.355024,-0.213075,-0.311728,-1.307688,0.163848,-0.382727,0.454495,-0.208134,-0.451404,-0.595761,...,0,0,0,0,0,0,0,0,0,H
2,0.152987,-0.213075,0.426240,2.265736,-0.453242,0.179252,-0.231982,0.157677,-0.451404,0.230893,...,0,0,0,1,0,0,0,0,0,A
3,-1.371048,0.483825,-1.418680,0.032346,-0.453242,-0.663716,-1.261698,0.157677,0.371912,-0.595761,...,0,0,0,0,0,0,0,0,0,A
4,0.152987,0.948426,-1.049696,0.479024,0.163848,0.460241,-0.918460,0.157677,0.371912,0.230893,...,0,0,1,0,0,0,0,0,0,A
5,0.830337,-1.142276,0.057256,-1.754366,-0.144697,1.584197,0.797734,-0.939757,0.371912,0.230893,...,0,0,0,0,0,0,0,0,0,H
6,-0.524362,-1.142276,-0.311728,0.479024,-0.761787,-0.944705,-0.918460,0.523489,-1.274720,-0.595761,...,0,0,0,0,0,0,0,0,0,D
7,-0.863036,1.413026,-0.680712,1.819058,0.163848,1.022219,-1.261698,1.620924,0.371912,0.230893,...,0,0,0,0,1,0,0,0,0,A
8,0.660999,-1.374576,1.164208,-0.861010,1.089483,-0.382727,-0.231982,-0.208134,-0.451404,0.230893,...,0,0,0,0,0,0,0,0,0,H
9,0.660999,1.180726,-0.680712,0.925702,-0.144697,-0.382727,0.797734,0.157677,-1.274720,-0.595761,...,0,0,0,0,0,0,0,0,0,D


In [168]:
from sklearn.model_selection import train_test_split

# Pick the label and the features by column NAME rather than by the hard-coded
# position 70, so the split stays correct if the number of one-hot columns changes
# (for example when a new team or referee appears in the data).
y = df['FTR'].values

# 'Unnamed: 0' is the row number that was saved alongside the CSV, not a match
# statistic, so it must not be fed to the model as a feature.
# errors='ignore' keeps this cell working if the CSV is later loaded with index_col=0.
X = df.drop(columns=['FTR', 'Unnamed: 0'], errors='ignore').values

# Fraction of the matches held out for evaluation.
split_test_size = 0.40

# random_state pins the shuffle so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=42
)

In [171]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Fit a random forest on the training split; the fixed seed makes the fit repeatable.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train.ravel())

# Predict the held-out TEST split. These are test predictions, so they are named
# accordingly; the previous name is kept as an alias for any cell that still uses it.
rf_predict_test = rf_model.predict(X_test)
rf_predict_train = rf_predict_test  # legacy (misleading) name, same array

# Test-set metrics: accuracy is measured against y_test, not the training labels.
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_test)))
print(rf_predict_test)



Accuracy: 0.5431
['H' 'H' 'D' 'A' 'H' 'D' 'H' 'H' 'D' 'A' 'H' 'A' 'A' 'D' 'H' 'A' 'A' 'D'
 'A' 'A' 'H' 'H' 'H' 'A' 'H' 'A' 'H' 'H' 'A' 'A' 'A' 'H' 'A' 'H' 'H' 'H'
 'A' 'H' 'H' 'A' 'H' 'A' 'A' 'A' 'H' 'H' 'A' 'D' 'D' 'D' 'A' 'H' 'A' 'H'
 'A' 'H' 'D' 'H' 'H' 'D' 'A' 'A' 'A' 'A' 'H' 'H' 'H' 'D' 'H' 'H' 'A' 'A'
 'A' 'A' 'H' 'A' 'A' 'A' 'A' 'H' 'A' 'A' 'A' 'H' 'H' 'A' 'H' 'H' 'A' 'H'
 'H' 'A' 'D' 'H' 'A' 'H' 'H' 'A' 'A' 'H' 'D' 'A' 'A' 'A' 'H' 'H' 'D' 'H'
 'H' 'D' 'A' 'D' 'A' 'D' 'A' 'H']


