# Machine Learning

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn import tree
import numpy as np

Read in eda dataframe ('significant data' used as discussed in EDA notebook)

In [2]:
df_in=pd.read_csv('eda_sig.csv')
df_in.drop(['Unnamed: 0'],axis=1,inplace=True)
df_in.head()

Unnamed: 0,id,fixture_id,G,A,CS,gw,MP,I,C,T,ICT,team,opp,home,opp_att,opp_def,att,def,Sh90,KP90
0,14.0,10,0.0,0.0,1.0,1.0,90.0,18.0,0.0,0.0,1.8,1.0,13.0,False,1040.0,1020.0,1190.0,1330.0,0.0,0.0
1,4.0,10,0.0,1.0,1.0,1.0,90.0,38.4,12.3,0.0,5.1,1.0,13.0,False,1040.0,1020.0,1190.0,1330.0,0.0,1.0
2,5.0,10,0.0,0.0,1.0,1.0,90.0,20.2,0.4,0.0,2.1,1.0,13.0,False,1040.0,1020.0,1190.0,1330.0,0.0,0.0
3,6.0,10,0.0,0.0,1.0,1.0,90.0,14.0,15.6,4.0,3.4,1.0,13.0,False,1040.0,1020.0,1190.0,1330.0,0.0,1.0
4,467.0,10,0.0,0.0,1.0,1.0,90.0,15.0,0.2,0.0,1.5,1.0,13.0,False,1040.0,1020.0,1190.0,1330.0,0.0,0.0


In [3]:
df_in.isna().sum()

id            0
fixture_id    0
G             0
A             0
CS            0
gw            0
MP            0
I             0
C             0
T             0
ICT           0
team          0
opp           0
home          0
opp_att       0
opp_def       0
att           0
def           0
Sh90          0
KP90          0
dtype: int64

In [4]:
df_in.describe()

Unnamed: 0,id,fixture_id,G,A,CS,gw,MP,I,C,T,ICT,team,opp,opp_att,opp_def,att,def,Sh90,KP90
count,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0,6077.0
mean,263.379134,135.000823,0.10729,0.094784,0.246832,13.986177,81.362514,18.191048,11.889106,13.737206,4.38106,10.08705,10.525588,1142.246174,1156.075366,1136.552575,1149.037354,1.195759,0.878614
std,152.926801,77.854884,0.347579,0.318769,0.431204,7.807371,18.473806,14.977992,14.868403,19.799702,3.628315,5.718301,5.764572,101.401502,119.37674,99.601639,118.248316,1.666963,1.341934
min,1.0,1.0,0.0,0.0,0.0,1.0,21.0,0.0,0.0,0.0,0.0,1.0,1.0,990.0,1020.0,990.0,1020.0,0.0,0.0
25%,135.0,67.0,0.0,0.0,0.0,7.0,81.0,8.0,1.0,0.0,1.9,5.0,6.0,1060.0,1050.0,1060.0,1050.0,0.0,0.0
50%,255.0,135.0,0.0,0.0,0.0,14.0,90.0,14.6,5.3,5.0,3.3,10.0,11.0,1120.0,1120.0,1120.0,1110.0,1.0,0.0
75%,399.0,202.0,0.0,0.0,0.0,21.0,90.0,24.2,17.0,20.0,5.7,15.0,16.0,1210.0,1280.0,1190.0,1280.0,2.0,1.13
max,621.0,270.0,3.0,3.0,1.0,27.0,180.0,131.0,127.7,176.0,26.8,20.0,20.0,1330.0,1370.0,1330.0,1370.0,18.0,14.21


In [5]:
df_in.dtypes

id            float64
fixture_id      int64
G             float64
A             float64
CS            float64
gw            float64
MP            float64
I             float64
C             float64
T             float64
ICT           float64
team          float64
opp           float64
home             bool
opp_att       float64
opp_def       float64
att           float64
def           float64
Sh90          float64
KP90          float64
dtype: object

In [6]:
df_in['home']=df_in['home'].astype(int)

Data is ready for ML processes.

## Setting up Model

Setup pipelines for classification models

In [7]:
pipe_log = Pipeline([('scl', StandardScaler()),
			('clf', LogisticRegression(random_state=14))])

In [8]:
pipe_svm = Pipeline([('scl', StandardScaler()),
			('clf', svm.SVC(random_state=14))])

In [9]:
pipe_dt = Pipeline([('scl', StandardScaler()),
			('clf', tree.DecisionTreeClassifier(random_state=14))])

In [10]:
pipelines = [pipe_log,pipe_svm,pipe_dt]

In [11]:
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree'}

## Goals

Compare models precision for predicting goals scored

In [12]:
X_G_train, X_G_test, y_G_train, y_G_test = train_test_split(df_in[['Sh90','I','T']],df_in['G'],test_size=0.2, random_state=14, stratify=df_in['G'])

In [13]:
for pipe in pipelines:
    pipe.fit(X=X_G_train,y=y_G_train)

In [14]:
for idx,val in enumerate(pipelines):
    print(pipe_dict[idx], '\n', classification_report(y_true=y_G_test, y_pred=pipelines[idx].predict(X_G_test)))

Logistic Regression 
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      1099
         1.0       0.75      0.76      0.76       105
         2.0       0.90      0.90      0.90        10
         3.0       1.00      1.00      1.00         2

    accuracy                           0.96      1216
   macro avg       0.91      0.91      0.91      1216
weighted avg       0.96      0.96      0.96      1216

Support Vector Machine 
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      1099
         1.0       0.82      0.92      0.87       105
         2.0       0.73      0.80      0.76        10
         3.0       1.00      0.50      0.67         2

    accuracy                           0.97      1216
   macro avg       0.89      0.80      0.82      1216
weighted avg       0.98      0.97      0.98      1216

Decision Tree 
               precision    recall  f1-score   support

         0.0

Decide to use SVM for its superior precision in calculating 1 goal scored (the most common non zero amount)

In [15]:
pipe_g=Pipeline([('scl', StandardScaler()),
			('clf', svm.SVC(random_state=14))])

In [16]:
pipe_g.fit(X=X_G_train,y=y_G_train)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=14, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [17]:
print(classification_report(y_true=y_G_test, y_pred=pipe_g.predict(X_G_test)))

precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      1099
         1.0       0.82      0.92      0.87       105
         2.0       0.73      0.80      0.76        10
         3.0       1.00      0.50      0.67         2

    accuracy                           0.97      1216
   macro avg       0.89      0.80      0.82      1216
weighted avg       0.98      0.97      0.98      1216



For future development, hyperparameter optimization and possible employment of an ensemble method with voting that incorporates SVM LogReg and DT methods can be used to improve precision.

## Assists 

In [18]:
X_A_train, X_A_test, y_A_train, y_A_test = train_test_split(df_in[['KP90','I','C']],df_in['A'],test_size=0.2, random_state=14, stratify=df_in['A'])

In [19]:
for pipe in pipelines:
    pipe.fit(X=X_A_train,y=y_A_train)

In [20]:
for idx,val in enumerate(pipelines):
    print(pipe_dict[idx], '\n', classification_report(y_true=y_A_test, y_pred=pipelines[idx].predict(X_A_test)))

Logistic Regression 
               precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      1110
         1.0       0.47      0.14      0.22        98
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00         1

    accuracy                           0.91      1216
   macro avg       0.35      0.28      0.29      1216
weighted avg       0.88      0.91      0.89      1216

Support Vector Machine 
               precision    recall  f1-score   support

         0.0       0.92      0.99      0.95      1110
         1.0       0.45      0.10      0.17        98
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00         1

    accuracy                           0.91      1216
   macro avg       0.34      0.27      0.28      1216
weighted avg       0.88      0.91      0.89      1216

Decision Tree 
               precision    recall  f1-score   support

         0.0

The accuracies are quite poor for all methods. We choose logistic regression for its superior accuracy of 1 assist predictions. This accuracy is still poor, future development must consider data accuracy, hyperparameter optimization and ensemble methods.

In [21]:
pipe_a=Pipeline([('scl', StandardScaler()),
			('clf', LogisticRegression(random_state=14))])

In [22]:
pipe_a.fit(X=X_A_train,y=y_A_train)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=14,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [23]:
print(classification_report(y_true=y_A_test, y_pred=pipe_a.predict(X_A_test)))

precision    recall  f1-score   support

         0.0       0.93      0.99      0.96      1110
         1.0       0.47      0.14      0.22        98
         2.0       0.00      0.00      0.00         7
         3.0       0.00      0.00      0.00         1

    accuracy                           0.91      1216
   macro avg       0.35      0.28      0.29      1216
weighted avg       0.88      0.91      0.89      1216



## Clean Sheets

In [24]:
X_CS_train, X_CS_test, y_CS_train, y_CS_test = train_test_split(df_in[['opp_att','def','home']],df_in['CS'],test_size=0.2, random_state=14, stratify=df_in['CS'])

In [25]:
for pipe in pipelines:
    pipe.fit(X=X_CS_train,y=y_CS_train)

In [26]:
for idx,val in enumerate(pipelines):
    print(pipe_dict[idx], '\n', classification_report(y_true=y_CS_test, y_pred=pipelines[idx].predict(X_CS_test)))

Logistic Regression 
               precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       916
         1.0       0.00      0.00      0.00       300

    accuracy                           0.75      1216
   macro avg       0.38      0.50      0.43      1216
weighted avg       0.57      0.75      0.65      1216

Support Vector Machine 
               precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       916
         1.0       0.00      0.00      0.00       300

    accuracy                           0.75      1216
   macro avg       0.38      0.50      0.43      1216
weighted avg       0.57      0.75      0.65      1216

Decision Tree 
               precision    recall  f1-score   support

         0.0       0.89      0.94      0.92       916
         1.0       0.78      0.66      0.72       300

    accuracy                           0.87      1216
   macro avg       0.84      0.80      0.82      1216
weighted av

Logistic regression and SVM methods fail to predict clean sheet occurances at all. Decision tree accuracy is 78% for clean sheet and 89% for non clean sheet, this is fairly accurate but further optimization can be done.

In [28]:
pipe_cs=Pipeline([('scl', StandardScaler()),
			('clf', tree.DecisionTreeClassifier(random_state=14))])

In [29]:
pipe_cs.fit(X=X_CS_train,y=y_CS_train)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=14,
                                        splitter='best'))],
         verbose=False)

In [30]:
print(classification_report(y_true=y_CS_test, y_pred=pipe_cs.predict(X_CS_test)))

precision    recall  f1-score   support

         0.0       0.89      0.94      0.92       916
         1.0       0.78      0.66      0.72       300

    accuracy                           0.87      1216
   macro avg       0.84      0.80      0.82      1216
weighted avg       0.87      0.87      0.87      1216



# Points, Captain & Vice Captain Calculator

Input FPL team. Select player's ID, position, team, opposition team and whether they play home or not. Alternatively read from a csv file

In [31]:
team_df=pd.DataFrame(np.array([[131,1,7,18,1],[48,1,4,20,0],[258,2,13,16,0],[407,2,20,4,1],[182,2,10,3,1],[128,2,7,18,1],[65,2,3,10,0],[301,3,15,14,1],[191,3,10,3,1],[463,3,6,8,1],[171,3,9,2,1],[215,3,11,12,0],[409,4,20,4,1],[313,4,16,13,1],[187,4,10,3,1]]),columns=['id','pos','team','opp','home'])

In [32]:
#team_df=pd.read_csv('myteam.csv')
#team_df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [33]:
team_df.head()

Unnamed: 0,id,pos,team,opp,home
0,131,1,7,18,1
1,48,1,4,20,0
2,258,2,13,16,0
3,407,2,20,4,1
4,182,2,10,3,1


In [14]:
#team_df.to_csv('myteam.csv')

Pull and merge ratings for teams and oppositions that are relevant to CS prediction. This process is similar to the one in EDA notebook.

In [35]:
ratings_df=pd.read_csv('data_clean/teams_clean.csv')

In [36]:
ratings_df=ratings_df[['id','strength_attack_home','strength_attack_away','strength_defence_home','strength_defence_away']]


In [37]:
ratings_df.head()

Unnamed: 0,id,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away
0,1,1150,1190,1280,1330
1,2,1030,1060,1030,1050
2,3,1030,1070,1110,1180
3,4,1040,1160,1030,1130
4,5,1060,1130,1050,1050


In [38]:
ratings_df.rename(columns={'id':'opp'},inplace=True)

In [39]:
team_df=pd.merge(team_df,ratings_df,on="opp",how='left')

In [40]:
team_df.rename(columns={'strength_attack_home':'opp_att_h', 'strength_attack_away':'opp_att_a', 'strength_defence_home': 'opp_def_h', 'strength_defence_away':'opp_def_a'},inplace=True)

In [41]:
team_df["opp_att"]=np.where(team_df["home"],team_df["opp_att_a"],team_df["opp_att_h"])

In [42]:
ratings_df.rename(columns={'opp':'team'},inplace=True)

In [43]:
team_df=pd.merge(team_df,ratings_df,on="team",how='left')

In [44]:
team_df.rename(columns={'strength_attack_home':'att_h', 'strength_attack_away':'att_a', 'strength_defence_home': 'def_h', 'strength_defence_away':'def_a'},inplace=True)

In [45]:
team_df["def"]=np.where(team_df["home"],team_df["def_h"],team_df["def_a"])

In [46]:
team_df.drop(['opp_att_h','opp_def_h','opp_att_a','opp_def_a','att_h','att_a','def_h','def_a'],axis=1,inplace=True)

In [47]:
team_df.head()

Unnamed: 0,id,pos,team,opp,home,opp_att,def
0,131,1,7,18,1,1120,1040
1,48,1,4,20,0,1110,1130
2,258,2,13,16,0,1010,1030
3,407,2,20,4,1,1160,1080
4,182,2,10,3,1,1070,1340


Now merge the players average personal stats over last 5 gameweeks. This is a simple attempt at understanding a players current form rather than using their season average. A better system will be implemented in further development.

In [48]:
df_in.columns

Index(['id', 'fixture_id', 'G', 'A', 'CS', 'gw', 'MP', 'I', 'C', 'T', 'ICT',
       'team', 'opp', 'home', 'opp_att', 'opp_def', 'att', 'def', 'Sh90',
       'KP90'],
      dtype='object')

In [50]:
temp_df=df_in[['id','gw','MP','I','C','T','KP90','Sh90']].copy()

In [51]:
team_df['id'].tolist()

[131, 48, 258, 407, 182, 128, 65, 301, 191, 463, 171, 215, 409, 313, 187]

In [52]:
temp_df=temp_df[temp_df['id'].isin(team_df['id'].tolist())]

In [53]:
temp_df=temp_df[temp_df['gw']>22]

In [54]:
avgsts_df=temp_df.groupby('id',as_index=False)['MP','I','C','T','KP90','Sh90'].mean()

In [55]:
team_df=pd.merge(team_df,avgsts_df,on='id',how='left')

In [56]:
team_df.fillna(0,inplace=True)

In [57]:
team_df.head()

Unnamed: 0,id,pos,team,opp,home,opp_att,def,MP,I,C,T,KP90,Sh90
0,131,1,7,18,1,1120,1040,90.0,24.2,0.0,0.0,0.0,0.0
1,48,1,4,20,0,1110,1130,0.0,0.0,0.0,0.0,0.0,0.0
2,258,2,13,16,0,1010,1030,90.0,15.8,0.24,9.0,0.0,0.6
3,407,2,20,4,1,1160,1080,90.0,20.64,6.44,12.0,0.6,1.2
4,182,2,10,3,1,1070,1340,105.4,32.88,45.02,11.6,2.318,1.706


Predict target feature variables through ML pipelines.

In [58]:
team_df['G']=pipe_g.predict(team_df[['Sh90','I','T']])

In [59]:
team_df['CS']=pipe_cs.predict(team_df[['opp_att','def','home']])

In [60]:
team_df['A']=pipe_cs.predict(team_df[['KP90','I','C']])

In [61]:
team_df

Unnamed: 0,id,pos,team,opp,home,opp_att,def,MP,I,C,T,KP90,Sh90,G,CS,A
0,131,1,7,18,1,1120,1040,90.0,24.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,48,1,4,20,0,1110,1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,258,2,13,16,0,1010,1030,90.0,15.8,0.24,9.0,0.0,0.6,0.0,0.0,0.0
3,407,2,20,4,1,1160,1080,90.0,20.64,6.44,12.0,0.6,1.2,0.0,0.0,1.0
4,182,2,10,3,1,1070,1340,105.4,32.88,45.02,11.6,2.318,1.706,0.0,0.0,1.0
5,128,2,7,18,1,1120,1040,75.5,12.3,0.1,0.0,0.0,0.0,0.0,0.0,0.0
6,65,2,3,10,0,1300,1180,90.0,18.75,10.725,1.5,0.5,0.25,0.0,0.0,1.0
7,301,3,15,14,1,1000,1060,90.0,17.8,22.44,19.4,1.2,1.6,0.0,0.0,1.0
8,191,3,10,3,1,1070,1340,106.8,48.04,30.5,96.4,2.106,4.518,1.0,0.0,1.0
9,463,3,6,8,1,1190,1310,69.2,12.88,22.8,20.0,1.614,1.778,0.0,0.0,1.0


Distribute points for each player, based on predicted features.

In [62]:
team_df['Pts']=np.where(team_df['MP']>59,2,np.where(team_df['MP']>0,1,0))

In [63]:
team_df['Pts']=team_df['Pts']+np.where(team_df['CS']==1,np.where(np.logical_or(team_df['pos']==1,team_df['pos']==2),4,np.where(team_df['pos']==3,1,0)),0)

In [64]:
team_df['Pts']=team_df['Pts']+np.where(team_df['A']>0,team_df['A']*3,0)

In [65]:
team_df['Pts']=team_df['Pts']+np.where(team_df['G']>0,np.where(np.logical_or(team_df['pos']==1,team_df['pos']==2),6*team_df['G'],np.where(team_df['pos']==3,team_df['G']*5,team_df['G']*4)),0)

Output team dataframe showing points for each player.

In [66]:
team_df

Unnamed: 0,id,pos,team,opp,home,opp_att,def,MP,I,C,T,KP90,Sh90,G,CS,A,Pts
0,131,1,7,18,1,1120,1040,90.0,24.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,48,1,4,20,0,1110,1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,258,2,13,16,0,1010,1030,90.0,15.8,0.24,9.0,0.0,0.6,0.0,0.0,0.0,2.0
3,407,2,20,4,1,1160,1080,90.0,20.64,6.44,12.0,0.6,1.2,0.0,0.0,1.0,5.0
4,182,2,10,3,1,1070,1340,105.4,32.88,45.02,11.6,2.318,1.706,0.0,0.0,1.0,5.0
5,128,2,7,18,1,1120,1040,75.5,12.3,0.1,0.0,0.0,0.0,0.0,0.0,0.0,2.0
6,65,2,3,10,0,1300,1180,90.0,18.75,10.725,1.5,0.5,0.25,0.0,0.0,1.0,5.0
7,301,3,15,14,1,1000,1060,90.0,17.8,22.44,19.4,1.2,1.6,0.0,0.0,1.0,5.0
8,191,3,10,3,1,1070,1340,106.8,48.04,30.5,96.4,2.106,4.518,1.0,0.0,1.0,10.0
9,463,3,6,8,1,1190,1310,69.2,12.88,22.8,20.0,1.614,1.778,0.0,0.0,1.0,5.0


output players to captain and vice captain, and total team points for gameweek

In [67]:
print('Pts: ',team_df['Pts'].sum(),'\n','Captain and Vice Captain: ',team_df.nlargest(2,'Pts')['id'].tolist())

Pts:  75.0 
 Captain and Vice Captain:  [191, 215]
