<a href="https://www.kaggle.com/code/oscarfraley/xfantasy?scriptVersionId=154725893" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import seaborn as sns

# Data Extraction Class

- All data is acquired from https://github.com/vaastav/Fantasy-Premier-League.
- Raw data is reshaped into a dataframe in which each row is a player in a specific gameweek. The features contain averages of the player's recent form (xG, xA, ...), information about the particular fixture (the opposition's average xGC form, home/away), and the points the player actually scored in that gameweek (the target).

In [2]:
class xFormBase:
    """Load recent-form statistics from the vaastav/Fantasy-Premier-League repository.

    On construction, each player's statistics are averaged over the
    ``form_range`` gameweeks that precede ``gw`` and stored in ``form_data``.

    Parameters
    ----------
    gw : int
        Target gameweek. Form is computed from gameweeks ``gw - 1`` down to
        ``gw - form_range``.
    season : int
        Two-digit starting year of the season; it is interpolated into the
        URL as ``20<season>-<season + 1>`` (e.g. ``22`` -> ``2022-23``).
    form_range : int
        Number of preceding gameweeks to average over.

    Attributes
    ----------
    form_data : pandas.DataFrame
        Indexed by ``(name, team)``, with a ``position`` column followed by
        the averaged numeric columns.
    """

    # Columns kept from each raw gameweek file when building the form table.
    _FORM_COLUMNS = ['name', 'minutes', 'value', 'position', 'team', 'selected', 'bonus', 'threat', 'creativity',
                     'influence', 'xP', 'bps', 'ict_index', 'expected_goal_involvements', 'total_points',
                     'expected_goals', 'expected_assists', 'saves', 'expected_goals_conceded']

    def __init__(self, gw, season, form_range):
        self.gw = gw
        self.range = form_range
        self.season = season
        # Raw gameweek frames keyed by lag, so each file is downloaded once per instance.
        self._gw_cache = {}
        self.form_data = self.get_form_data()

    def _read_gw(self, lag):
        """Return the raw gameweek frame ``lag`` weeks before ``self.gw``.

        The frame is fetched with ``pd.read_csv`` on first use and then reused;
        callers only select from it and never modify it in place.
        """
        # setdefault keeps this usable on instances created without __init__.
        cache = self.__dict__.setdefault('_gw_cache', {})
        if lag not in cache:
            cache[lag] = pd.read_csv(self.get_url(lag))
        return cache[lag]

    def get_form_data(self):
        """Average every player's stats over the form window.

        Each gameweek is first averaged per ``(name, position, team)`` (a player
        can have several rows in one file), then those weekly means are
        averaged across the window.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``(name, team)`` with ``position`` as the first column.
        """
        group_keys = ['name', 'position', 'team']
        weekly = []
        for lag in range(1, self.range + 1):
            week = self._read_gw(lag)[self._FORM_COLUMNS]
            weekly.append(week.groupby(group_keys).mean())
        # Concatenate once instead of growing a frame inside the loop.
        form = pd.concat(weekly).groupby(group_keys).mean()
        return form.reset_index(['position'])

    def get_url(self, lag):
        """Build the raw-file URL of the gameweek ``lag`` weeks before ``self.gw``."""
        return 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/20' \
               + str(self.season) + '-' + str(self.season + 1) + '/gws/gw' + str(self.gw - lag) + '.csv'

    def calculate_team_defence(self):
        """Mean ``expected_goals_conceded`` per team over the form window.

        Only rows with exactly 90 minutes are used. Teams with no such row in
        any gameweek of the window are absent from the result.

        Returns
        -------
        pandas.Series
            Indexed by team name.
        """
        weekly = []
        for lag in range(1, self.range + 1):
            week = self._read_gw(lag)[['team', 'expected_goals_conceded', 'minutes']]
            full_match = week[week.minutes == 90]
            weekly.append(full_match.drop('minutes', axis=1).groupby('team').mean())
        return pd.concat(weekly).groupby('team').mean().expected_goals_conceded

    def calculate_team_attack(self):
        """Mean per-gameweek team ``expected_goals`` over the form window.

        Within a gameweek each player's ``expected_goals`` is averaged over
        their rows, the player means are summed per team, and the weekly team
        totals are then averaged across the window.

        Returns
        -------
        pandas.Series
            Indexed by team name.
        """
        weekly = []
        for lag in range(1, self.range + 1):
            week = self._read_gw(lag)[['name', 'team', 'expected_goals']]
            per_player = week.groupby(['name', 'team']).mean()
            weekly.append(per_player.groupby(['team']).sum())
        return pd.concat(weekly).groupby('team').mean().expected_goals

class xFormTrain(xFormBase):
    """Build training rows for one gameweek.

    Pairs each player's form over the preceding ``form_range`` gameweeks with
    the fixture context and the points they actually scored in gameweek ``gw``.

    Attributes
    ----------
    gw_data : pandas.DataFrame
        Per ``(name, team)`` fixture context and ``points_scored`` for ``gw``.
    df : pandas.DataFrame
        ``form_data`` joined with ``gw_data`` and filtered (see
        ``join_and_refine``).
    """

    def __init__(self, gw, season, form_range):
        super().__init__(gw, season, form_range)
        self.gw_data = self.get_gw_data()
        self.df = self.join_and_refine()

    def get_gw_data(self):
        """Load the target gameweek and attach opponent/team strength features.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``(name, team)`` with the mean of ``was_home``,
            ``points_scored``, ``opponent_defence``, ``opponent_attack`` and
            ``team_defence`` over the player's rows in that gameweek.
        """
        data = pd.read_csv(self.get_url(0))
        data = data[['name', 'team', 'was_home', 'total_points', 'opponent_team']]
        data = data.rename(columns={'total_points': 'points_scored'})
        team_defence = self.calculate_team_defence()
        team_attack = self.calculate_team_attack()
        # opponent_team is an integer, while both strength Series are labelled by
        # team name. The lookup is therefore positional; .iloc states that
        # explicitly instead of relying on the deprecated integer-key fallback of
        # Series.__getitem__.
        # NOTE(review): this assumes each Series has one entry per team and that
        # the sorted team names line up with the 1-based team ids. A team with no
        # 90-minute player in the window is missing from team_defence and would
        # shift every later position. Confirm against the data.
        data['opponent_defence'] = data.opponent_team.apply(lambda x: team_defence.iloc[x - 1])
        data['opponent_attack'] = data.opponent_team.apply(lambda x: team_attack.iloc[x - 1])
        data['team_defence'] = data.team.apply(lambda x: team_defence[x])
        # Averaging collapses multiple rows per player; was_home becomes a
        # fraction (e.g. 0.5) when the rows differ.
        data = data.drop(['opponent_team'], axis=1).groupby(['name', 'team']).mean()
        return data

    def join_and_refine(self):
        """Join form with gameweek data and keep the rows used for training.

        Keeps players whose average minutes over the form window exceed 60 and
        whose ``points_scored`` is strictly positive. Players with no row in the
        target gameweek have a missing ``points_scored`` after the join and are
        dropped by the same comparison, because ``NaN > 0`` is False.

        NOTE(review): excluding rows with ``points_scored <= 0`` removes the
        lowest outcomes from the training target. Confirm this is intended.
        """
        df = self.form_data.join(self.gw_data, on=['name', 'team'])
        keep = (df.minutes > 60) & (df.points_scored > 0)
        return df[keep]
                            
class xFormFixture(xFormBase):
    """Build prediction rows for a gameweek, using a separate fixtures file.

    Parameters
    ----------
    position : str
        Only players whose ``position`` equals this value are kept.
    gw : int
        Gameweek to predict; matched against ``Round Number`` in the fixtures
        file.
    form_range : int
        Number of preceding gameweeks used for form.
    value : number
        Exclusive upper bound on the player's averaged ``value`` column.
    season : int, default 23
        Two-digit starting year of the season.
    fixtures_path : str, default '/kaggle/input/prem-fixtures/fixtures.csv'
        Location of a CSV with ``Round Number``, ``Home Team`` and
        ``Away Team`` columns.

    Attributes
    ----------
    df : pandas.DataFrame
        Indexed by player name, with the form columns plus ``was_home``,
        ``opponent_defence``, ``opponent_attack`` and ``team_defence``.
    """

    def __init__(self, position, gw, form_range, value, season=23,
                 fixtures_path='/kaggle/input/prem-fixtures/fixtures.csv'):
        super().__init__(gw, season, form_range)
        self.value = value
        self.position = position
        self.fixtures_path = fixtures_path
        self.df = self.get_fixture_data()

    @staticmethod
    def _fixture_for(fixtures, team):
        """Return ``(was_home, opponent)`` for ``team`` in the round's fixtures.

        A home fixture takes precedence when one exists. Raises ``ValueError``
        when the team has no fixture in the round, or more than one on the side
        being read.
        """
        as_home = fixtures.loc[fixtures['Home Team'] == team, 'Away Team']
        if len(as_home) > 1:
            raise ValueError(f'{team!r} has {len(as_home)} home fixtures in this round')
        if len(as_home) == 1:
            return 1, as_home.iloc[0]
        as_away = fixtures.loc[fixtures['Away Team'] == team, 'Home Team']
        if len(as_away) != 1:
            raise ValueError(f'{team!r} has {len(as_away)} away fixtures and no home fixture in this round')
        return 0, as_away.iloc[0]

    def get_fixture_data(self):
        """Attach fixture context to the form table and filter it.

        Returns
        -------
        pandas.DataFrame
            Players with averaged ``value`` below ``self.value``, averaged
            ``minutes`` above 60 and ``position`` equal to ``self.position``.
        """
        fixtures = pd.read_csv(self.fixtures_path) \
            [['Round Number', 'Home Team', 'Away Team']] \
            .replace({'Nottingham Forest': 'Forest'})
        fixtures = fixtures[(fixtures['Round Number'] == self.gw)]
        # Both sources are mapped to the common short name 'Forest' while the
        # fixture is being matched.
        form = self.form_data.reset_index(['team']).replace({'Nott\'m Forest': 'Forest'})

        # Resolve each team once rather than rescanning the fixtures per player.
        by_team = {team: self._fixture_for(fixtures, team) for team in form.team.unique()}
        # Assign positionally, so players sharing a name cannot be duplicated
        # by an index join.
        data = form.assign(
            was_home=[by_team[team][0] for team in form.team],
            oppo=[by_team[team][1] for team in form.team],
        )

        # Restore the dataset's spelling, because the strength Series are keyed by it.
        data = data.replace({'Forest': 'Nott\'m Forest'})
        team_defence = self.calculate_team_defence()
        team_attack = self.calculate_team_attack()
        data['opponent_defence'] = data.oppo.apply(lambda x: team_defence[x])
        data['opponent_attack'] = data.oppo.apply(lambda x: team_attack[x])
        data['team_defence'] = data.team.apply(lambda x: team_defence[x])
        data = data.drop(['oppo', 'team'], axis=1)

        keep = (data.value < self.value) & (data.minutes > 60) & (data.position == self.position)
        return data[keep]

The two subclasses differ by only a few methods. xFormTrain extracts the dataset used for training. xFormFixture extracts data for an upcoming gameweek in order to make predictions; the caveat is that the online dataset doesn't contain information about future matches, so the fixtures are read from a separate file.

# Create Dataset

Now we construct the dataset by concatenating the output of multiple instances of the data extraction class.

In [3]:
# (season, gameweeks) windows used for training: gameweeks 21-38 of season 22
# and gameweeks 5-16 of season 23, each with a 4-gameweek form window.
TRAINING_WINDOWS = [(22, range(21, 39)), (23, range(5, 17))]
FORM_RANGE = 4

# Build every per-gameweek frame first and concatenate once, instead of
# re-copying the growing frame on every iteration.
data = pd.concat([
    xFormTrain(gw=gw, season=season, form_range=FORM_RANGE).df
    for season, gameweeks in TRAINING_WINDOWS
    for gw in gameweeks
])

In [4]:
# Preview the first five rows of the assembled training frame.
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,position,minutes,value,selected,bonus,threat,creativity,influence,xP,bps,...,total_points,expected_goals,expected_assists,saves,expected_goals_conceded,was_home,points_scored,opponent_defence,opponent_attack,team_defence
name,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aaron Ramsdale,Arsenal,GK,90.0,49.0,1326229.25,0.75,0.0,0.0,26.2,4.05,21.75,...,5.5,0.0,0.00371,3.5,1.208575,1.0,1.0,0.61307,2.14855,1.208575
Aaron Wan-Bissaka,Man Utd,DEF,84.5,43.0,56941.5,0.75,5.25,16.0125,15.2,7.5,23.0,...,5.75,0.004463,0.059899,0.0,0.589325,0.0,1.0,1.208575,1.9542,0.61307
Adam Smith,Bournemouth,DEF,60.5,44.0,26229.75,0.0,2.0,4.875,6.85,0.55,6.0,...,0.5,0.0,0.03007,0.0,1.11845,1.0,2.0,1.20575,1.366825,1.7278
Aleksandar Mitrović,Fulham,FWD,77.25,70.25,3074294.25,0.75,42.75,8.5125,20.275,8.05,10.375,...,4.375,0.754337,0.055465,0.0,1.006563,1.0,2.0,1.508383,1.016563,1.306787
Alex Iwobi,Everton,MID,90.0,55.0,107730.0,0.0,11.5,14.525,10.85,1.9,11.25,...,2.5,0.103975,0.03397,0.0,1.5487,0.0,2.0,1.511475,1.210325,1.533588


## Model Class

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

Different positions in FPL require different features to predict their scores. 

In [6]:
# Feature columns per position. 'was_home' is converted into separate
# home/away indicator columns by the model classes' get_dataset.
# Goalkeepers.
gk_features = ['bps', 'ict_index','saves','opponent_attack','team_defence','was_home']
# Defenders.
def_features = ['opponent_attack','opponent_defence','expected_goals', 'expected_assists','ict_index','team_defence','was_home', 'bps']
# Alternative feature set for the remaining positions, currently disabled:
#att_features = ['expected_goals', 'expected_assists', 'threat', 'creativity', 'influence', 'opponent_defence', 'opponent_attack', 'was_home']
# Used for every position other than 'GK' and 'DEF'.
att_features = ['total_points','expected_goal_involvements','was_home','opponent_defence', 'threat', 'opponent_attack', 'bps']

In [7]:
class model_class:
    """Position-specific points model that cross-validates and fits on construction.

    Unlike ``ModelClass`` defined later in the notebook, this class builds,
    scores and fits its pipeline inside ``__init__``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``position`` column, the position's feature
        columns and ``points_scored``.
    position : str
        ``'GK'`` and ``'DEF'`` select their own feature lists; any other value
        uses ``att_features``.
    regressor : estimator
        Final step of the pipeline.
    transformer : transformer or None, default None
        Preprocessing step of the pipeline.
    """

    def __init__(self, data, position, regressor, transformer=None):
        self.position = position
        self.data = data
        self.regressor = regressor
        self.transformer = transformer
        self.train_data = self.get_dataset()
        self.pipe = self.fit_pipe()

    def _build_pipeline(self):
        """Create a fresh, unfitted preprocessor + regressor pipeline."""
        return Pipeline(steps=[('preprocessor', self.transformer),
                               ('regressor', self.regressor)])

    def get_dataset(self, pred=False):
        """Select this position's rows and features, and encode ``was_home``.

        Parameters
        ----------
        pred : bool, default False
            When False, read ``self.data`` and keep ``points_scored``. When
            True, read ``self.pred_data`` (set by ``predictions``).

        Returns
        -------
        pandas.DataFrame
            The feature columns without ``was_home`` (plus ``points_scored``
            when training), followed by boolean ``away`` and ``home`` columns.
        """
        features = {'GK': gk_features, 'DEF': def_features}.get(self.position, att_features)
        if pred:
            source, columns = self.pred_data, features
        else:
            # Concatenate the lists. Wrapping the feature list in another list
            # would produce an invalid nested column selector.
            source, columns = self.data, features + ['points_scored']
        X = source.loc[source.position == self.position][columns]
        # Derive the flags by comparison, so both columns always exist in the
        # same order. Fractional was_home values (rows averaged over a home and
        # an away match) are neither home nor away.
        return X.assign(away=X['was_home'] == 0, home=X['was_home'] == 1).drop(columns='was_home')

    def cross_val_score(self):
        """Return the mean absolute error over 15-fold cross-validation."""
        # Work on a copy so the stored training frame keeps its target column.
        X = self.train_data.copy()
        y = X.pop('points_scored')
        scores = -1 * cross_val_score(self._build_pipeline(), X, y, cv=15,
                                      scoring='neg_mean_absolute_error')
        return scores.mean()

    def fit_pipe(self):
        """Print the cross-validated MAE, fit the pipeline on all rows and return it."""
        X = self.train_data.copy()
        y = X.pop('points_scored')
        pipe = self._build_pipeline()
        scores = -1 * cross_val_score(pipe, X, y, cv=15, scoring='neg_mean_absolute_error')
        score = scores.mean()
        print(f'Cross Val Score {score}')
        pipe.fit(X, y)
        return pipe

    def predictions(self, value, gw, form_range):
        """Predict points for gameweek ``gw`` and return them sorted descending.

        Parameters
        ----------
        value : number
            Passed to ``xFormFixture`` as the upper bound on player value.
        gw : int
            Gameweek to predict.
        form_range : int
            Number of preceding gameweeks used for form.

        Returns
        -------
        pandas.DataFrame
            One ``xP`` column indexed like the prediction features.
        """
        self.pred_data = xFormFixture(self.position, gw=gw, form_range=form_range, value=value).df
        self.X_pred = self.get_dataset(pred=True)
        pred = pd.DataFrame(self.pipe.predict(self.X_pred), index=self.X_pred.index, columns=['xP'])
        return pred.sort_values(by=['xP'], ascending=False)

In [8]:
class ModelClass:
    """Position-specific points model with explicit score / fit / predict steps.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a ``position`` column, the position's feature
        columns and ``points_scored``.
    position : str
        ``'GK'`` and ``'DEF'`` select their own feature lists; any other value
        uses ``att_features``.
    regressor : estimator
        Final step of the pipeline.
    transformer : transformer or None, default None
        Preprocessing step of the pipeline.

    Attributes
    ----------
    train_data : pandas.DataFrame
        Encoded training rows, including ``points_scored``.
    pipe : sklearn.pipeline.Pipeline
        Unfitted until ``fit_pipe`` is called.
    """

    def __init__(self, data, position, regressor, transformer=None):
        self.position = position
        self.data = data
        self.regressor = regressor
        self.transformer = transformer
        self.train_data = self.get_dataset()
        self.pipe = self._initialize_pipeline()

    def _initialize_pipeline(self):
        """Create a preprocessor + regressor pipeline from the stored steps."""
        return Pipeline(steps=[('preprocessor', self.transformer),
                               ('regressor', self.regressor)])

    def get_dataset(self, pred=False):
        """Select this position's rows and features, and encode ``was_home``.

        Parameters
        ----------
        pred : bool, default False
            When False, read ``self.data`` and keep ``points_scored``. When
            True, read ``self.pred_data`` (set by ``predictions``).

        Returns
        -------
        pandas.DataFrame
            The feature columns without ``was_home`` (plus ``points_scored``
            when training), followed by boolean ``away`` and ``home`` columns.
        """
        features = {'GK': gk_features, 'DEF': def_features}.get(self.position, att_features)
        if pred:
            source, columns = self.pred_data, features
        else:
            source, columns = self.data, features + ['points_scored']
        X = source.loc[source.position == self.position][columns]
        # Derive the flags by comparison rather than from whichever dummy
        # columns happen to be produced. This keeps 'away' and 'home' present
        # in the same order for training and prediction, and does not require
        # a 0.5 ("both") category to exist. Fractional was_home values are
        # neither home nor away.
        return X.assign(away=X['was_home'] == 0, home=X['was_home'] == 1).drop(columns='was_home')

    def cross_val_score(self):
        """Return the mean absolute error over 15-fold cross-validation (lower is better)."""
        X = self.train_data.copy()
        y = X.pop('points_scored')
        pipe = self._initialize_pipeline()
        scores = -1 * cross_val_score(pipe, X, y, cv=15, scoring='neg_mean_absolute_error')
        return scores.mean()

    def fit_pipe(self):
        """Fit ``self.pipe`` on all training rows."""
        X = self.train_data.copy()
        y = X.pop('points_scored')
        self.pipe.fit(X, y)

    def predictions(self, value, gw, form_range):
        """Predict points for gameweek ``gw`` and return them sorted descending.

        ``fit_pipe`` must have been called first.

        Parameters
        ----------
        value : number
            Passed to ``xFormFixture`` as the upper bound on player value.
        gw : int
            Gameweek to predict.
        form_range : int
            Number of preceding gameweeks used for form.

        Returns
        -------
        pandas.DataFrame
            One ``xP`` column indexed like the prediction features.
        """
        self.pred_data = xFormFixture(self.position, gw=gw, form_range=form_range, value=value).df
        self.X_pred = self.get_dataset(pred=True)
        pred = pd.DataFrame(self.pipe.predict(self.X_pred), index=self.X_pred.index, columns=['xP'])
        return pred.sort_values(by=['xP'], ascending=False)

## Fit and Predict

In [9]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [10]:
# Short names of 20 clubs. 'Forest' is the same shortened spelling that
# xFormFixture uses while matching fixtures. This list is not referenced by
# any other cell shown in this notebook.
teams = ['Arsenal', 'Aston Villa','Bournemouth','Brentford', 
         'Brighton', 'Burnley', 'Chelsea','Crystal Palace',
         'Everton','Fulham','Liverpool','Luton', 'Man City',
         'Man Utd', 'Newcastle','Forest', 
         'Sheffield Utd','Spurs','West Ham', 'Wolves']

In [11]:
# Candidate estimators and preprocessing steps; every regressor is tried with
# every transformer. None means the pipeline has no preprocessing step.
regressors = [ElasticNet(), Lasso(), Ridge()]
transformers = [StandardScaler(), PowerTransformer(), None]

In [12]:
# cross_val_score() returns a mean absolute error, so lower is better. Start
# from +inf, keep the minimum, and remember it so later candidates are
# compared against the best score seen so far.
best_score = float('inf')
best_model = None
for regressor in regressors:
    for transformer in transformers:
        model = ModelClass(data, 'DEF', regressor, transformer)
        score = model.cross_val_score()
        if score < best_score:
            best_score = score
            best_model = model
        print(f'Model: {regressor}, transformer: {transformer} got score: {score}')

# Fit only the winning pipeline, once, after all candidates have been scored.
best_model.fit_pipe()

Model: ElasticNet(), transformer: StandardScaler() got score: 2.4230414771838116
Model: ElasticNet(), transformer: PowerTransformer() got score: 2.4230414771838116
Model: ElasticNet(), transformer: None got score: 2.404868310057091
Model: Lasso(), transformer: StandardScaler() got score: 2.4230414771838116
Model: Lasso(), transformer: PowerTransformer() got score: 2.4230414771838116
Model: Lasso(), transformer: None got score: 2.4129184471758665
Model: Ridge(), transformer: StandardScaler() got score: 2.348831682020236
Model: Ridge(), transformer: PowerTransformer() got score: 2.357124872046315
Model: Ridge(), transformer: None got score: 2.3487467844432053


In [13]:
# Top 15 predicted scorers for gameweek 17 among players whose value is below
# 150, using a 4-gameweek form window.
best_model.predictions(150, 17,4).head(15)

Unnamed: 0_level_0,xP
name,Unnamed: 1_level_1
Trent Alexander-Arnold,5.48135
Marcos Senesi,4.945671
Adam Smith,4.71964
Milos Kerkez,4.509155
Illia Zabarnyi,4.400536
Kieran Trippier,4.230686
Oleksandr Zinchenko,4.137729
Pedro Porro,4.104536
Ben Mee,4.098292
Vladimír Coufal,4.068153


In [14]:
# Inspect the feature row that was passed to the model for one player.
best_model.X_pred.loc['Milos Kerkez']

opponent_attack      0.475
opponent_defence    2.2975
expected_goals        0.02
expected_assists      0.03
ict_index             2.95
team_defence        0.8725
bps                   18.0
away                 False
home                  True
Name: Milos Kerkez, dtype: object