In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA

# Data Extraction Class

In [2]:
class xFormBase:
    def __init__(self, gw, season, form_range=4):
        self.gw = gw
        self.range = form_range
        self.season = season
        self.form_data = self.get_form_data()

    def get_form_data(self):
        data = pd.DataFrame()
        for i in range(self.range):
            url = self.get_url(i + 1)
            temp = pd.read_csv(url)
            temp = temp[['name', 'minutes', 'value', 'position', 'team', 'selected', 'bonus', 'threat', 'creativity',
                         'influence', 'xP', 'bps', 'ict_index', 'expected_goal_involvements', 'total_points',
                         'expected_goals', 'expected_assists', 'saves', 'expected_goals_conceded']]
            temp = temp.set_index('name').groupby(['name', 'position', 'team']).mean()
            data = pd.concat([data, temp])
        data = data.groupby(['name', 'position', 'team']).mean().reset_index(['position'])
        return data

    def get_url(self, lag):
        return 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/20' \
               + str(self.season) + '-' + str(self.season + 1) + '/gws/gw' + str(self.gw - lag) + '.csv'
    
    def calculate_team_defence(self): 
        data = []
        for n in range(self.range):
            temp = pd.read_csv(self.get_url(n+1))
            temp = temp[['team', 'expected_goals_conceded', 'minutes']]
            temp = temp[temp.minutes == 90]
            temp = temp.drop('minutes', axis=1).groupby('team').mean()
            data.append(temp) 
        return pd.concat([data[i] for i in range(self.range)]).groupby('team').mean().expected_goals_conceded
            
    def calculate_team_attack(self):
        data = []
        for n in range(self.range):
            temp = pd.read_csv(self.get_url(n+1))
            temp = temp[['name','team', 'expected_goals']]
            temp = temp.groupby(['name', 'team']).mean()
            temp = temp.groupby(['team']).sum()
            data.append(temp)   
        return pd.concat([data[i] for i in range(self.range)]).groupby('team').mean().expected_goals

class xFormTrain(xFormBase):
    def __init__(self, gw, season):
        super().__init__(gw, season)
        self.gw_data = self.get_gw_data()
        self.df = self.join_and_refine()

    def get_gw_data(self):
        url = self.get_url(0)
        data = pd.read_csv(url)
        data = data[['name', 'team', 'was_home', 'total_points', 'opponent_team']]
        data = data.rename(columns={'total_points': 'points_scored'})
        team_defence = self.calculate_team_defence()
        team_attack = self.calculate_team_attack()
        data['opponent_defence'] = data.opponent_team.apply(lambda x: team_defence[x - 1])
        data['opponent_attack'] = data.opponent_team.apply(lambda x: team_attack[x - 1])
        data['team_defence'] = data.team.apply(lambda x: team_defence[x])
        data = data.drop(['opponent_team'], axis=1).groupby(['name', 'team']).mean()
        return data

    def join_and_refine(self):
        df = self.form_data.join(self.gw_data, on=['name', 'team'])
        df_refined = df[(df.minutes > 60)]
        df_refined = df_refined[(df_refined.expected_goals + df_refined.expected_assists) > 0.0]
        df_refined = df_refined[(df_refined.points_scored > 0)]
        df_refined = df_refined[df_refined.points_scored.isnull() == False]
                                
        return df_refined
                            


class xFormFixture(xFormBase):
    def __init__(self, gw, value, season=23):
        super().__init__(gw, season)
        self.value = value
        self.df = self.get_fixture_data()

    def get_fixture_data(self):
        fixtures = pd.read_csv('/kaggle/input/prem-fixtures/fixtures.csv') \
            [['Round Number', 'Home Team', 'Away Team']] \
            .replace({'Nottingham Forest': 'Forest'})
        fixtures = fixtures[(fixtures['Round Number'] == self.gw)]
        form = self.form_data.reset_index(['team']).replace({'Nott\'m Forest': 'Forest'})
        dictionary = {}
        for i, team in enumerate(form.team):
            if (fixtures['Home Team'] == team).any():
                home = 1
                oppo = fixtures[(fixtures['Home Team'] == team)]['Away Team'].item()
                dictionary[i] = []
                dictionary[i].append(home)
                dictionary[i].append(oppo)
            else:
                home = 0
                oppo = fixtures[(fixtures['Away Team'] == team)]['Home Team'].item()
                dictionary[i] = []
                dictionary[i].append(home)
                dictionary[i].append(oppo)
        form.set_index('team')
        df = pd.DataFrame(dictionary)
        df = df.transpose().set_index(form.index)
        df.columns = ['was_home', 'oppo']
        data = form.join(df, on=['name'])
        data = data.replace({'Forest': 'Nott\'m Forest'})
        team_defence = self.calculate_team_defence()
        team_attack = self.calculate_team_attack()
        data['opponent_defence'] = data.oppo.apply(lambda x: team_defence[x])
        data['opponent_attack'] = data.oppo.apply(lambda x: team_attack[x])
        data['team_defence'] = data.team.apply(lambda x: team_defence[x])
        data = data.drop(['oppo', 'team'], axis=1)
        data = data[data.value < self.value]
        data = data[(data.minutes > 60)]
        data = data[(data.expected_goals + data.expected_assists) > 0.0]
        return data

# Create Dataset

In [3]:
def get_dataset(position, data, pred=False):
    if pred:
        if position == 'GK':
            pos_data = data.loc[data.position == position][['xP', 'saves', 'opponent_attack', 'team_defence', 'was_home']]
        elif position == 'DEF':
            pos_data = data.loc[data.position == position][['xP', 'opponent_attack','expected_goal_involvements','ict_index', 'team_defence','total_points','was_home']]  
        else:
            pos_data = data.loc[data.position == position][['xP', 'bps', 'expected_goal_involvements', 'ict_index', 'opponent_defence', 'was_home']]
    else:
        if position == 'GK':
            pos_data = data.loc[data.position == position][['xP', 'saves', 'opponent_attack', 'team_defence', 'was_home', 'points_scored']]
        elif position == 'DEF':
            pos_data = data.loc[data.position == position][['xP', 'opponent_attack','expected_goal_involvements','ict_index', 'team_defence','total_points','was_home', 'points_scored']]  
        else:
            pos_data = data.loc[data.position == position][['xP', 'bps', 'expected_goal_involvements', 'ict_index', 'opponent_defence', 'was_home', 'points_scored']]
    X = pos_data
    X_dummies = pd.get_dummies(X['was_home'], prefix='was_home')
    if not pred: 
        column_mapping = {
        'was_home_1.0': 'home',
        'was_home_0.0': 'away',
        'was_home_0.5' : 'both'
        }
        X_dummies = X_dummies.rename(columns=column_mapping)
        X = pd.concat([X, X_dummies], axis=1).drop(['was_home','both'], axis=1)
    else:
        column_mapping = {
        'was_home_1':'home',
        'was_home_0':'away'
        }
        X_dummies = X_dummies.rename(columns=column_mapping)
        X = pd.concat([X, X_dummies], axis=1).drop(['was_home'], axis=1)
        
    return X

In [4]:
data = xFormTrain(21, 22).df
for i in range(17):
    data = pd.concat([data, xFormTrain(22+i, 22).df])
for i in range(5):
    data = pd.concat([data, xFormTrain(5+i, 23).df])

In [5]:
X = get_dataset('DEF', data)
y = X.pop('points_scored')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
len(X)

1149

# Fit the Model

In [7]:
def fit_and_test(X_train, X_val, Y_train, Y_val, model): 
    model.fit(X_train, Y_train)
    predictions = pd.DataFrame(model.predict(X_val), index=X_val.index, columns=['xP'])
    return mean_absolute_error(predictions, Y_val)
    
def fit_pipe(X, y, transformer, model):
    pipe = Pipeline(steps=[('preprocessor', transformer),
                          ('regressor', model)])
    scores = -1 * cross_val_score(pipe, X, y, cv=3, scoring='neg_mean_absolute_error') 
    score = scores.mean()
    print(f'Cross Val Score {score}')
    pipe.fit(X, y)
    return pipe

In [8]:
model = Ridge()

In [9]:
fit_and_test(X_train, X_test, y_train, y_test, model)

2.3310142013508077

In [10]:
pipe = fit_pipe(X, y, None, model)

Cross Val Score 2.381635758085005


# Pricipal Component Analysis

In [11]:
def pc_transform(X, pca, ss):
    X_pca = pca.transform(ss.transform(X))
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    return X_pca

def pc_fit_transform(X):
    ss = StandardScaler()
    pca = PCA()
    X_pca = pca.fit_transform(ss.fit_transform(X))
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names, 
        index=X.columns,
    )
    return ss, pca, X_pca, loadings

In [12]:
ss, pca, X_pca, loadings = pc_fit_transform(X)
X_pca = X_pca.loc[:, :'PC7']
fit_pipe(X_pca, y, None, model)

Cross Val Score 2.38127191807989


# Predictions

In [13]:
teams = ['Arsenal', 'Aston Villa','Bournemouth','Brentford', 
         'Brighton', 'Burnley', 'Chelsea','Crystal Palace',
         'Everton','Fulham','Liverpool','Luton', 'Man City',
         'Man Utd', 'Newcastle','Forest', 
         'Sheffield Utd','Spurs','West Ham', 'Wolves']

In [14]:
pred_data = xFormFixture(gw=10, value=550).df

In [15]:
X_pred = get_dataset('FWD', pred_data, pred=True)

In [16]:
pred = pd.DataFrame(pipe.predict(X_pred), index=X_pred.index, columns=['xP'])

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- away
- bps
- expected_goal_involvements
- home
- ict_index
- ...
Feature names seen at fit time, yet now missing:
- PC1
- PC2
- PC3
- PC4
- PC5
- ...


In [None]:
pred.sort_values(by=['xP'],ascending=False)