# Game week points prediction model

In [1]:
import pandas as pd
import requests
import os
import json
import datetime
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

## Load and clean fixture dataset

In [2]:
def fetch_fixtures(timeout=10):
    """Download the fixture list from the Fantasy Premier League API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON payload of the fixtures endpoint.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    url = 'https://fantasy.premierleague.com/api/fixtures/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to decode an error page.
    response.raise_for_status()
    return response.json()

def fetch_teams(timeout=10):
    """Download the team id -> team name lookup from the FPL bootstrap endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        dict: Maps each entry's ``id`` to its ``name`` for every item under the
        ``teams`` key of the response.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the payload has no ``teams`` key.
    """
    url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to decode an error page.
    response.raise_for_status()
    data = response.json()
    return {team['id']: team['name'] for team in data['teams']}

# Pull the raw fixture list and the team id -> name lookup from the API.
fixtures = fetch_fixtures()
teams = fetch_teams()

fixtures_df = pd.DataFrame(fixtures)
teams_df = pd.DataFrame(list(teams.items()), columns=['team_id', 'team'])

# Replace numeric team ids with team names for the home and away sides.
fixtures_df['team_h'] = fixtures_df['team_h'].map(teams)
fixtures_df['team_a'] = fixtures_df['team_a'].map(teams)

# Mean difficulty rating per team, separately for its home and its away fixtures.
team_h_difficulty = fixtures_df.groupby('team_h')['team_h_difficulty'].mean().reset_index()
team_a_difficulty = fixtures_df.groupby('team_a')['team_a_difficulty'].mean().reset_index()

team_h_difficulty.columns = ['team', 'avg_home_difficulty']
team_a_difficulty.columns = ['team', 'avg_away_difficulty']

# Outer join keeps a team even if it appears on only one side of the fixtures.
team_difficulty = pd.merge(team_h_difficulty, team_a_difficulty, on='team', how='outer')

# Overall difficulty is the mean of the home and away averages (NaN-skipping).
team_difficulty['avg_difficulty'] = team_difficulty[['avg_home_difficulty', 'avg_away_difficulty']].mean(axis=1)

# Re-attach the numeric team_id so the result can be keyed by id as well as name.
team_difficulty = team_difficulty.merge(teams_df, on="team")

# Lookup keyed by numeric team_id.
team_difficulty_mapping = team_difficulty.set_index('team_id')['avg_difficulty'].to_dict()

# fixtures_df['team_a'] holds team *names* at this point (ids were replaced
# above), so it must be mapped through a name-keyed lookup; the id-keyed
# dict would match nothing and leave the whole column NaN.
team_difficulty_by_name = team_difficulty.set_index('team')['avg_difficulty']
fixtures_df['opponent_team_difficulty'] = fixtures_df['team_a'].map(team_difficulty_by_name)

## Remove columns not available at test time

In [None]:
# Columns removed from the per-gameweek data before modelling
# (per the section heading: not available at test time).
columns_to_drop = [
    'goals_scored',
    'assists',
    'clean_sheets',
    'goals_conceded',
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
    'team_a_score',
    'team_h_score',
    'yellow_cards',
    'round',
    'kickoff_time',
    'selected',
    'transfers_in',
    'transfers_out',
    'expected_assists',
    'expected_goal_involvements',
    'expected_goals',
    'expected_goals_conceded',
    'creativity',
    'influence',
    'bonus',
    'bps',
    'minutes',
    'xP',
    'element',
    'fixture',
    'threat',
]

## Scale up
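Merge all 38 game week datasets (the merge code below is commented out; the pre-built `data/cleaned_gw_all.csv` is loaded instead).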

In [None]:
# gw_df_list = []
# for gw in range(1,39):
#     player_data = pd.read_csv(f'gw{gw}.csv')
#     gw_df_list.append(player_data)

# gw_all = pd.concat(gw_df_list)

In [None]:
# gw_all_with_difficulty = pd.merge(gw_all, team_difficulty, left_on='team', right_on='team', how='left')
# gw_all_with_difficulty['opponent_team_difficulty'] = gw_all_with_difficulty['opponent_team'].map(team_difficulty_mapping)
# gw_all_with_difficulty.to_csv('gw_all_with_difficulty.csv', index=False)
# cleaned_gw_all = gw_all_with_difficulty.drop(columns=columns_to_drop)

# Load the prepared dataset from disk instead of rebuilding it.
# NOTE(review): presumably this CSV is the output of the commented-out
# pipeline above -- confirm how data/cleaned_gw_all.csv was produced.
cleaned_gw_all = pd.read_csv("data/cleaned_gw_all.csv")

## Exploratory data analysis

In [None]:
# Histogram of total_points (20 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
points_ax = sns.histplot(cleaned_gw_all['total_points'], bins=20, kde=True)
points_ax.set_xlabel('Total Points')
points_ax.set_ylabel('Frequency')
points_ax.set_title('Distribution of Total Points')
points_ax.grid(True)
plt.show()

In [None]:
# Box plot of ict_index grouped by position.
plt.figure(figsize=(12, 8))
ict_ax = sns.boxplot(x='position', y='ict_index', data=cleaned_gw_all)
ict_ax.set(
    xlabel='Position',
    ylabel='ICT Index',
    title='ICT Index Distribution by Position',
)
ict_ax.grid(True)
plt.show()


In [None]:
# Mean total_points per team, sorted ascending so the bars rise left to right.
team_avg_points = cleaned_gw_all.groupby('team')['total_points'].mean().sort_values()

plt.figure(figsize=(12, 8))
team_ax = sns.barplot(x=team_avg_points.index, y=team_avg_points.values, palette='viridis')
team_ax.set_xlabel('Team')
team_ax.set_ylabel('Average Total Points')
team_ax.set_title('Average Total Points by Team')
# Tilt the team labels so they do not overlap.
plt.xticks(rotation=45)
team_ax.grid(True)
plt.show()


In [None]:
# Scatter of total_points against avg_difficulty, with a red regression line
# drawn on the same axes (regplot's own scatter layer is switched off).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='avg_difficulty', y='total_points', data=cleaned_gw_all, ax=ax)
sns.regplot(x='avg_difficulty', y='total_points', data=cleaned_gw_all,
            scatter=False, color='red', ax=ax)
ax.set_xlabel('Average Difficulty')
ax.set_ylabel('Total Points')
ax.set_title('Total Points vs Average Difficulty')
ax.grid(True)
plt.show()

In [None]:
# Box plot of total_points split by the was_home flag.
plt.figure(figsize=(10, 6))
home_ax = sns.boxplot(x='was_home', y='total_points', data=cleaned_gw_all)
home_ax.set(
    xlabel='Was Home',
    ylabel='Total Points',
    title='Total Points Distribution: Home vs Away',
)
home_ax.grid(True)
plt.show()


In [None]:
# Pairwise correlations between total_points and the columns listed below,
# drawn as an annotated heatmap.
heatmap_columns = [
    'total_points', 'ict_index', 'transfers_balance', 'value',
    'avg_home_difficulty', 'avg_away_difficulty', 'avg_difficulty',
    'opponent_team_difficulty',
]
corr_matrix = cleaned_gw_all[heatmap_columns].corr()

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Box plot of the value column grouped by position.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='position', y='value', data=cleaned_gw_all, ax=ax)
ax.set_xlabel('Position')
ax.set_ylabel('Value')
ax.set_title('Distribution of Player Value by Position')
ax.grid(True)
plt.show()

In [None]:
# Scatter of total_points against transfers_balance.
plt.figure(figsize=(10, 6))
transfers_ax = sns.scatterplot(x='transfers_balance', y='total_points', data=cleaned_gw_all)
transfers_ax.set(
    xlabel='Transfers Balance',
    ylabel='Total Points',
    title='Total Points vs Transfers Balance',
)
transfers_ax.grid(True)
plt.show()


## Data preprocessing and modelling

In [None]:
# Display the column names of the loaded dataset before building features.
cleaned_gw_all.columns

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Drop incomplete rows BEFORE encoding. Once 'position'/'team' are replaced by
# integer codes, a missing category is no longer NaN, so a later dropna()
# could not remove that row. The explicit copy makes the column assignments
# below write to an independent frame.
cleaned_gw_all = cleaned_gw_all.dropna().copy()

# Integer-encode the categorical columns; the fitted encoders are kept so the
# codes can be translated back to the original labels later.
label_encoders = {}
for column in ['position', 'team']:
    le = LabelEncoder()
    cleaned_gw_all[column] = le.fit_transform(cleaned_gw_all[column])
    label_encoders[column] = le

# Features are every remaining column except the player name and the target.
# NOTE(review): 'ict_index' stays in X even though 'influence', 'creativity'
# and 'threat' are listed in columns_to_drop as unavailable at test time --
# confirm ict_index is known before kickoff, otherwise it leaks the target.
X = cleaned_gw_all.drop(['name', 'total_points'], axis=1)
y = cleaned_gw_all['total_points']

# Random 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split.
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# Display the actual total_points values of the test split.
y_test

In [None]:
# Display the model's predictions for the test split.
y_pred

In [None]:
# Actual vs predicted total points on the test split. Points on the dashed red
# diagonal are perfect predictions.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)  # Diagonal line
plt.xlabel('Actual Total Points')
plt.ylabel('Predicted Total Points')
plt.title('Actual vs Predicted Total Points')
plt.grid(True)
# Render explicitly, as every other plot cell does, so the figure also appears
# when this code runs outside an inline-plotting notebook.
plt.show()

In [None]:
# Importance score of each feature as reported by the fitted forest.
feature_importances = rf_model.feature_importances_

# Pair every feature name with its score, most important first.
feature_importances_df = (
    pd.DataFrame({'feature': X.columns, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(12, 8))
importance_ax = sns.barplot(x='importance', y='feature', data=feature_importances_df)
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importance for Predicting Total Points')
plt.show()