# Premier League 2024/25 EDA

This notebook performs exploratory data analysis on the English Premier League 2024/25 season dataset. We inspect the raw match data, engineer features using an Elo rating system and recent form statistics, and visualize distributions and correlations.


In [None]:

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.data_preprocessing import create_feature_df, parse_score

# Configure matplotlib for Jupyter
%matplotlib inline


In [None]:

# Read the season's raw match results from CSV. `raw_path` is relative to the
# notebook's working directory and is reused later for feature engineering.
raw_path = '../data/premier_league_2024_2025.csv'
df = pd.read_csv(filepath_or_buffer=raw_path)

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)


In [None]:

# Dataset overview: how many matches were loaded and which columns exist.
print('Number of matches:', len(df))
print('Columns:', df.columns.tolist())

# Parse each 'Result' entry into (home goals, away goals) and label the match
# from the home side's perspective.
# NOTE(review): assumes parse_score returns a comparable (home, away) pair for
# every row -- confirm how unplayed or malformed results are handled.
scores = [parse_score(result) for result in df['Result']]
outcomes = [
    'Home Win' if home > away else ('Draw' if home == away else 'Away Win')
    for home, away in scores
]

# Distribution of outcomes (kept as a plain list above so the plotting cell
# can consume `outcomes` directly).
pd.Series(outcomes).value_counts()


In [None]:

# Bar chart of outcome frequencies, drawn on an explicit Axes rather than the
# implicit pyplot state. A fixed category order keeps the x-axis layout stable
# regardless of which outcome happens to appear first in the data.
fig, ax = plt.subplots()
sns.countplot(x=outcomes, order=['Home Win', 'Draw', 'Away Win'], ax=ax)
ax.set_title('Distribution of Match Outcomes (2024/25 Season)')
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
plt.show()


In [None]:

# Run the project's feature-engineering helper on the same CSV path used for
# the raw load. It yields four objects, unpacked in the order returned.
# NOTE(review): `features` is presumably a DataFrame (it is previewed with
# .head() here) -- confirm against src.data_preprocessing.
(
    features,
    target,
    feature_names,
    label_map,
) = create_feature_df(raw_path)

print('Engineered feature columns:', feature_names)

# Preview the engineered feature rows.
features.head()


In [None]:

# Pairwise correlation heatmap of the engineered features.
# vmin/vmax pin the colour scale to the full correlation range [-1, 1], so the
# neutral midpoint of the diverging 'coolwarm' palette corresponds to zero
# correlation instead of drifting with the observed min/max.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    features.corr(),
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',  # two decimals keeps the cell annotations compact and uniform
    ax=ax,
)
ax.set_title('Correlation Matrix of Engineered Features')
plt.show()
