In [114]:
import pandas as pd
import sqlite3
import sys
sys.path.append("..")

In [115]:
color_map = pd.read_csv("../src/fpl/pipelines/model_pipeline/team_mapping.csv")
color_map = color_map.set_index("FBREF_NAME")["PRIMARY_COLOR"].to_dict()

In [116]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def plot_elos(team_df, x_axis, y_axis, color_by, tooltips):
    fig = make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=x_axis, y_title=y_axis,)
    for idx, axis in enumerate(x_axis):
        fig.add_trace(
            go.Scatter(x=team_df[axis], 
                       y=team_df[y_axis], 
                       marker_color=team_df[color_by], 
                       text=team_df[tooltips].apply(lambda x: '<br>'.join(str(x)), axis=1),
                       showlegend=False,
                       ),
            row=1, col=idx+1)
    fig.update_traces(mode='markers')
    fig.update_layout(height=400, width=1200)
    fig.show()

In [117]:
with sqlite3.connect('../data/fpl.db') as con:
    processed_data = pd.read_sql_query('select * from "02_PROCESSED_DATA"', con)
team_df = processed_data[processed_data["TEAM"]=="Manchester City"].copy()
team_df.loc[team_df.index,"OPPONENT_COLOR"] = team_df["OPPONENT"].map(lambda x: color_map[x])
plot_elos(team_df, ["ATT_TOTAL", "HOME_ATT_TOTAL", "AWAY_ATT_TOTAL"], "XG", "OPPONENT_COLOR", ["OPPONENT", "SEASON", "ROUND"])

In [118]:
import seaborn as sns
color_pal = sns.color_palette()
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [119]:
copy_df = train_data[["ATT_TOTAL", "XG"]].copy()
copy_df['ATT_TOTAL_bin'] = pd.cut(copy_df['ATT_TOTAL'], bins=np.arange(0,4, 0.5))
plt.figure(figsize=(12, 6))
sns.boxplot(x='ATT_TOTAL_bin', y='XG', data=copy_df)
plt.show()

NameError: name 'train_data' is not defined

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import numpy as np

numerical_features = ['ROUND', 'POSS', 'DAYS_TILL_NEXT', 'DAYS_SINCE_LAST', 'ATT_TOTAL', 'HOME_ATT_TOTAL', 'AWAY_ATT_TOTAL'] 
# 'DEF_TOTAL', 'HOME_DEF_TOTAL', 'AWAY_DEF_TOTAL', 'DATE', 'XGA', ]
categorical_features = ['VENUE', 'TEAM', 'OPPONENT']
target = "XG"

train_val_data = processed_data[processed_data['SEASON'] < "2021-2022"]
X_train_val = train_val_data[numerical_features+categorical_features]
y_train_val = train_val_data[target]

holdout_data = processed_data[processed_data['SEASON'] >= "2021-2022"]
X_holdout  = holdout_data[numerical_features+categorical_features]
y_holdout  = holdout_data[target]

groups = train_val_data["SEASON"]
n_splits = groups.nunique()
print(f"{groups.unique() = }")
group_kfold = GroupKFold(n_splits=n_splits)

X_train_val_cat = X_train_val[categorical_features]
categories = [np.append(X_train_val_cat[col].unique(), 'Unknown') for col in X_train_val_cat.columns]
encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', categories=categories, min_frequency=1)
encoder.fit(X_train_val_cat)
encoded_cat_cols = encoder.get_feature_names_out(input_features=categorical_features)


model = XGBRegressor(
    base_score=0.5,
    n_estimators=1000,
    early_stopping_rounds=50,
    objective ='reg:squarederror', 
    learning_rate=0.01,
    eval_metric='mae', 
    seed=42
    )

cross_val_scores = []
for train_index, val_index in group_kfold.split(X_train_val, y_train_val, groups):
    X_train_cat, X_val_cat = X_train_val.iloc[train_index][categorical_features], X_train_val.iloc[val_index][categorical_features]
    X_train_num, X_val_num = X_train_val.iloc[train_index][numerical_features], X_train_val.iloc[val_index][numerical_features]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    X_train_encoded = np.hstack([
        encoder.transform(X_train_cat).toarray(),
        X_train_num
    ])
    X_val_encoded = np.hstack([
        encoder.transform(X_val_cat).toarray(),
        X_val_num
    ])

    model.fit(
        X_train_encoded, y_train,
        eval_set=[(X_train_encoded, y_train), (X_val_encoded, y_val)],
        verbose=100
        )

    val_predictions = model.predict(X_val_encoded)
    val_accuracy = mean_squared_error(y_val, val_predictions)
    cross_val_scores.append(val_accuracy)


avg_cv_accuracy = sum(cross_val_scores) / n_splits
print(f'Average cross-validation accuracy: {avg_cv_accuracy}')
print(cross_val_scores)

In [None]:
fi = pd.DataFrame(data=model.feature_importances_,
             index=encoded_cat_cols.tolist()+numerical_features,
             columns=['importance'])
fi = fi.sort_values(by='importance', ascending=False).head(10)
fi.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()

In [None]:
X_holdout_cat = X_holdout[categorical_features]
X_holdout_num = X_holdout[numerical_features]

X_holdout_encoded = np.hstack([
    encoder.transform(X_holdout_cat).toarray(),
    X_holdout_num
])
holdout_predictions = model.predict(X_holdout_encoded)


baseline_columns = ['XG_MA', "TEAM_ODDS_2_SCORE", 'ATT_TOTAL']
output_cols = list(set(["index"]+numerical_features+categorical_features+[target]+baseline_columns))
output_df = holdout_data[output_cols].copy()
eval_cols = ["prediction"]+baseline_columns
output_df["prediction"] = holdout_predictions

fig, axes = plt.subplots(nrows=1, ncols=len(eval_cols), figsize=(20, 5), sharey=True)

for i, col in enumerate(eval_cols):
    output_df[f"{col}_error"] = output_df[col] - output_df[target]
    output_df[f"{col}_error"].hist(ax=axes[i], bins=np.arange(-3.5, 3.5, 0.1), color=color_pal[i])
    mae = output_df[f"{col}_error"].abs().mean()
    axes[i].set_xlabel(f"{col} MAE: {mae:.2f}")
output_df.head()
plt.subplots_adjust(wspace=0.1)
plt.show()