In [1]:
import pandas as pd
import numpy as np 
import sklearn 
import pickle 
import os

import sys

sys.path.append('../../src/features')

import build_features

import warnings 
# Install a blanket 'ignore' filter: no category or module restriction is given,
# so every warning raised after this line is suppressed.
# NOTE(review): this presumably also hides deprecation / convergence warnings
# from the modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')



# Raw match log; the path is resolved relative to the current working directory.
raw_matches_csv = '../../data/raw/matches.csv'
matches = pd.read_csv(raw_matches_csv)

# Preview the first rows as a quick sanity check on the load.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like previously saved index columns — confirm they are removed before
# modelling so they cannot end up as predictors.
matches.head()

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
1,2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
2,3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
3,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
4,6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [None]:
# Show every column name of the raw frame; the head() preview elides the
# middle columns with '...'.
matches.columns

## EDA - Insights & Actionables 

In [None]:

# Two-stage preparation using the project's build_features module:
#   1. CleanDataset(matches).main()          -> cleaned_data
#   2. FeatureAddition(cleaned_data).main()  -> matches_processed
# NOTE(review): what each stage does is defined in build_features and is not
# visible here. Later cells rely on matches_processed exposing 'datetime' and
# 'target' columns, plus the columns that are dropped when building predictors.
DataCleaner = build_features.CleanDataset(matches)
cleaned_data = DataCleaner.main()
DataTransformer = build_features.FeatureAddition(cleaned_data)
matches_processed = DataTransformer.main()
# Preview the processed frame.
matches_processed.head()

In [None]:

# Columns of matches_processed that are excluded from the model inputs.
# 'target' is the label and is kept separately below.
non_predictor_columns = [
    'date', 'time', 'day', 'result', 'opponent', 'season', 'team',
    'pkr', 'pkatt_rolling', 'poss',
    'xg_rolling', 'xga_rolling', 'dist_rolling', 'sh_rolling', 'sot_rolling',
    'datetime', 'target',
]
predictors = matches_processed.drop(columns=non_predictor_columns)

# NOTE(review): scaling is applied to the full frame, before any train/test
# split. If standardize_data fits its statistics on the data it receives, the
# test rows influence the scaling of the training rows — confirm.
scaled_predictors = build_features.standardize_data(predictors)

# Single-column frame holding the label.
target = pd.DataFrame(matches_processed['target'])

In [None]:
# List the columns that remain as model inputs once the non-predictor columns
# have been dropped.
predictors.columns

## Model Building 

### Defining Train & Test Sets 

In [None]:
# Chronological hold-out split on the calendar year of each match.
#
# The model is fit on the earlier years and evaluated on the most recent one.
# The reverse (fit on the hold-out year, score on earlier years) would measure
# the model on matches that precede its own training data.
HOLDOUT_YEAR = 2022

# Align the year to each frame's own index, the same way a column assignment
# would. Rows without a matching datetime become NaN, compare False against
# both conditions below, and therefore land in neither set.
# Using standalone masks avoids temporarily adding a 'Year' column to the
# shared frames and dropping it again in place.
match_year = matches_processed.datetime.dt.year
predictor_year = match_year.reindex(scaled_predictors.index)
target_year = match_year.reindex(target.index)

# Training Set: every match played before the hold-out year

X_train = scaled_predictors.loc[predictor_year < HOLDOUT_YEAR]
Y_train = target.loc[target_year < HOLDOUT_YEAR]


# Test Set: matches played in the hold-out year

X_test = scaled_predictors.loc[predictor_year == HOLDOUT_YEAR]
Y_test = target.loc[target_year == HOLDOUT_YEAR]


### Model Selection

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score


# Cross-validated comparison of the candidate classifiers on the training set.
# Only X_train / Y_train are read here; the hold-out test set is not touched.
X = X_train
y = Y_train


# Number of folds for time series cross-validation
# NOTE(review): TimeSeriesSplit always trains on the leading rows and validates
# on the rows that follow — confirm that X is ordered chronologically.
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Define the models (library defaults, fixed seed for repeatable results)
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Dictionary to store cross-validation results
cv_results = {}

# Loop through the models and perform cross-validation
for model_name, model in models.items():
    precision_scores = []  # To store precision scores for each fold

    # Fold-local names are used on purpose: binding the fold slices to
    # X_train / X_test would overwrite the hold-out split created earlier in
    # the notebook and make this cell give different results when re-run.
    for fit_index, val_index in tscv.split(X):
        X_fold_fit, X_fold_val = X.iloc[fit_index], X.iloc[val_index]

        # y is a single-column frame; flatten it to the 1-D shape that
        # estimators and metrics expect for labels.
        y_fold_fit = y.iloc[fit_index].to_numpy().ravel()
        y_fold_val = y.iloc[val_index].to_numpy().ravel()

        # Fit the model on this fold's training slice
        model.fit(X_fold_fit, y_fold_fit)

        # Predict on this fold's validation slice
        fold_pred = model.predict(X_fold_val)

        # Calculate precision and store it
        precision_scores.append(precision_score(y_fold_val, fold_pred))

    # Store the average precision and median precision for this model
    cv_results[model_name] = {
        'Average Precision': np.mean(precision_scores),
        'Median Precision': np.median(precision_scores)
    }

# Print the cross-validation results
for model_name, scores in cv_results.items():
    print(f'{model_name}:')
    print(f'Average Precision = {scores["Average Precision"]:.4f}')
    print(f'Median Precision = {scores["Median Precision"]:.4f}')
    print()

### Model Optimization

### The Final Model 

## Model Diagnostics & Evaluation