In [23]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Importing scikit-learn modules for machine learning tasks
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [24]:
# Load the data
data = pd.read_csv('../data/processed/data_processed.csv')

In [25]:
#Feature and target variables
X_cat = ['formation']
X_num = data.drop(labels=['date',
                          'round',
                          'result',
                          'gf',
                          'ga',
                          'opponent',
                          'formation',
                          'season',
                          'team',
                          'gdiff',
                          'xgdiff',
                          'points',
                          'exppoints'],
                          axis=1
                 ).columns

y = data['result']

In [26]:
# Encode categorical variables
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [27]:
# Scale numerical varaibles
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [28]:
# Combine both pipelines for categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, X_cat),
        ('num', num_pipeline, X_num)
    ])

In [29]:
# Split the data into training and testing sets
X = data.drop(labels=['date',
                      'round',
                      'result',
                      'gf',
                      'ga',
                      'opponent',
                      'season',
                      'team',
                      'gdiff',
                      'xgdiff',
                      'points',
                      'exppoints'],
                      axis=1
             )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Encode target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [31]:
# Initialize different classifiers for evaluation
models = {
    'LogisticRegression': LogisticRegression(C=0.1),
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=5),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, max_depth=5),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=5),
    'SVC': SVC(C=0.1)
}


In [32]:
# Create function able to run through the models
def model_function(models, X_train, y_train, X_test, y_test):
    
    """
    Create ML models with the given parameters

        Iterate through different classifiers provided in the dictionary
    
            Create a pipeline for each model
        
            Fit the model
            
            Predict on test data
        
            Compute accuraccy
        
            Append results to the DataFrame
    
        Sort by accuracy
    """
    
    models_comparison = pd.DataFrame(columns=['Model', 'Accuracy'])
    for name, model in models.items():

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
    
        pipeline.fit(X_train, y_train)
    
        y_pred = pipeline.predict(X_test)
    
        acc = accuracy_score(y_test, y_pred)
        
        models_comparison = pd.concat([models_comparison, pd.DataFrame({'Model': [name], 'Accuracy': [acc]})])
    
    models_comparison = models_comparison.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
    
    return models_comparison

In [33]:
#Display the results
model_results = model_function(models, X_train, y_train, X_test, y_test)
print(model_results)

  models_comparison = pd.concat([models_comparison, pd.DataFrame({'Model': [name], 'Accuracy': [acc]})])


                    Model  Accuracy
0      LogisticRegression  0.713158
1                     SVC  0.713158
2  RandomForestClassifier  0.703947
3  DecisionTreeClassifier  0.682895
4    KNeighborsClassifier  0.677632
