In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Importing scikit-learn modules for machine learning tasks
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Load the data
data = pd.read_csv('../data/processed/data_processed.csv')

In [3]:
# Initialize different classifiers for evaluation
models = {
    'LogisticRegression': LogisticRegression(C=0.1),
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=5),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, max_depth=5),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=5),
    'SVC': SVC(C=0.1)
}


In [4]:
#Feature and target variables
X_cat = ['formation']
X_num = data.drop(labels=['date',
                          'round',
                          'result',
                          'gf',
                          'ga',
                          'opponent',
                          'formation',
                          'season',
                          'team',
                          'gdiff',
                          'xgdiff',
                          'points',
                          'exppoints'],
                          axis=1
                 ).columns

In [29]:
# Create function which fits all the models from the dictionary above
def model_function(data, target, X_cat, X_num, models, test_size=0.2, random_state=42):
    """
    Trains and evaluates multiple models with preprocessing pipelines for both categorical 
    and numerical features, returning model performance including accuracy, confusion matrices, 
    and classification reports.

    :param data: the input data containing features and target column
    :param target: the name of the target column to be predicted
    :param X_cat: list of categorical feature column names
    :param X_num: list of numerical feature column names
    :param models: a dictionary of model names and their corresponding sklearn classifiers
    :param test_size: the proportion of the data to be used as test set
    :param random_state: random seed for reproducibility
    :return: DataFrame comparing accuracy scores of the models, 
             a dictionary of DataFrames for confusion matrices,
             and a dictionary of DataFrames for classification reports
    """

    # Define preprocessing pipelines
    cat_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    num_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', cat_pipeline, X_cat),
            ('num', num_pipeline, X_num)
        ])

    # Split the data into features (X) and target (y)
    X = data.drop(columns=[target])
    y = data[target]

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Encode target variable
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # Initialize DataFrame for model comparison
    models_comparison = pd.DataFrame(columns=['Model', 'Accuracy'])

    # Initialize dictionaries for confusion matrices and classification reports
    confusion_matrices = {}
    classification_reports = {}

    # Iterate through different classifiers provided in the dictionary
    for name, model in models.items():

        # Create a pipeline for each model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        # Fit the model
        pipeline.fit(X_train, y_train)

        # Predict on test data
        y_pred = pipeline.predict(X_test)

        # Compute accuracy
        acc = accuracy_score(y_test, y_pred)

        # Compute confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Compute classification report
        class_report = classification_report(y_test, y_pred, output_dict=True)

        # Append results to the accuracy comparison DataFrame
        models_comparison = pd.concat([models_comparison, pd.DataFrame({'Model': [name], 'Accuracy': [acc]})])

        # Store the confusion matrix as a DataFrame
        confusion_matrices[name] = pd.DataFrame(conf_matrix, 
                                                 index=[f'True {i}' for i in range(conf_matrix.shape[0])],
                                                 columns=[f'Pred {i}' for i in range(conf_matrix.shape[1])])

        # Store the classification report as a DataFrame
        classification_reports[name] = pd.DataFrame(class_report).transpose()

    # Sort by accuracy
    models_comparison = models_comparison.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

    return models_comparison, confusion_matrices, classification_reports

In [31]:
#Display the results
model_function(data, 'result', X_cat, X_num, models, test_size=0.2, random_state=42)

  models_comparison = pd.concat([models_comparison, pd.DataFrame({'Model': [name], 'Accuracy': [acc]})])


(                    Model  Accuracy
 0  RandomForestClassifier  0.719737
 1      LogisticRegression  0.713158
 2                     SVC  0.713158
 3  DecisionTreeClassifier  0.682895
 4    KNeighborsClassifier  0.677632,
 {'LogisticRegression':         Pred 0  Pred 1  Pred 2
  True 0      39      72      67
  True 1      25     243      19
  True 2      20      15     260,
  'DecisionTreeClassifier':         Pred 0  Pred 1  Pred 2
  True 0      26      66      86
  True 1      29     226      32
  True 2       9      19     267,
  'RandomForestClassifier':         Pred 0  Pred 1  Pred 2
  True 0       5      89      84
  True 1       1     255      31
  True 2       0       8     287,
  'KNeighborsClassifier':         Pred 0  Pred 1  Pred 2
  True 0      57      67      54
  True 1      45     218      24
  True 2      41      14     240,
  'SVC':         Pred 0  Pred 1  Pred 2
  True 0      19      89      70
  True 1      12     253      22
  True 2       7      18     270},
 {'Log