In [44]:
# Block 1: Importing Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# BLOCK 2

In [35]:

# List of seasons you want to load (excluding 2018-2019 for now)
seasons_train = ['1314', '1415', '1516', '1617', '1718']

# Initialize an empty list to store the training data
train_data_list = []

# Loop through each season and load the respective CSV file for training
for season in seasons_train:
    file_name = f'Datasets/Processed/season-{season}_with_weather.csv'  # Adjust the file path if necessary
    season_data = pd.read_csv(file_name)
    season_data['Season'] = season  # Add a column to keep track of the season
    train_data_list.append(season_data)

# Combine all the training data into a single DataFrame
train_data = pd.concat(train_data_list, ignore_index=True)

# Load the 2018-2019 season separately for prediction and comparison
test_season = '1819'
test_data = pd.read_csv(f'Datasets/Processed/season-{test_season}_with_weather.csv')
test_data['Season'] = test_season  # Add a column for season tracking

# Preview the data
train_data.head(), test_data.head()


(       Date               HomeTeam  AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  \
 0  09/08/13            Montpellier  Paris SG     1     1   D   1.0   0.0   H   
 1  10/08/13               Bordeaux    Monaco     0     2   A   0.0   0.0   D   
 2  10/08/13  Evian Thonon Gaillard   Sochaux     1     1   D   1.0   0.0   H   
 3  10/08/13                  Lille   Lorient     1     0   H   1.0   0.0   H   
 4  10/08/13                   Lyon      Nice     4     0   H   1.0   0.0   H   
 
    Referee  ...   AC   HY   AY   HR   AR  temperature  wind_speed  \
 0      NaN  ...  8.0  2.0  2.0  1.0  0.0    22.995833   28.020833   
 1      NaN  ...  7.0  1.0  0.0  0.0  0.0    20.550000    6.437500   
 2      NaN  ...  5.0  1.0  2.0  0.0  0.0    18.820833    8.229167   
 3      NaN  ...  3.0  1.0  1.0  0.0  1.0    17.225000    9.391667   
 4      NaN  ...  5.0  1.0  0.0  0.0  0.0    20.612500   13.066667   
 
    precipitation  Season  Div  
 0            0.0    1314  NaN  
 1            0.0    131

In [None]:
# BLOCK 3

In [56]:
# Block B3: Prepare Features and Target

# Define all columns as features (including weather-related features)
features = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 
            'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'temperature', 'wind_speed', 'precipitation']

# Prepare features (X) and target (y) for the training data
X_train = train_data[features]
y_train = train_data['FTR']

X_test = test_data[features]
y_test = test_data['FTR']

# Set pandas option to opt-in to future behavior
pd.set_option('future.no_silent_downcasting', True)

# Replace values in y_train and y_test
y_train = y_train.replace({'D': 0, 'H': 1, 'A': 2}).infer_objects(copy=False)
y_test = y_test.replace({'D': 0, 'H': 1, 'A': 2}).infer_objects(copy=False)

# Print the first few rows of y_train and y_test
print("y_train after replace:")
print(y_train.head())
print("y_test after replace:")
print(y_test.head())

# Map team names in X_train and X_test
X_train.loc[:, 'HomeTeam'] = X_train['HomeTeam'].map(team_mapping)
X_train.loc[:, 'AwayTeam'] = X_train['AwayTeam'].map(team_mapping)
X_test.loc[:, 'HomeTeam'] = X_test['HomeTeam'].map(team_mapping)
X_test.loc[:, 'AwayTeam'] = X_test['AwayTeam'].map(team_mapping)

# Print transformed X_train and X_test (first few rows)
print("X_train after team mapping:")
print(X_train.head())
print("X_test after team mapping:")
print(X_test.head())

# Encode referees
X_train.loc[:, 'Referee'] = referee_encoder.fit_transform(X_train['Referee'])
X_test.loc[:, 'Referee'] = referee_encoder.transform(X_test['Referee'])

# Print transformed Referee column
print("X_train 'Referee' column after encoding:")
print(X_train['Referee'].head())
print("X_test 'Referee' column after encoding:")
print(X_test['Referee'].head())

# Map HTR values
X_train.loc[:, 'HTR'] = X_train['HTR'].map({'D': 0, 'H': 1, 'A': 2})
X_test.loc[:, 'HTR'] = X_test['HTR'].map({'D': 0, 'H': 1, 'A': 2})

# Print transformed 'HTR' column
print("X_train 'HTR' column after mapping:")
print(X_train['HTR'].head())
print("X_test 'HTR' column after mapping:")
print(X_test['HTR'].head())

# Drop 'Date' column
X_train = X_train.drop('Date', axis=1)
X_test = X_test.drop('Date', axis=1)

# Print shapes after dropping 'Date'
print("X_train shape after dropping 'Date':", X_train.shape)
print("X_test shape after dropping 'Date':", X_test.shape)

# Fill missing numeric values with column means
X_train.loc[:, numeric_columns] = X_train[numeric_columns].fillna(X_train[numeric_columns].mean()).infer_objects(copy=False)
X_test.loc[:, numeric_columns] = X_test[numeric_columns].fillna(X_test[numeric_columns].mean()).infer_objects(copy=False)

# Print first few rows after filling missing values
print("X_train after filling missing values:")
print(X_train.head())
print("X_test after filling missing values:")
print(X_test.head())


y_train after replace:
0    0
1    2
2    0
3    1
4    1
Name: FTR, dtype: int64
y_test after replace:
0    1
1    2
2    1
3    2
4    2
Name: FTR, dtype: int64
X_train after team mapping:
       Date HomeTeam AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  Referee  ...  \
0  09/08/13        0       19     1     1   D   1.0   0.0   H      NaN  ...   
1  10/08/13        1       18     0     2   A   0.0   0.0   D      NaN  ...   
2  10/08/13        2       10     1     1   D   1.0   0.0   H      NaN  ...   
3  10/08/13        3       17     1     0   H   1.0   0.0   H      NaN  ...   
4  10/08/13        4       13     4     0   H   1.0   0.0   H      NaN  ...   

     AF   HC   AC   HY   AY   HR   AR  temperature  wind_speed  precipitation  
0  18.0  1.0  8.0  2.0  2.0  1.0  0.0    22.995833   28.020833            0.0  
1  11.0  3.0  7.0  1.0  0.0  0.0  0.0    20.550000    6.437500            0.0  
2  18.0  5.0  5.0  1.0  2.0  0.0  0.0    18.820833    8.229167            0.0  
3  18.0  4.0  

In [None]:
# BLOCK 4

In [46]:
# Initialize the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
    'CatBoost': cb.CatBoostClassifier(verbose=0)
}

# Train the models and evaluate
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train_encoded)  # Train on the encoded target
    
    # Predict the outcomes for both models on the test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"{model_name} accuracy: {accuracy}\n")


Training Logistic Regression...


ValueError: could not convert string to float: 'D'