In [1]:
#First Model using SVM

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load the match data, discarding the CSV index column and the match date.
df = pd.read_csv('final_dataset.csv').drop(columns=['Unnamed: 0', 'Date'])

# 'FTR' (Full Time Result) is the target; every other remaining column is a feature.
# NOTE(review): the full-time goal columns (FTHG / FTAG) are still part of X in
# this cell, which presumably leaks the result into the features and would
# explain a perfect score -- confirm before trusting these numbers.
y = df['FTR']
X = df.drop(columns=['FTR'])

# Columns that are one-hot encoded; all other columns pass through unchanged.
categorical_features = [
    'HomeTeam', 'AwayTeam',
    'HM1', 'HM2', 'HM3', 'HM4', 'HM5',
    'AM1', 'AM2', 'AM3', 'AM4', 'AM5',
    'HTFormPtsStr', 'ATFormPtsStr',
]
one_hot_encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode -> scale (without centring) -> linear-kernel SVM.
pipeline = Pipeline(steps=[
    ('encoder', one_hot_encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', SVC(kernel='linear')),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report per-class metrics and overall accuracy on the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           H       1.00      1.00      1.00       634
          NH       1.00      1.00      1.00       734

    accuracy                           1.00      1368
   macro avg       1.00      1.00      1.00      1368
weighted avg       1.00      1.00      1.00      1368

Accuracy Score: 1.0


In [None]:
#Second model after dropping FTHG and FTAG

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load the match data and remove, in a single pass, the CSV index column,
# the match date, and the full-time goal counts ('FTHG', 'FTAG').
df = pd.read_csv('final_dataset.csv').drop(
    columns=['Unnamed: 0', 'Date', 'FTHG', 'FTAG']
)

# 'FTR' (Full Time Result) is the target; the remaining columns are features.
y = df['FTR']
X = df.drop(columns=['FTR'])

# Columns that are one-hot encoded; all other columns pass through unchanged.
categorical_features = [
    'HomeTeam', 'AwayTeam',
    'HM1', 'HM2', 'HM3', 'HM4', 'HM5',
    'AM1', 'AM2', 'AM3', 'AM4', 'AM5',
    'HTFormPtsStr', 'ATFormPtsStr',
]
one_hot_encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode -> scale (without centring) -> linear-kernel SVM.
pipeline = Pipeline(steps=[
    ('encoder', one_hot_encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', SVC(kernel='linear')),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report per-class metrics and overall accuracy on the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
#Test to see which kernel function performs best, independently of the C and gamma parameters

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load the match data and remove the CSV index column, the match date and the
# full-time goal counts ('FTHG', 'FTAG').
df = pd.read_csv('final_dataset.csv').drop(
    columns=['Unnamed: 0', 'Date', 'FTHG', 'FTAG']
)

# 'FTR' (Full Time Result) is the target; the remaining columns are features.
y = df['FTR']
X = df.drop(columns=['FTR'])

# Columns that are one-hot encoded; all other columns pass through unchanged.
categorical_features = [
    'HomeTeam', 'AwayTeam',
    'HM1', 'HM2', 'HM3', 'HM4', 'HM5',
    'AM1', 'AM2', 'AM3', 'AM4', 'AM5',
    'HTFormPtsStr', 'ATFormPtsStr',
]
one_hot_encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Kernels to compare, each with SVC's default C and gamma.
kernels = ['linear', 'rbf', 'poly']

for kernel in kernels:
    # Same preprocessing for every run; only the SVC kernel changes.
    pipeline = Pipeline(steps=[
        ('encoder', one_hot_encoder),
        ('scaler', StandardScaler(with_mean=False)),
        ('svc', SVC(kernel=kernel)),
    ])

    # Fit on the training rows, then predict the held-out rows.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Report the metrics for this kernel, followed by a blank separator.
    print(f"Kernel: {kernel}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))
    print("\n")


In [None]:
#Cross-validation of the linear-kernel model

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


df = pd.read_csv('final_dataset.csv')


#Drop the 'Unnamed: 0'
df.drop(['Unnamed: 0'], axis=1, inplace=True)

#Drop the Date column
df.drop(['Date'], axis=1, inplace=True)

#Drop 'FTHG' and 'FTAG' columns
df.drop(['FTHG', 'FTAG'], axis=1, inplace=True)

#'FTR' (Full Time Result) is the target variable
X = df.drop(['FTR'], axis=1)
y = df['FTR']

#One-hot encode categorical variables; all other columns pass through unchanged
categorical_features = ['HomeTeam', 'AwayTeam', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPtsStr', 'ATFormPtsStr']
one_hot_encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)], remainder='passthrough')

#Split the dataset into train and test set (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a pipeline with OneHotEncoder, StandardScaler, and a linear-kernel SVC
#(the closing parenthesis of the Pipeline(...) call was missing, which made the
#whole cell fail with a SyntaxError)
pipeline = Pipeline(steps=[('encoder', one_hot_encoder), ('scaler', StandardScaler(with_mean=False)), ('svc', SVC(kernel='linear'))])

#5-fold cross-validation of the unfitted pipeline over the full dataset;
#cross_val_score fits a fresh clone per fold, so `pipeline` itself stays unfitted here
cv_scores = cross_val_score(pipeline, X, y, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

#Training on the train split only
pipeline.fit(X_train, y_train)

#Prediction on the held-out test split
y_pred = pipeline.predict(X_test)

#Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
  

In [None]:
#Introducing Hyperparameters C (Gamma is not applicable here because linear kernel function doesn't have a gamma parameter). 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Load the match data and remove the CSV index column, the match date and the
# full-time goal counts ('FTHG', 'FTAG').
df = pd.read_csv('final_dataset.csv').drop(
    columns=['Unnamed: 0', 'Date', 'FTHG', 'FTAG']
)

# 'FTR' (Full Time Result) is the target; the remaining columns are features.
y = df['FTR']
X = df.drop(columns=['FTR'])

# Columns that are one-hot encoded; all other columns pass through unchanged.
categorical_features = [
    'HomeTeam', 'AwayTeam',
    'HM1', 'HM2', 'HM3', 'HM4', 'HM5',
    'AM1', 'AM2', 'AM3', 'AM4', 'AM5',
    'HTFormPtsStr', 'ATFormPtsStr',
]
one_hot_encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode -> scale (without centring) -> linear-kernel SVM.
# C is spelled out explicitly so it can be adjusted by hand.
pipeline = Pipeline(steps=[
    ('encoder', one_hot_encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('svc', SVC(kernel='linear', C=1.0)),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report per-class metrics and overall accuracy on the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
#Hyperparameter Tuning of C

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('final_dataset.csv')


#Drop the 'Unnamed: 0'
df.drop(['Unnamed: 0'], axis=1, inplace=True)

#Drop the Date column
df.drop(['Date'], axis=1, inplace=True)

#Drop 'FTHG' and 'FTAG' columns
df.drop(['FTHG', 'FTAG'], axis=1, inplace=True)

#'FTR' (Full Time Result) is the target variable
X = df.drop(['FTR'], axis=1)
y = df['FTR']

#One-hot encode categorical variables; all other columns pass through unchanged
categorical_features = ['HomeTeam', 'AwayTeam', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPtsStr', 'ATFormPtsStr']
one_hot_encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)], remainder='passthrough')

#Split into train and test set (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a pipeline with OneHotEncoder, StandardScaler, and a linear-kernel SVC.
#The C given here is only a placeholder: the grid search below overrides it.
pipeline = Pipeline(steps=[('encoder', one_hot_encoder), ('scaler', StandardScaler(with_mean=False)), ('svc', SVC(kernel='linear', C=1.0))])

#No separate pipeline.fit() here: GridSearchCV clones the pipeline for every
#candidate/fold and refits the best one itself, so fitting beforehand would
#only add one more full SVM training run whose result is never used.

#Parameter grid: candidate values for the SVC regularisation parameter C
param_grid = {
    'svc__C': [0.1, 1, 10, 100],  # Example values for C
}

#GridSearchCV object: 5-fold CV on the training split, scored by accuracy
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)

#Fit the grid search (also refits the best candidate on the whole training split)
grid_search.fit(X_train, y_train)

#Print best parameters and best (mean cross-validated) score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

#Prediction with the refitted best estimator
y_pred = grid_search.predict(X_test)

#Evaluation
print("Classification Report for the best model:")
print(classification_report(y_test, y_pred))
print("Accuracy Score for the best model:", accuracy_score(y_test, y_pred))


In [None]:
#Grid Search Testing all at the same time

#Grid Search as hyperparameter tuning

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('final_dataset.csv')


#Drop the 'Unnamed: 0'
df.drop(['Unnamed: 0'], axis=1, inplace=True)

#Drop the Date column and the full-time goal counts ('FTHG', 'FTAG')
df.drop(['Date', 'FTHG', 'FTAG'], axis=1, inplace=True)


#'FTR' (Full Time Result) is the target variable
X = df.drop(['FTR'], axis=1)
y = df['FTR']

#One-hot encode categorical variables; all other columns pass through unchanged
categorical_features = ['HomeTeam', 'AwayTeam', 'HM1', 'HM2', 'HM3', 'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTFormPtsStr', 'ATFormPtsStr']
one_hot_encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)], remainder='passthrough')

#Split into train and test set (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Create a pipeline with OneHotEncoder, StandardScaler, and SVC
#(kernel, C and gamma are supplied by the grid search below)
pipeline = Pipeline(steps=[('encoder', one_hot_encoder), ('scaler', StandardScaler(with_mean=False)), ('svc', SVC())])

#Hyperparameter tuning using GridSearchCV.
#One sub-grid per kernel: gamma is only searched for 'rbf'. The linear kernel
#ignores gamma, so crossing it with gamma would fit every linear candidate
#twice for identical results.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10, 100],  # Regularization parameter
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10, 100],  # Regularization parameter
        'svc__gamma': ['scale', 'auto'],  # Kernel coefficient (used by 'rbf')
    },
]

#5-fold CV on the training split, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

#Training
grid_search.fit(X_train, y_train)

#Find best estimator (already refitted on the whole training split)
best_pipeline = grid_search.best_estimator_

#Prediction
y_pred = best_pipeline.predict(X_test)

#Evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Best Parameters:", grid_search.best_params_)

In [None]:
#Feature Selection
#Recursive Feature Elimination (RFE) with a model that can provide feature importance or coefficients e.g. SVM

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import RFE  # Import RFE for feature selection
from sklearn.metrics import classification_report, accuracy_score


# Load the match data and remove the CSV index column, the match date and the
# full-time goal counts ('FTHG', 'FTAG').
df = pd.read_csv('final_dataset.csv').drop(
    columns=['Unnamed: 0', 'Date', 'FTHG', 'FTAG']
)

# 'FTR' (Full Time Result) is the target; the remaining columns are features.
y = df['FTR']
X = df.drop(columns=['FTR'])

# Columns that are one-hot encoded; all other columns pass through unchanged.
categorical_features = [
    'HomeTeam', 'AwayTeam',
    'HM1', 'HM2', 'HM3', 'HM4', 'HM5',
    'AM1', 'AM2', 'AM3', 'AM4', 'AM5',
    'HTFormPtsStr', 'ATFormPtsStr',
]
one_hot_encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Recursive feature elimination driven by a linear SVC, keeping 10 of the
# encoded/scaled features (adjust n_features_to_select as needed).
svc_for_rfe = SVC(kernel='linear')
rfe = RFE(estimator=svc_for_rfe, n_features_to_select=10)

# Encode -> scale (without centring) -> select features -> linear-kernel SVM.
pipeline = Pipeline(steps=[
    ('encoder', one_hot_encoder),
    ('scaler', StandardScaler(with_mean=False)),
    ('rfe', rfe),
    ('svc', SVC(kernel='linear')),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report per-class metrics and overall accuracy on the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))