# <center>Football match result prediction</center>

### Create functions for evaluation:
- Class report
- Confusion matrix

In [None]:
def Fancy_Class_Report(y_true, y_pred):
    """Plot a classification report as two heatmaps and print summary scores.

    The first heatmap ('Blues') shows precision, recall, f1-score and support
    for every individual class.  The second heatmap ('Greens') shows the same
    four metrics for the 'macro avg' and 'weighted avg' rows.  The accuracy
    and the macro-averaged precision are printed to stdout between the two
    figures.

    Args:
        y_true: Ground-truth labels, passed straight to
            ``classification_report`` and ``precision_score``.
        y_pred: Predicted labels, passed straight to the same functions.

    Returns:
        None. The function only draws figures and prints.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    metrics = ['precision', 'recall', 'f1-score', 'support']

    # Assumes the report dict ends with exactly three summary entries
    # ('accuracy', 'macro avg', 'weighted avg' are read below); every key
    # before them is treated as a class label.
    classes = list(report.keys())[:-3]
    per_class = np.array(
        [[report[class_name][metric] for metric in metrics] for class_name in classes]
    )

    # The colour scale is pinned to [0, 1], so the 'support' column (a count)
    # is annotated correctly but drawn at the top of the colour range.
    sns.heatmap(per_class, annot=True, fmt='.3f', cmap='Blues',
                xticklabels=metrics, yticklabels=classes, vmin=0.0, vmax=1.0)
    plt.title('Classification Report pt.1')
    plt.show()

    macro_avg = [report['macro avg'][metric] for metric in metrics]
    weighted_avg = [report['weighted avg'][metric] for metric in metrics]
    accuracy = report['accuracy']
    precision = precision_score(y_true, y_pred, average='macro')

    averages = np.array([macro_avg, weighted_avg])
    sns.heatmap(averages, annot=True, fmt='.3f', cmap='Greens', xticklabels=metrics,
                yticklabels=['macro avg', 'weighted avg'], vmin=0.0, vmax=1.0)

    print("Accuracy is: ", accuracy)
    print("Precision score is: ", precision)

    plt.title('Classification Report pt.2')
    plt.show()

def Fancy_Confusion_Matrix(y_true, y_pred):
    """Draw the confusion matrix as a red heatmap and print the raw counts.

    The axis labels are fixed to the two outcomes used in this notebook
    ("D or L" versus "W"), so the function is meant for that binary target.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        None. The function only draws a figure and prints the matrix.
    """
    column_labels = ['Predicted D or L', 'Predicted W']
    row_labels = ['Actually D or L', 'Actually W']

    conf_matrix = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Reds',
        xticklabels=column_labels,
        yticklabels=row_labels,
    )
    print(conf_matrix)
    plt.title('Confusion Matrix')
    plt.show()


### Import the necessary modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report , precision_score
import seaborn as sns
import matplotlib.pyplot as plt # to help in plotting results in a readable manner
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

#SVM
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
#DNN
import keras
from keras import Sequential , layers
from keras.layers import Dense , Dropout , Activation
from keras.wrappers.scikit_learn import KerasClassifier 
import tensorflow as tf
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
from scipy.stats import randint


### Loading the data:
- load the csv file with the data
- use `head` to show the first five rows of the initial table before any modifications


In [None]:
# Read the raw match data; the first CSV column is used as the row index.
matches_initial = pd.read_csv("matches.csv", index_col=0)
# Preview the first rows of the unmodified table.
matches_initial.head()

### Preprocessing the data

#### Testing data with random forest before it is modified:

In [None]:
# Replace the index read from the CSV with consecutive integers 0..n-1 (one per row).
matches_initial.index = range(matches_initial.shape[0])

In [None]:
# Drop the "Match Report" and "Notes" columns in one call, then preview the result.
matches_initial = matches_initial.drop(columns=["Match Report", "Notes"])
matches_initial.head()


In [None]:
# Lift the column display limit so that previews show every column.
pd.set_option('display.max_columns', None)
matches_initial.head()


In [None]:
# NOTE(review): presumably the expected number of rows
# (38 matches x 20 teams x 5 seasons) - confirm against the data source.
print(38 * 20 * 5)
# Actual (rows, columns) of the loaded table, for comparison with the figure above.
matches_initial.shape

In [None]:
# Number of rows recorded for each team.
print(matches_initial["Team"].value_counts())
# NOTE(review): hand-written sum of twenty small integers; presumably a
# per-team tally (e.g. seasons per team) - confirm what it is meant to verify.
4+4+4+4+4+4+4+4+4+5+5+5+5+5+5+5+6+6+6+7

In [None]:
# Inspect the column dtypes before any conversion.
matches_initial.dtypes

In [None]:
# Convert the "Date" column to datetimes so it can be compared and its
# day of week extracted later.
matches_initial["Date"] = pd.to_datetime(matches_initial["Date"])
# Re-check the dtypes after the conversion.
matches_initial.dtypes

In [None]:
# Add the numeric columns used by the models, in this order:
#   Venue_Code / Opponent_Code - integer category codes of the text columns
#   Hour                       - kick-off hour (text before the first ':' in "Time")
#   Day_Code                   - day of the week of "Date"
#   Target                     - 1 when "Result" is "W", otherwise 0
matches_initial = matches_initial.assign(
    Venue_Code=matches_initial["Venue"].astype("category").cat.codes,
    Opponent_Code=matches_initial["Opponent"].astype("category").cat.codes,
    Hour=matches_initial["Time"].str.replace(":.+","", regex=True).astype("int"),
    Day_Code=matches_initial["Date"].dt.dayofweek,
    Target=(matches_initial["Result"] == "W").astype("int"),
)
matches_initial.head()

###### Random forest :
To measure how the accuracy changes after modifying the data, we evaluate the same classifier before and after the modification.


In [None]:
# Baseline model, trained only on the four encoded columns.
rf_clf = RandomForestClassifier(n_estimators=1000, min_samples_split=10, random_state=42)
# Chronological split: rows dated up to 2022-01-01 train the model, later rows test it.
initial_train = matches_initial[matches_initial["Date"] <= '2022-01-01']
initial_test = matches_initial[matches_initial["Date"] > '2022-01-01']
initial_preds = ["Venue_Code", "Opponent_Code", "Hour", "Day_Code"]
rf_clf.fit(initial_train[initial_preds], initial_train["Target"])
predictions_initial = rf_clf.predict(initial_test[initial_preds])
# NOTE(review): the accuracy is stored but not printed or displayed in this cell.
accuracy = accuracy_score(initial_test["Target"], predictions_initial)


In [None]:
# Put the actual and predicted labels of the baseline model side by side.
y_true_pred = pd.DataFrame(dict(Actual=initial_test["Target"], Prediction=predictions_initial))
print(y_true_pred)
# Text report first, then the heatmap versions of the report and the confusion matrix.
print(classification_report(y_true_pred["Actual"], y_true_pred["Prediction"]))
Fancy_Class_Report(y_true_pred["Actual"], y_true_pred["Prediction"])
Fancy_Confusion_Matrix(y_true_pred["Actual"], y_true_pred["Prediction"])

##### Modify the data:
- The `form` function adds more predictors by computing, for each match, the average of the previous 5 games
- The `make_predictions_non_ann` function takes in the data, the predictors and a classifier, outputs the confusion matrix and the classification report, and then returns a table with the actual and predicted values
- The `MissingDictionary` class is used to normalise the team names

In [None]:
def form(team, columns, new_columns):
    """Add rolling "form" averages for one team's matches.

    The rows are ordered by "Date".  For every row, each entry of
    ``new_columns`` receives the mean of the corresponding entry of
    ``columns`` over the five preceding rows; the current row is excluded
    (``closed='left'``).  Rows without five preceding matches have no
    average and are dropped.

    Args:
        team: DataFrame with a "Date" column and the columns in ``columns``.
        columns: Names of the source columns to average.
        new_columns: Names of the columns that receive the averages, in the
            same order as ``columns``.

    Returns:
        A new date-sorted DataFrame with the form columns added; the input
        frame is left unchanged.
    """
    ordered = team.sort_values("Date")
    ordered[new_columns] = ordered[columns].rolling(5, closed='left').mean()
    return ordered.dropna(subset=new_columns)

In [None]:
def make_predictions_non_ann(data, predictors, clf, split_date='2022-01-01'):
    """Fit ``clf`` on the earlier matches and evaluate it on the later ones.

    Rows dated on or before ``split_date`` form the training set and rows
    dated after it form the test set.  The confusion matrix and the
    classification report for the test set are plotted and printed.

    Args:
        data: DataFrame containing "Date", "Target" and every column named
            in ``predictors``.
        predictors: List of column names used as model inputs.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        split_date: Last date (inclusive) that belongs to the training set.

    Returns:
        DataFrame with the columns "Actual" and "Prediction", indexed like
        the test rows of ``data``.
    """
    train = data[data["Date"] <= split_date]
    test = data[data["Date"] > split_date]

    clf.fit(train[predictors], train["Target"])
    predicts = clf.predict(test[predictors])

    # ``index`` must be an argument of DataFrame, not of dict(): inside the
    # dict it only created a stray column literally named "index".
    true_pred_dict = pd.DataFrame(
        dict(Actual=test["Target"], Prediction=predicts),
        index=test.index,
    )
    Fancy_Confusion_Matrix(true_pred_dict['Actual'], true_pred_dict['Prediction'])
    Fancy_Class_Report(true_pred_dict['Actual'], true_pred_dict['Prediction'])
    print(classification_report(true_pred_dict['Actual'], true_pred_dict['Prediction']))
    return true_pred_dict

In [None]:
class MissingDictionary(dict):
    """dict whose lookups fall back to the key itself when the key is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return key


# Full team names (left) and the shorter spelling they are mapped to (right).
_name_pairs = [
    ("Brighton and Hove Albion", "Brighton"),
    ("Manchester United", "Manchester Utd"),
    ("Newcastle United", "Newcastle Utd"),
    ("Tottenham Hotspur", "Tottenham"),
    ("West Ham United", "West Ham"),
    ("Wolverhampton Wanderers", "Wolves"),
    ("Nottingham Forest", "Nott'ham Forest"),
    ("West Bromwich Albion", "West Brom"),
    ("Sheffield United", "Sheffield Utd"),
    ("Huddersfield Town", "Huddersfield"),
]
mapping_vals = dict(_name_pairs)
del _name_pairs

# Names that are not listed above map to themselves.
mapping_ = MissingDictionary(mapping_vals)

In [None]:
# Columns whose recent averages become predictors, and the names of the new columns.
f_col_names = ["xG", "xGA", "Poss", "Dist", "FK", "PK", "PKatt"]
form_columns = [f"{c}_form" for c in f_col_names]
# Compute the form columns separately for each team so a window never mixes teams.
matches = matches_initial.groupby("Team").apply(lambda x: form(x, f_col_names, form_columns))
# Remove the "Team" level from the index of the grouped result.
matches = matches.droplevel("Team")
# Encode the formation as integer category codes.
matches["Formation_Code"] = matches["Formation"].astype("category").cat.codes
print(matches.head())


formation = ["Formation_Code"]
# Full predictor list: the four baseline columns, the form averages and the formation code.
predictors = initial_preds + form_columns + formation
results_table =make_predictions_non_ann(matches, predictors,rf_clf)
# Attach the match details to each prediction by index.
results_table = results_table.merge(matches[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)
# Map team names through mapping_ so they can be compared with the "Opponent" column.
results_table["Normalised_Team"] = results_table["Team"].map(mapping_)
# Self-merge: pair each row (_x) with the row (_y) of the same date whose "Opponent"
# equals this row's normalised team name.
results_merged = results_table.merge(results_table, left_on=["Date", "Normalised_Team"], right_on=["Date", "Opponent"])
# Among pairs where the _x row is predicted 1 and the _y row is predicted 0,
# count the actual outcomes of the _x row.
print(results_merged[(results_merged["Prediction_x"]==1) & (results_merged["Prediction_y"]== 0)]["Actual_x"].value_counts())
results_merged


In [None]:
# Chronological split of the feature-enriched table, using the same cut-off
# date as make_predictions_non_ann; used by the neural-network cells below.
train = matches[matches["Date"] <= '2022-01-01']
test = matches[matches["Date"] > '2022-01-01']


In [None]:
# Class balance of the test rows: number of actual 1s, then actual 0s ...
print((results_table["Actual"] == 1).sum())
print((results_table["Actual"] == 0).sum())
# ... followed by how many rows the model predicted as 1 and as 0.
print((results_table["Prediction"] == 1).sum())
print((results_table["Prediction"] == 0).sum())
results_table


## SVM model
#### RBF kernel:

In [None]:

# Create the SVM model; no kernel is passed, so SVC's default kernel is used
# (the section heading says RBF).
# NOTE(review): the predictors are passed unscaled; StandardScaler is imported
# by the notebook but not applied here - confirm this is intended.
svm_model_rbf = SVC(random_state=0)

# Fit on the training period, plot/print the evaluation and keep the
# actual-vs-predicted table (overwrites the random-forest results_table).
results_table = make_predictions_non_ann(matches, predictors, svm_model_rbf)
results_table



#### Polynomial kernel:

In [None]:

# Create the SVM model with a degree-4 polynomial kernel and C=10.
svm_model = SVC(random_state=0,kernel='poly',C=10,degree=4)

# Same evaluation as the other models; overwrites the previous results_table.
results_table = make_predictions_non_ann(matches, predictors, svm_model)
results_table



## Deep neural network

In [None]:
# Fully connected network: 32 -> 16 -> 8 ReLU units followed by one sigmoid
# output unit. The three hidden layers use a Glorot-uniform initializer seeded
# with 42; the output layer is given no explicit initializer.
model = Sequential()
model.add(Dense(32, input_dim=len(predictors), activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dense(8, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping: stop after 10 epochs without improvement in
# validation loss and restore the best weights seen.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with early stopping; validation_split=0.2 holds out part of
# the training data for the monitored validation loss.
history = model.fit(train[predictors], train["Target"], epochs=1000, validation_split=0.2, callbacks=[early_stop])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(test[predictors], test["Target"])
print('Test Accuracy:', accuracy)

In [None]:
# Evaluate on the test set again (same call as at the end of the training
# cell), this time printing the loss as well as the accuracy.
loss,accuracy = model.evaluate(test[predictors],test["Target"])
print('loss:', loss, 'accuracy:', accuracy)



In [None]:
# Sigmoid outputs of the network for the test rows.
y_pred_prob = model.predict(test[predictors])

# Round the outputs to the nearest integer to obtain 0/1 class labels.
y_pred = np.round(y_pred_prob)

# Same three evaluation outputs as for the non-neural models.
Fancy_Confusion_Matrix(test["Target"], y_pred)
Fancy_Class_Report(test["Target"], y_pred)
print(classification_report(test["Target"], y_pred))


#### Attempting to add extra layers

In [None]:
# Larger variant of the previous network: 64 -> 32 -> 16 ReLU units, each
# followed by Dropout(0.5), then one sigmoid output unit. This rebinds `model`,
# so the earlier network is no longer reachable through that name.
model = Sequential()
model.add(Dense(64, input_dim=len(predictors), activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping (same settings as for the first network)
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(train[predictors], train["Target"], epochs=1000, validation_split=0.2, callbacks=[early_stop])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(test[predictors], test["Target"])
print('Test Accuracy:', accuracy)

In [None]:
# Evaluate the larger network on the test set again, printing loss and accuracy.
loss,accuracy = model.evaluate(test[predictors],test["Target"])
print('loss:', loss, 'accuracy:', accuracy)


In [None]:
# Sigmoid outputs of the larger network for the test rows.
y_pred_prob = model.predict(test[predictors])

# Round the outputs to the nearest integer to obtain 0/1 class labels.
y_pred = np.round(y_pred_prob)

Fancy_Confusion_Matrix(test["Target"], y_pred)
Fancy_Class_Report(test["Target"], y_pred)
print(classification_report(test["Target"], y_pred))


After adding the extra layers the DNN performs worse, which might be due to overfitting.

## Tuned deep neural network:


#### Create network and wrap it using kerasClassifier to perform gridsearch

In [None]:
# Disabled code: the builder below is kept only as a string literal and is never executed.
# NOTE(review): its output layer is Dense(1) with no activation although the
# model is compiled with binary_crossentropy; the networks above use a sigmoid
# output - add activation='sigmoid' before re-enabling.
'''def create_model(layers, activation):
    model = Sequential()
    for i, nodes in enumerate(layers):
        if i==0:
            model.add(Dense(nodes, input_dim=len(predictors)))
            model.add(Activation(activation=activation))
        else:
            model.add(Dense(nodes))
            model.add(Activation(activation=activation))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
'''

In [None]:
#model = KerasClassifier(build_fn=create_model,verbose=5)

In [None]:
# Define the hyperparameters to tune
'''layers = [[32], [16, 8], [64, 32, 16]]
activations = ['sigmoid', 'relu']
# Define the grid search parameters
param_grid = dict(layers=layers, activation=activations, batch_size=[16, 32], epochs=[100, 500])
# Create a GridSearchCV object
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_result = grid.fit(train[predictors], train['Target'])

# Print the best score and parameters
print('Best Score:', grid_result.best_score_)
print('Best Params:', grid_result.best_params_)

# Calculate the accuracy on the test data
accuracy = grid.score(test[predictors], test['Target'])
print('Test Accuracy:', accuracy)
'''

This attempt failed: grid search is computationally expensive for large datasets, and it was taking an incredibly long time to find the best parameters.

#### Create network with more hyperparameter options to use Randomsearch:

Because GridSearchCV takes a very long time, RandomizedSearchCV was used instead to find good values faster. It may not be as accurate, but more options can be offered for it to choose from and it still takes less time.

In [None]:
# Disabled code: the builder below is kept only as a string literal and is never executed.
# NOTE(review): the builder calls model.fit on the full `train` frame before
# returning the model, so every model handed to the search wrapper would
# already be trained on all training rows - remove that call (and let the
# wrapper do the fitting) before re-enabling.
'''def create_model(hidden_layers=5, units=16, dropout=0.1, optimizer='adam', activation='relu', epochs=100):
    model = Sequential()
    for i in range(hidden_layers):
        if i == 0:
            model.add(Dense(units=units, input_dim=len(predictors), activation=activation))
            model.add(Dropout(dropout))
        else:
            model.add(Dense(units=units, activation=activation))
            model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    history = model.fit(train[predictors], train['Target'], epochs=epochs, validation_split=0.2)
    return model
'''

In [None]:
#model = KerasClassifier(build_fn=create_model, verbose=5)


In [None]:
# Define the hyperparameters to tune
'''param_dist = {
    'hidden_layers': randint(1, 5),
    'units': randint(8, 64),
    'dropout': [0.1, 0.2, 0.3],
    'optimizer': ['adam', 'rmsprop'],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'epochs': [100, 200, 300]}



# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, verbose=1)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(train[predictors], train['Target'])
'''

In [None]:
# Print the best score and parameters
#print('Best Score:', random_search.best_score_)
#print('Best Params:', random_search.best_params_)


In [None]:

# Calculate the accuracy on the test data
#accuracy = random_search.score(test[predictors], test['Target'])
#print('Test Accuracy:', accuracy)

Because random search samples only some of the possible hyperparameter combinations, it did not succeed in increasing the DNN's accuracy.

# Comparing performance
### Classification report comparison: 
| model type | precision | recall | f1-score| support|
| :-: | :-: | :-: | :-:| :-: |
|Random forest classifier| 0.60 | 0.62 | 0.60 | 1052|
|Random forest classifier modified| 0.64 | 0.65 | 0.62 | 1047|
|SVM model with rbf kernel | 0.64 | 0.64 | 0.52 | 1047|
|SVM model with poly kernel |0.64|0.64|0.57|1047|
|deep neural network |0.62|0.63|0.61|1047|
|deep neural network with extra layers|0.62|0.61|0.48|1047|

### Accuracy comparison: 
- Random forest classifier:61.6%
- Random forest classifier modified:64.8%
- SVM model with rbf kernel :63.7%
- SVM model with poly kernel : 63.8%
- deep neural network :63.6%
- deep neural network with extra layers:61.3%

As the comparison shows, the random forest classifier trained on the preprocessed data is the best-performing model, with the highest accuracy.