The Goal here (pun intended) is to design a prediction system that can accurately predict whether the home team will win or not. We will use the final dataset produced by our earlier "Scraping and Cleaning" notebook to build our prediction model.

In [1]:
# Import the necessary libraries.
import pandas as pd
import numpy as np
import xgboost as xgb
from time import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

In [2]:
# Read the match-level dataset.
data = pd.read_csv('../Datasets/final_dataset_complete.csv')

# Remove first 3 matchweeks
data = data[data.MW > 3]

# Keep the team names (same row index as `data`) before they are dropped from
# the feature table, so predictions can be labelled with the fixture later.
teams = data[['HomeTeam', 'AwayTeam']].copy()

# Drop every column that is not used as a model input (the target 'FTR' stays).
# `columns=` is used instead of a positional axis argument: pandas 2.0 no longer
# accepts `drop(labels, 1)`.
# Note: a previous revision of this list also named 'HM5' and 'AM5'; they are
# not part of the list that is applied here.
data = data.drop(columns=[
    'Unnamed: 0', 'HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
    'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HomeTeamLP', 'AwayTeamLP', 'DiffPts', 'HTFormPts', 'ATFormPts',
    'HM4', 'AM4', 'HTLossStreak5', 'ATLossStreak5', 'HTWinStreak5', 'ATWinStreak5',
    'HTWinStreak3', 'HTLossStreak3', 'ATWinStreak3', 'ATLossStreak3',
])

## Preparing the Data

In [3]:
# Separate into feature set and target variable.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
X_all = data.drop(columns=['FTR'])
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

# Continuous features to standardise; they are scaled together in one call.
# NOTE(review): the statistics are computed over every row, including the
# matches held out for prediction later -- confirm this matches how the saved
# classifier's training data was scaled.
cols = ['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffLP']
X_all[cols] = scale(X_all[cols])

In [4]:
# Cast the home/away recent-result columns to strings so that
# preprocess_features() picks them up as categorical columns.
for form_col in ['HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3']:
    X_all[form_col] = X_all[form_col].astype('str')

def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables.

    Every column of `X` whose dtype is `object` (or a pandas string dtype) is
    replaced by its `pd.get_dummies` expansion, named `<column>_<value>`.
    All other columns are copied through unchanged, in their original order.

    Returns a new DataFrame sharing `X`'s index; `X` itself is not modified.
    '''

    # Initialize new output DataFrame on the same index so joins stay row-aligned
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data.
    # `DataFrame.items()` replaces `iteritems()`, which was removed in pandas 2.0.
    for col, col_data in X.items():

        # Text columns are categorical. Newer pandas may store strings with a
        # dedicated string dtype rather than `object`, so accept both.
        is_categorical = (col_data.dtype == object
                          or pd.api.types.is_string_dtype(col_data.dtype))
        if is_categorical:
            col_data = pd.get_dummies(col_data, prefix = col)

        # Collect the revised columns
        output = output.join(col_data)

    return output

# Expand the categorical columns into dummies, then list the resulting features.
X_all = preprocess_features(X_all)

feature_names = list(X_all.columns)
print( "Processed feature columns ({} total features):\n{}".format(len(feature_names), feature_names) )

Processed feature columns (24 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


## Training and Evaluating Models

In [5]:
from sklearn.metrics import f1_score
   
def predict_labels(clf, features, target):
    ''' Predicts with an already-fitted classifier and scores the predictions.

    Prints how long `clf.predict` took, then returns a 3-tuple:
    the F1 score (positive label 'H'), the fraction of predictions that
    equal `target`, and the raw predictions themselves.
    '''

    # Time only the predict call
    t_start = time()
    predicted = clf.predict(features)
    elapsed = time() - t_start

    print("Made predictions in {:.4f} seconds.".format(elapsed) )

    f1 = f1_score(target, predicted, pos_label='H')

    # Accuracy: number of element-wise matches over number of predictions
    n_correct = sum(target == predicted)
    accuracy = n_correct / float(len(predicted))

    return f1, accuracy, predicted

# STEP 1: select the portion of matches to be predicted (the X_test dataframe)

In [6]:
# Number of most recent rows (the tail of the dataset) to hold out for prediction.
N_MATCHES_TO_PREDICT = 10

# Guard the negative-index slices below: with 0, `[:-0]` selects nothing and
# `[-0:]` selects every row, which would silently swap the two sets.
assert 0 < N_MATCHES_TO_PREDICT < len(X_all), \
    "N_MATCHES_TO_PREDICT must be between 1 and len(X_all) - 1"

# Positional (.iloc) slicing on both features and target keeps the rows aligned
# regardless of the index labels.
X_train = X_all.iloc[:-N_MATCHES_TO_PREDICT, :]
y_train = y_all.iloc[:-N_MATCHES_TO_PREDICT]
X_test  = X_all.iloc[-N_MATCHES_TO_PREDICT:, :]
y_test  = y_all.iloc[-N_MATCHES_TO_PREDICT:]

# STEP 2: load the previously trained classifier

In [7]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load a classifier file that you created or otherwise trust.
clf = joblib.load('football_classifier.pkl')
print(clf)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.73651805870889131, gamma=0.01,
       learning_rate=0.48237187313753049, max_delta_step=0, max_depth=9,
       min_child_weight=0, missing=nan, n_estimators=76, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=2,
       reg_alpha=8.1959276142102205e-05, reg_lambda=1,
       scale_pos_weight=1.4319919478841214, seed=None, silent=True,
       subsample=1.0)


# STEP 3: make the predictions

In [8]:
# Score the loaded classifier on the earlier matches first, then on the
# held-out ones. `y_pred` (the held-out predictions) is what gets exported.
train_result = predict_labels(clf, X_train, y_train)
f1, acc, y_train_pred = train_result
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(*train_result[:2]))

test_result = predict_labels(clf, X_test, y_test)
f1, acc, y_pred = test_result
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(*test_result[:2]))

Made predictions in 0.0781 seconds.
F1 score and accuracy score for training set: 0.9770 , 0.9787.
Made predictions in 0.0014 seconds.
F1 score and accuracy score for test set: 0.2500 , 0.4000.


# STEP 4: store the predictions into a CSV file

In [9]:
# Team names of the held-out fixtures (the same tail slice used for X_test).
teams_pred = teams.iloc[-N_MATCHES_TO_PREDICT:]

# Give the raw prediction array the fixtures' row labels.
df_y_pred = pd.Series(y_pred, index=teams_pred.index.tolist(), name='Predictions').to_frame()

# One row per fixture: home team, away team, predicted label.
predictions = pd.concat([teams_pred[['HomeTeam', 'AwayTeam']], df_y_pred], axis=1)

print(predictions)

            HomeTeam     AwayTeam Predictions
6500     Bournemouth     Brighton          NH
6501  Crystal Palace  Southampton           H
6502    Huddersfield    Leicester          NH
6503       Liverpool      Burnley          NH
6504       Newcastle        Stoke           H
6505         Watford     Man City          NH
6506       West Brom     West Ham           H
6507       Tottenham      Swansea           H
6508         Chelsea      Arsenal           H
6509      Man United      Everton           H


In [10]:
# Persist the fixture/prediction table as a comma-separated file in the working directory.
predictions.to_csv(path_or_buf='predictions.csv', sep=',')