In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
#Load the historical seasons used for training and the 2018-19 season that we want to predict
train = pd.read_csv("seasons-1314-1718.csv")
predict = pd.read_csv("season-1819_csv.csv")
#Encode the match full time result (FTR) column with numerical values, where
# 1: home team win
# 2: away team win
# 0: draw
#Series.map is used instead of DataFrame.replace: replace only produced an integer column through
#pandas' implicit object -> int downcast, which recent pandas versions deprecate. Without that
#downcast FTR would stay an object column and could no longer be used as a numeric target.
#Note that map turns any value other than "H", "A" or "D" into NaN instead of leaving it untouched.
result_codes = {"H": 1, "A": 2, "D": 0}
train = train.assign(FTR=train["FTR"].map(result_codes))
predict = predict.assign(FTR=predict["FTR"].map(result_codes))

In [3]:
#Empty standings table for the 2018-19 Premier League season: one row per team (every team
#that appears as a home team), with all statistics starting at zero
teams = predict["HomeTeam"].unique()
zeroes = np.zeros(teams.size, dtype=int)
stat_names = ["Points", "Goal difference", "Goals for", "Goals against"]
predicted_table = pd.DataFrame({"Team": teams, **{stat: zeroes for stat in stat_names}})

In [4]:
#Columns that are irrelevant for predicting match results (for example "Div" and "Date")
#and columns that contain data that is not available before the match and therefore
#can not be used for predicting the outcome of a match.
unused_columns = ['Div','Date','HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR']
#Columns that the model has to predict:
#1. "Full time home goals" (FTHG)
#2. "Full time away goals" (FTAG)
#3. "Full time result" (FTR)
target_columns = ["FTHG", "FTAG", "FTR"]

#drop() returns a new frame, so the loaded train/predict tables stay untouched.
#dropna(axis='columns') then removes columns that do not have data for every match
#(for example there can be some betting odds that are not calculated each season).
train_copy = train.drop(columns=unused_columns).dropna(axis='columns')
predict_copy = predict.drop(columns=unused_columns).dropna(axis='columns')
y_train = train[target_columns]


def encode_features(matches):
    """Return the feature table for ``matches``.

    Every text (object dtype) column except the targets is one-hot encoded with
    ``pd.get_dummies`` and the target columns are removed from the result.
    """
    text_columns = matches.select_dtypes(include=['object']).columns.difference(target_columns)
    return pd.get_dummies(matches, columns=text_columns).drop(columns=target_columns)


X_train = encode_features(train_copy)
X_predict = encode_features(predict_copy)
#Keep only the columns that exist in both tables and select them through the SAME ordered list.
#Filtering each table on its own would leave every table in its own column order; the model
#needs the prediction features in the same order as the features it was trained on.
shared_columns = [col for col in X_train.columns if col in X_predict.columns]
X_train = X_train[shared_columns]
X_predict = X_predict[shared_columns]
#Random forest with the parameters that we found to be the most effective on test data.
#fit() returns the fitted estimator, so the model is created and trained in one statement.
model = RandomForestClassifier(
    n_estimators=150,
    criterion="gini",
    max_depth=5,
    random_state=0,
).fit(X_train, y_train)
#y_train has the three columns FTHG, FTAG and FTR, so every predicted row
#holds three values in that same order
prediction = model.predict(X_predict)
#Insert predicted data into final results table.
#Every predicted row is [home goals, away goals, result] with result 0: draw, 1: home win,
#anything else: away win.
#The season totals are collected in plain dicts and then written into the table with
#assignments (not "+="), so running this cell again produces the same table instead of
#adding the whole season on top of the totals from the previous run.
points = {}
goals_for = {}
goals_against = {}
for home, away, (home_goals, away_goals, result) in zip(predict["HomeTeam"], predict["AwayTeam"], prediction):
    if result == 0:
        #Draw: 1 point for both teams
        home_points, away_points = 1, 1
    elif result == 1:
        #Home win: 3 points for the home team
        home_points, away_points = 3, 0
    else:
        #Away win: 3 points for the away team
        home_points, away_points = 0, 3
    points[home] = points.get(home, 0) + home_points
    points[away] = points.get(away, 0) + away_points
    #Goals of the home team count "for" the home team and "against" the away team
    goals_for[home] = goals_for.get(home, 0) + home_goals
    goals_against[away] = goals_against.get(away, 0) + home_goals
    #Goals of the away team count "for" the away team and "against" the home team
    goals_for[away] = goals_for.get(away, 0) + away_goals
    goals_against[home] = goals_against.get(home, 0) + away_goals

#Write the totals into the table; teams without any match keep 0 and teams that are
#not listed in the table are ignored
table_teams = predicted_table["Team"]
predicted_table["Points"] = [points.get(team, 0) for team in table_teams]
predicted_table["Goals for"] = [goals_for.get(team, 0) for team in table_teams]
predicted_table["Goals against"] = [goals_against.get(team, 0) for team in table_teams]
#Goal difference is goals scored minus goals conceded
predicted_table["Goal difference"] = predicted_table["Goals for"] - predicted_table["Goals against"]
#Sort table first by "Points" column and then by "Goal difference" column to reflect final standings for the predicted football season
predicted_table = predicted_table.sort_values(by=["Points", "Goal difference"], ascending=False)
#Change index to start from 1 instead of 0, to make positions easier to read
predicted_table.index = np.arange(1, len(predicted_table)+1)

In [5]:
#Display the predicted final standings (sorted by points, then by goal difference)
predicted_table

Unnamed: 0,Team,Points,Goal difference,Goals for,Goals against
1,Liverpool,111,58,69,11
2,Man City,111,49,52,3
3,Chelsea,96,41,57,16
4,Tottenham,93,27,49,22
5,Man United,90,30,52,22
6,Arsenal,90,25,50,25
7,Wolves,63,-3,27,30
8,Leicester,60,-2,29,31
9,Everton,57,3,32,29
10,West Ham,48,-10,23,33


In [6]:
#Other tested learning algorithms that worked pretty well, 
#but performed slightly worse than RandomForestClassifier on test data:
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#Alternative classifiers, kept for reference only; they are only constructed here, not fitted
model_1 = KNeighborsClassifier(n_neighbors=15)
#NOTE(review): min_samples_leaf=500 looks large if the training data is only a few seasons
#of matches - confirm that this value is intended
model_2 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=500,
)
model_3 = BernoulliNB(alpha=11)