<a href="https://colab.research.google.com/github/mndore/football_regression/blob/main/football_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reading the data from FiveThirtyEight

In [2]:
import pandas as pd
import numpy as np
full_data = pd.read_csv('https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv')
full_data.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,0.2844,1.39,1.05,,,2.0,0.0,,,,,,
1,2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,0.2819,1.27,1.28,,,2.0,0.0,,,,,,
2,2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,0.2714,1.53,1.03,,,1.0,1.0,,,,,,
3,2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,0.3013,1.27,0.94,,,0.0,0.0,,,,,,
4,2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,0.2719,1.45,1.24,,,1.0,2.0,,,,,,


Remove rows with missing values (i.e. matches in the future) and fill missing values with mean

In [3]:
from datetime import date

today = date.today()
full_data['importance1'].fillna((full_data['importance1'].mean()), inplace=True)
full_data['importance2'].fillna((full_data['importance2'].mean()), inplace=True)
data = full_data.query('date < "'+today.strftime("%Y-%m-%d")+'"')
data = data.fillna(data.mean())

**Overview** of the dataset


In [4]:
data.describe()

Unnamed: 0,season,league_id,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
count,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0,37231.0
mean,2018.200693,2186.122505,44.909883,44.865246,0.448905,0.298957,0.252138,1.520272,1.167155,31.686656,30.95931,1.521569,1.186152,1.500421,1.176356,1.405366,1.137452,1.538834,1.19831
std,1.087773,910.210923,18.860951,18.884358,0.160085,0.144712,0.048868,0.432875,0.426209,24.051642,23.744748,1.272865,1.133818,0.602881,0.535968,0.473437,0.417129,0.902235,0.818799
min,2016.0,1818.0,3.88,4.04,0.0271,0.0032,0.0,0.25,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,1849.0,31.21,31.18,0.3474,0.2043,0.2336,1.24,0.9,14.6,14.0,1.0,0.0,1.33,1.01,1.28,1.02,1.05,1.05
50%,2018.0,1874.0,42.84,42.75,0.4401,0.2828,0.2604,1.46,1.12,31.716133,30.997875,1.0,1.0,1.500421,1.176356,1.405366,1.137452,1.538834,1.19831
75%,2019.0,2160.0,57.97,57.94,0.5392,0.3744,0.2817,1.73,1.38,41.7,40.7,2.0,2.0,1.500421,1.176356,1.405366,1.137452,1.538834,1.19831
max,2020.0,9541.0,96.57,96.78,0.9775,0.8992,0.4537,4.9,4.01,100.0,100.0,11.0,13.0,7.07,8.27,6.89,7.17,9.15,11.05


Combine data with already processed data. Calculating the added features (see below) takes up a lot of time, therefore I saved a .csv file containing these features so they only have to be generated for new datapoints.

In [5]:
generated_features = pd.read_csv('generated_features.csv')

data['goalsLastThree1'] = generated_features['goalsLastThree1']
data['goalsLastThree2'] = generated_features['goalsLastThree2']

data['averageScore1'] = generated_features['averageScore1']
data['averageScore2'] = generated_features['averageScore2']
print('Done')

Done


Add features
1. Average amount of goals over last 3 games in that league
   - goalsLastThree1
   - goalsLastThree2
2. Average amount of goals scored against that opponent
   - averageScore1
   - averageScore2

In [6]:
new_features = ['averageScore1','averageScore2','goalsLastThree1','goalsLastThree2'] 

def goalsLastThree(data, team, gameDate, league):
    tempData = data.query('date < "' + gameDate + '" and (team1 == "'+str(team)+'" or team2 == "'+str(team)+'") and league_id == "'+str(league)+'"')
    totalScore = 0
    if len(tempData.index) >= 3:
        tempData = tempData[-3:]
        for index, game in tempData.iterrows():
            if game['team1'] == team:
                totalScore += game['score1']
            else:
                totalScore += game['score2']
        totalScore/3
    return totalScore

def averageScore(data,team1,team2,gameDate,league):
  tgoals = data.query('team1 == "'+team1+'" and team2 == "'+team2+'"')
  return (tgoals['score1'].mean(), tgoals['score2'].mean())

for col in new_features:
    subsetData = data[data[col].isnull()]
    for index, row in subsetData.iterrows():
        game = subsetData.loc[index]
        if col == "goalsLastThree1":
            data.at[index,'goalsLastThree1'] = goalsLastThree(data,game['team1'],game['date'],game['league_id'])
        if col == "goalsLastThree2":
            data.at[index,'goalsLastThree2'] = goalsLastThree(data,game['team2'],game['date'],game['league_id'])
        if col == "averageScore1":
            avg1, _ = averageScore(data,game['team1'],game['team2'],game['date'],game['league_id'])
            data.at[index,'averageScore1'] = avg1
        if col == "averageScore2":
            _, avg2 = averageScore(data,game['team1'],game['team2'],game['date'],game['league_id'])
            data.at[index,'averageScore2'] = avg2
data.to_csv('generated_features.csv')
print('Done')

Done


In [7]:
categorical_features = ['team1',
 'team2']
numerical_features = ['spi1', 'spi2', 'prob1', 'prob2','probtie',
 'proj_score1',
 'proj_score2',
 'importance1',
 'importance2'] + new_features

In [8]:
# Separate features and labels
X, y = data, np.array(list(map(list,zip(data['score1'],data['score2']))))
print('Features:',X[:10], '\nLabels:', y[:10], sep='\n')

Features:
   season        date  league_id  ... goalsLastThree2 averageScore1 averageScore2
0    2016  2016-07-09       7921  ...             0.0          0.50          0.50
1    2016  2016-07-10       7921  ...             0.0          2.00          2.00
2    2016  2016-07-10       7921  ...             0.0          1.75          1.75
3    2016  2016-07-16       7921  ...             0.0          0.00          0.00
4    2016  2016-07-17       7921  ...             0.0          1.50          1.50
5    2016  2016-07-24       7921  ...             0.0          0.80          0.80
6    2016  2016-07-24       7921  ...             0.0          1.00          1.00
7    2016  2016-07-31       7921  ...             1.0          1.00          1.00
8    2016  2016-07-31       7921  ...             0.0          2.50          2.50
9    2016  2016-08-03       7921  ...             0.0          1.40          1.40

[10 rows x 27 columns]

Labels:
[[2. 0.]
 [2. 0.]
 [1. 1.]
 [0. 0.]
 [1. 2.]
 [1. 1.]
 

In [9]:
from sklearn.model_selection import train_test_split

# Split data 70%-30% into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
print ('Training Set: %d, rows\nTest Set: %d rows' % (X_train.size, X_test.size))

Training Set: 703647, rows
Test Set: 301590 rows


In [10]:
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(max_iter=1000,hidden_layer_sizes = (8,7,6)).fit(X_train[numerical_features].values, y_train)
regr

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(8, 7, 6), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [11]:
X_test[:5]

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,goalsLastThree1,goalsLastThree2,averageScore1,averageScore2
33192,2020,2020-07-27,1951,Major League Soccer,Seattle Sounders FC,Los Angeles FC,44.21,56.86,0.3706,0.6294,0.0,1.4,1.93,100.0,100.0,1.0,4.0,1.67,4.5,0.99,2.41,1.05,3.4,4.0,11.0,1.833333,1.833333
34472,2020,2020-09-20,1859,Norwegian Tippeligaen,SK Brann,Bodo/Glimt,23.76,56.99,0.1715,0.6448,0.1837,1.24,2.5,6.5,10.1,1.0,3.0,1.500421,1.176356,1.405366,1.137452,1.538834,1.19831,4.0,4.0,0.4,0.4
34389,2020,2020-09-19,1952,Mexican Primera Division Torneo Apertura,Atlas,Pachuca,37.89,49.37,0.3258,0.3753,0.2988,1.09,1.19,36.2,10.1,0.0,1.0,0.56,2.28,0.93,1.12,0.0,0.84,3.0,2.0,4.0,4.0
19033,2018,2019-03-02,1983,South African ABSA Premier League,Golden Arrows,Orlando Pirates,24.66,37.37,0.2251,0.5094,0.2655,1.08,1.74,4.8,86.0,0.0,1.0,1.500421,1.176356,1.405366,1.137452,1.538834,1.19831,3.0,5.0,2.0,2.0
8119,2017,2018-02-22,1820,UEFA Europa League,Atletico Madrid,FC Copenhagen,84.84,54.68,0.8115,0.0273,0.1613,2.08,0.2,100.0,0.0,1.0,0.0,1.15,0.3,2.85,0.84,1.05,0.0,7.0,3.0,0.0,0.0


Calculate the accuracy of predicting the correct winner or draw on the testset

In [12]:
exact = 0
winner = 0
t = 0
def r(l): return list(map(round,l))

for index, prediction in enumerate(regr.predict(X_test[numerical_features])):
    h_pred, a_pred = r(prediction)
    h_real, a_real = [y_test[index][0],y_test[index][1]]
    if h_pred == h_real and a_pred == a_real: exact+=1
    
    if h_pred >= a_pred and h_real >= a_real: winner+=1
    elif h_pred < a_pred and h_real < a_real: winner+=1
    elif h_pred == a_pred and h_real == a_real: winner+=1
    t += 1
    
print("Percentage exact score correct: {0}%".format(100*round(exact/t,2)))
print("Percentage winner correct: {0}%".format(100*round(winner/t,2)))
print("R2 value of MLPregressor: {0}".format(round(regr.score(X_test[numerical_features], y_test),2)))

Percentage exact score correct: 12.0%
Percentage winner correct: 72.0%
R2 value of MLPregressor: 0.13




Predict the outcome of the games of coming weekend


In [13]:
from math import floor
import datetime

this_week = full_data.query('league == "Dutch Eredivisie" & date >= "'+ (date.today()-datetime.timedelta(days=7)).strftime("%Y-%m-%d") +'" &  date < "'+ (date.today() + datetime.timedelta(days=7)).strftime("%Y-%m-%d") +'"')

this_week.loc[:,'goalsLastThree1'] = np.nan
this_week.loc[:,'goalsLastThree2'] = np.nan
this_week.loc[:,'realScore1'] = np.nan
this_week.loc[:,'realScore2'] = np.nan
this_week.loc[:,'correct'] = np.nan

i_list = this_week.index.values.tolist()

for i in i_list:
    this_week.at[i,'goalsLastThree1'] = goalsLastThree(data, this_week.loc[i]['team1'],this_week.loc[i]['date'],this_week.loc[i]['league_id'])
    this_week.at[i,'goalsLastThree2'] = goalsLastThree(data, this_week.loc[i]['team2'],this_week.loc[i]['date'],this_week.loc[i]['league_id'])
    avg1, avg2 = averageScore(data,this_week.loc[i]['team1'],this_week.loc[i]['team2'],this_week.loc[i]['date'],this_week.loc[i]['league_id'])
    this_week.at[i,'averageScore1'] = avg1
    this_week.at[i,'averageScore2'] = avg2
    this_week.at[i, 'realScore1'] = this_week.loc[i]['score1']
    this_week.at[i, 'realScore2'] = this_week.loc[i]['score2']


pred = regr.predict(this_week[numerical_features])
    
for index,(h,a) in enumerate(pred):
    this_week.at[i_list[index],'score1'] = round(h,2)
    this_week.at[i_list[index],'score2'] = round(a,2)
    h_real = this_week.loc[i_list[index]]['realScore1']
    a_real = this_week.loc[i_list[index]]['realScore2']
    if (h_real > a_real and round(h) > round(a)) or (h_real < a_real and round(h) < round(a)) or (h_real == a_real and round(h) == round(a)):
      this_week.at[i_list[index], 'correct'] = 1
this_week[['team1','team2','score1','score2','realScore1','realScore2', 'correct', 'date']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,team1,team2,score1,score2,realScore1,realScore2,correct,date
36974,Emmen,Ajax,1.01,2.5,0.0,5.0,1.0,2020-11-28
36984,ADO Den Haag,Heerenveen,1.41,1.47,1.0,1.0,1.0,2020-11-28
36989,VVV Venlo,PEC Zwolle,1.3,1.31,2.0,2.0,1.0,2020-11-28
37008,Vitesse,Fortuna Sittard,2.22,0.86,2.0,0.0,1.0,2020-11-29
37025,Feyenoord,FC Utrecht,1.54,0.94,1.0,1.0,,2020-11-29
37026,FC Groningen,Willem II,1.12,0.95,1.0,0.0,,2020-11-29
37046,Heracles,AZ,1.26,1.79,1.0,2.0,1.0,2020-11-29
37068,PSV,Sparta,2.07,0.75,1.0,0.0,1.0,2020-11-29
37220,Sparta,Emmen,1.82,1.02,2.0,1.0,1.0,2020-12-04
37301,RKC,VVV Venlo,1.45,1.21,,,,2020-12-05
