# Imports

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Data preprocessing

In [2]:
results = pd.read_csv('csv_data/results.csv')

In [3]:
results.head(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,14/08/10,Aston Villa,West Ham,3.0,0.0,H,2.0,0.0,H,...,11.0,2.0,15.0,15.0,16.0,7.0,1.0,2.0,0.0,0.0
1,E0,14/08/10,Blackburn,Everton,1.0,0.0,H,1.0,0.0,H,...,2.0,12.0,19.0,14.0,1.0,3.0,2.0,1.0,0.0,0.0
2,E0,14/08/10,Bolton,Fulham,0.0,0.0,D,0.0,0.0,D,...,9.0,7.0,12.0,13.0,4.0,8.0,1.0,3.0,0.0,0.0
3,E0,14/08/10,Chelsea,West Brom,6.0,0.0,H,2.0,0.0,H,...,13.0,4.0,10.0,10.0,3.0,1.0,1.0,0.0,0.0,0.0
4,E0,14/08/10,Sunderland,Birmingham,2.0,2.0,D,1.0,0.0,H,...,2.0,7.0,13.0,10.0,3.0,6.0,3.0,3.0,1.0,0.0
5,E0,14/08/10,Tottenham,Man City,0.0,0.0,D,0.0,0.0,D,...,18.0,7.0,13.0,16.0,10.0,3.0,0.0,2.0,0.0,0.0
6,E0,14/08/10,Wigan,Blackpool,0.0,4.0,A,0.0,3.0,A,...,6.0,7.0,8.0,11.0,6.0,4.0,1.0,1.0,0.0,0.0
7,E0,14/08/10,Wolves,Stoke,2.0,1.0,H,2.0,0.0,H,...,7.0,6.0,17.0,13.0,5.0,5.0,0.0,2.0,0.0,0.0
8,E0,15/08/10,Liverpool,Arsenal,1.0,1.0,D,0.0,0.0,D,...,4.0,7.0,13.0,15.0,9.0,11.0,1.0,3.0,1.0,1.0
9,E0,16/08/10,Man United,Newcastle,3.0,0.0,H,2.0,0.0,H,...,10.0,3.0,9.0,5.0,5.0,3.0,2.0,2.0,0.0,0.0


In [4]:
results.loc[results['Div'] == 'E0', 'Div'] = 'E1'    # first English division should be called 'E1' instead of 'E0'

## Handling missing values

In [5]:
results[results['HY'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1900,,,,,,,,,,,...,,,,,,,,,,
6969,I1,23/09/12,Cagliari,Roma,0.0,3.0,A,,,,...,,,,,,,,,,
7315,,,,,,,,,,,...,,,,,,,,,,
7316,,,,,,,,,,,...,,,,,,,,,,
7317,,,,,,,,,,,...,,,,,,,,,,
7945,I1,02/03/15,Roma,Juventus,1.0,1.0,D,0.0,0.0,D,...,3.0,1.0,11.0,16.0,1.0,5.0,,5.0,1.0,0.0
8078,,,,,,,,,,,...,,,,,,,,,,
8459,,,,,,,,,,,...,,,,,,,,,,


In [6]:
results = results.dropna(axis='index', how='all')  # drop all rows that only contain NaN values

In [7]:
results.loc[7945, 'HY'] = 6    # add the missing value by hand

In [8]:
results.drop(labels=6969, axis='index', inplace=True)  # remove row that misses too many values

In [9]:
results[results['HTHG'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
8477,I1,28/08/16,Sassuolo,Pescara,0.0,3.0,A,,,,...,3.0,5.0,12.0,21.0,2.0,7.0,2.0,2.0,0.0,0.0


In [10]:
results.drop(labels=8477, axis='index', inplace=True)  # remove game with non-representative result

In [11]:
# all missing values have been dealt with by this point
for col in results.columns:
    print(col, len(results[results[col].isnull()]))

Div 0
Date 0
HomeTeam 0
AwayTeam 0
FTHG 0
FTAG 0
FTR 0
HTHG 0
HTAG 0
HTR 0
HS 0
AS 0
HST 0
AST 0
HF 0
AF 0
HC 0
AC 0
HY 0
AY 0
HR 0
AR 0


## Datatypes

In [12]:
# convert float columns to int
for col in results.columns:
    if results[col].dtype == np.float64:
        results[col] = results[col].astype(int)

In [13]:
results.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E1,14/08/10,Aston Villa,West Ham,3,0,H,2,0,H,...,11,2,15,15,16,7,1,2,0,0
1,E1,14/08/10,Blackburn,Everton,1,0,H,1,0,H,...,2,12,19,14,1,3,2,1,0,0
2,E1,14/08/10,Bolton,Fulham,0,0,D,0,0,D,...,9,7,12,13,4,8,1,3,0,0
3,E1,14/08/10,Chelsea,West Brom,6,0,H,2,0,H,...,13,4,10,10,3,1,1,0,0,0
4,E1,14/08/10,Sunderland,Birmingham,2,2,D,1,0,H,...,2,7,13,10,3,6,3,3,1,0


In [14]:
# convert Date column to proper datetime format
results['Date'] = pd.to_datetime(results['Date']) #.dt.strftime('%d/%m/%Y')    

In [15]:
results.sample(5)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
968,E1,2013-12-01,Aston Villa,Southampton,0,1,A,0,1,A,...,10,4,9,6,9,5,1,1,0,0
11440,SP1,2015-04-24,Cordoba,Ath Bilbao,0,1,A,0,0,D,...,2,4,13,19,9,5,2,3,0,0
8058,I1,2015-05-23,Genoa,Inter,3,2,H,2,2,D,...,7,5,14,10,13,5,3,4,0,0
12274,SP1,2017-08-26,Las Palmas,Ath Madrid,1,5,A,0,2,A,...,4,6,10,9,4,2,4,1,0,0
9930,SP1,2011-04-30,La Coruna,Ath Madrid,0,1,A,0,0,D,...,3,5,16,8,9,6,1,4,1,0


In [16]:
results.dtypes

Div                 object
Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG                 int32
FTAG                 int32
FTR                 object
HTHG                 int32
HTAG                 int32
HTR                 object
HS                   int32
AS                   int32
HST                  int32
AST                  int32
HF                   int32
AF                   int32
HC                   int32
AC                   int32
HY                   int32
AY                   int32
HR                   int32
AR                   int32
dtype: object

In [17]:
# Assign an ID to each team for easier identification
team_ids = defaultdict()

assert len(results['HomeTeam'].unique()) == len(results['AwayTeam'].unique())

teams = results['HomeTeam'].unique()

index = 0
for team in teams:
    team_ids[team] = index
    print(index, team)    
    index += 1



0 Aston Villa
1 Blackburn
2 Bolton
3 Chelsea
4 Sunderland
5 Tottenham
6 Wigan
7 Wolves
8 Liverpool
9 Man United
10 Arsenal
11 Birmingham
12 Everton
13 Stoke
14 West Brom
15 West Ham
16 Fulham
17 Newcastle
18 Man City
19 Blackpool
20 QPR
21 Swansea
22 Norwich
23 Reading
24 Southampton
25 Crystal Palace
26 Hull
27 Cardiff
28 Leicester
29 Burnley
30 Bournemouth
31 Watford
32 Middlesbrough
33 Brighton
34 Huddersfield
35 Bayern Munich
36 FC Koln
37 Freiburg
38 Hamburg
39 Hannover
40 Hoffenheim
41 M'gladbach
42 Dortmund
43 Mainz
44 Kaiserslautern
45 Ein Frankfurt
46 Nurnberg
47 Schalke 04
48 St Pauli
49 Werder Bremen
50 Wolfsburg
51 Leverkusen
52 Stuttgart
53 Augsburg
54 Hertha
55 Greuther Furth
56 Fortuna Dusseldorf
57 Braunschweig
58 Paderborn
59 Darmstadt
60 Ingolstadt
61 RB Leipzig
62 Roma
63 Udinese
64 Bari
65 Chievo
66 Fiorentina
67 Milan
68 Palermo
69 Parma
70 Sampdoria
71 Bologna
72 Cagliari
73 Cesena
74 Inter
75 Brescia
76 Catania
77 Genoa
78 Juventus
79 Lazio
80 Lecce
81 Napoli
82 

In [18]:
# Convert team name columns to IDs
#results['HomeID'] = results['HomeTeam']
#results['AwayID'] = results['AwayTeam']
results.rename(columns={'HomeTeam': 'HomeID', 'AwayTeam': 'AwayID'}, inplace=True)
results['HomeID'].replace(team_ids, inplace=True)
results['AwayID'].replace(team_ids, inplace=True)

In [19]:
results.sample(6)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
10468,SP1,2012-11-11,101,107,2,4,A,0,3,A,...,3,5,18,6,6,6,1,1,0,0
4211,D1,2013-03-02,51,42,2,3,A,0,2,A,...,8,7,21,16,12,4,5,1,0,0
1039,E1,2013-04-03,0,18,0,1,A,0,1,A,...,5,12,10,14,6,5,2,0,0,0
12681,SP1,2018-09-22,122,125,1,0,H,0,0,D,...,3,2,16,17,5,2,1,1,0,0
4750,D1,2014-11-22,47,50,3,2,H,3,1,H,...,6,2,10,16,2,7,1,1,0,0
5120,D1,2016-01-31,35,40,2,0,H,1,0,H,...,9,2,11,11,10,2,1,1,0,0


Preprocessing should be finished now, therefore we save the obtained data.

In [20]:
results.to_pickle('preprocessed_results.pkl')