# Imports

In [399]:
import pandas as pd
import numpy as np

# Data preprocessing

In [400]:
results = pd.read_csv('csv_data/results.csv')

In [401]:
results.head(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,14/08/10,Aston Villa,West Ham,3.0,0.0,H,2.0,0.0,H,...,11.0,2.0,15.0,15.0,16.0,7.0,1.0,2.0,0.0,0.0
1,E0,14/08/10,Blackburn,Everton,1.0,0.0,H,1.0,0.0,H,...,2.0,12.0,19.0,14.0,1.0,3.0,2.0,1.0,0.0,0.0
2,E0,14/08/10,Bolton,Fulham,0.0,0.0,D,0.0,0.0,D,...,9.0,7.0,12.0,13.0,4.0,8.0,1.0,3.0,0.0,0.0
3,E0,14/08/10,Chelsea,West Brom,6.0,0.0,H,2.0,0.0,H,...,13.0,4.0,10.0,10.0,3.0,1.0,1.0,0.0,0.0,0.0
4,E0,14/08/10,Sunderland,Birmingham,2.0,2.0,D,1.0,0.0,H,...,2.0,7.0,13.0,10.0,3.0,6.0,3.0,3.0,1.0,0.0
5,E0,14/08/10,Tottenham,Man City,0.0,0.0,D,0.0,0.0,D,...,18.0,7.0,13.0,16.0,10.0,3.0,0.0,2.0,0.0,0.0
6,E0,14/08/10,Wigan,Blackpool,0.0,4.0,A,0.0,3.0,A,...,6.0,7.0,8.0,11.0,6.0,4.0,1.0,1.0,0.0,0.0
7,E0,14/08/10,Wolves,Stoke,2.0,1.0,H,2.0,0.0,H,...,7.0,6.0,17.0,13.0,5.0,5.0,0.0,2.0,0.0,0.0
8,E0,15/08/10,Liverpool,Arsenal,1.0,1.0,D,0.0,0.0,D,...,4.0,7.0,13.0,15.0,9.0,11.0,1.0,3.0,1.0,1.0
9,E0,16/08/10,Man United,Newcastle,3.0,0.0,H,2.0,0.0,H,...,10.0,3.0,9.0,5.0,5.0,3.0,2.0,2.0,0.0,0.0


In [402]:
results.loc[results['Div'] == 'E0', 'Div'] = 'E1'    # first English division should be called 'E1' instead of 'E0'

## Handling missing values

In [403]:
results[results['HY'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1900,,,,,,,,,,,...,,,,,,,,,,
6969,I1,23/09/12,Cagliari,Roma,0.0,3.0,A,,,,...,,,,,,,,,,
7315,,,,,,,,,,,...,,,,,,,,,,
7316,,,,,,,,,,,...,,,,,,,,,,
7317,,,,,,,,,,,...,,,,,,,,,,
7945,I1,02/03/15,Roma,Juventus,1.0,1.0,D,0.0,0.0,D,...,3.0,1.0,11.0,16.0,1.0,5.0,,5.0,1.0,0.0
8078,,,,,,,,,,,...,,,,,,,,,,
8459,,,,,,,,,,,...,,,,,,,,,,


In [404]:
results = results.dropna(axis='index', how='all')  # drop all rows that only contain NaN values

In [405]:
results.loc[7945, 'HY'] = 6    # add the missing value by hand

In [406]:
results.drop(labels=6969, axis='index', inplace=True)  # remove row that misses too many values

In [407]:
results[results['HTHG'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
8477,I1,28/08/16,Sassuolo,Pescara,0.0,3.0,A,,,,...,3.0,5.0,12.0,21.0,2.0,7.0,2.0,2.0,0.0,0.0


In [408]:
results.drop(labels=8477, axis='index', inplace=True)  # remove game with non-representative result

In [409]:
# all missing values have been dealt with by this point
for col in results.columns:
    print(col, len(results[results[col].isnull()]))

Div 0
Date 0
HomeTeam 0
AwayTeam 0
FTHG 0
FTAG 0
FTR 0
HTHG 0
HTAG 0
HTR 0
HS 0
AS 0
HST 0
AST 0
HF 0
AF 0
HC 0
AC 0
HY 0
AY 0
HR 0
AR 0


## Datatypes

In [410]:
# convert float columns to int
for col in results.columns:
    if results[col].dtype == np.float64:
        results[col] = results[col].astype(int)

In [411]:
results.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E1,14/08/10,Aston Villa,West Ham,3,0,H,2,0,H,...,11,2,15,15,16,7,1,2,0,0
1,E1,14/08/10,Blackburn,Everton,1,0,H,1,0,H,...,2,12,19,14,1,3,2,1,0,0
2,E1,14/08/10,Bolton,Fulham,0,0,D,0,0,D,...,9,7,12,13,4,8,1,3,0,0
3,E1,14/08/10,Chelsea,West Brom,6,0,H,2,0,H,...,13,4,10,10,3,1,1,0,0,0
4,E1,14/08/10,Sunderland,Birmingham,2,2,D,1,0,H,...,2,7,13,10,3,6,3,3,1,0


In [412]:
# convert Date column to proper datetime format
results['Date'] = pd.to_datetime(results['Date']) #.dt.strftime('%d/%m/%Y')    

In [413]:
results.sample(5)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
795,E1,2012-09-15,Stoke,Man City,1,1,D,1,1,D,...,3,11,12,12,6,5,0,1,0,0
10353,SP1,2012-05-13,Getafe,Zaragoza,0,2,A,0,0,D,...,0,4,22,18,0,6,2,4,3,1
9520,I1,2019-06-04,Parma,Torino,0,0,D,0,0,D,...,1,4,20,21,3,4,4,5,0,0
11921,SP1,2016-09-20,Sevilla,Betis,1,0,H,0,0,D,...,3,3,24,17,6,2,5,4,0,0
2535,E1,2017-02-25,Watford,West Ham,1,1,D,1,0,H,...,1,2,18,14,4,6,4,1,0,1


In [414]:
results.dtypes

Div                 object
Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG                 int32
FTAG                 int32
FTR                 object
HTHG                 int32
HTAG                 int32
HTR                 object
HS                   int32
AS                   int32
HST                  int32
AST                  int32
HF                   int32
AF                   int32
HC                   int32
AC                   int32
HY                   int32
AY                   int32
HR                   int32
AR                   int32
dtype: object

Preprocessing should be finished now.