# Imports

In [2]:
import pandas as pd
import numpy as np

# Data preprocessing

In [3]:
results = pd.read_csv('csv_data/results.csv')

In [4]:
results.head(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E0,14/08/10,Aston Villa,West Ham,3.0,0.0,H,2.0,0.0,H,...,11.0,2.0,15.0,15.0,16.0,7.0,1.0,2.0,0.0,0.0
1,E0,14/08/10,Blackburn,Everton,1.0,0.0,H,1.0,0.0,H,...,2.0,12.0,19.0,14.0,1.0,3.0,2.0,1.0,0.0,0.0
2,E0,14/08/10,Bolton,Fulham,0.0,0.0,D,0.0,0.0,D,...,9.0,7.0,12.0,13.0,4.0,8.0,1.0,3.0,0.0,0.0
3,E0,14/08/10,Chelsea,West Brom,6.0,0.0,H,2.0,0.0,H,...,13.0,4.0,10.0,10.0,3.0,1.0,1.0,0.0,0.0,0.0
4,E0,14/08/10,Sunderland,Birmingham,2.0,2.0,D,1.0,0.0,H,...,2.0,7.0,13.0,10.0,3.0,6.0,3.0,3.0,1.0,0.0
5,E0,14/08/10,Tottenham,Man City,0.0,0.0,D,0.0,0.0,D,...,18.0,7.0,13.0,16.0,10.0,3.0,0.0,2.0,0.0,0.0
6,E0,14/08/10,Wigan,Blackpool,0.0,4.0,A,0.0,3.0,A,...,6.0,7.0,8.0,11.0,6.0,4.0,1.0,1.0,0.0,0.0
7,E0,14/08/10,Wolves,Stoke,2.0,1.0,H,2.0,0.0,H,...,7.0,6.0,17.0,13.0,5.0,5.0,0.0,2.0,0.0,0.0
8,E0,15/08/10,Liverpool,Arsenal,1.0,1.0,D,0.0,0.0,D,...,4.0,7.0,13.0,15.0,9.0,11.0,1.0,3.0,1.0,1.0
9,E0,16/08/10,Man United,Newcastle,3.0,0.0,H,2.0,0.0,H,...,10.0,3.0,9.0,5.0,5.0,3.0,2.0,2.0,0.0,0.0


In [5]:
results.loc[results['Div'] == 'E0', 'Div'] = 'E1'    # first English division should be called 'E1' instead of 'E0'

## Handling missing values

In [6]:
results[results['HY'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1900,,,,,,,,,,,...,,,,,,,,,,
6969,I1,23/09/12,Cagliari,Roma,0.0,3.0,A,,,,...,,,,,,,,,,
7315,,,,,,,,,,,...,,,,,,,,,,
7316,,,,,,,,,,,...,,,,,,,,,,
7317,,,,,,,,,,,...,,,,,,,,,,
7945,I1,02/03/15,Roma,Juventus,1.0,1.0,D,0.0,0.0,D,...,3.0,1.0,11.0,16.0,1.0,5.0,,5.0,1.0,0.0
8078,,,,,,,,,,,...,,,,,,,,,,
8459,,,,,,,,,,,...,,,,,,,,,,


In [7]:
results = results.dropna(axis='index', how='all')  # drop all rows that only contain NaN values

In [8]:
results.loc[7945, 'HY'] = 6    # add the missing value by hand

In [9]:
results.drop(labels=6969, axis='index', inplace=True)  # remove row that misses too many values

In [10]:
results[results['HTHG'].isnull()]

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
8477,I1,28/08/16,Sassuolo,Pescara,0.0,3.0,A,,,,...,3.0,5.0,12.0,21.0,2.0,7.0,2.0,2.0,0.0,0.0


In [11]:
results.drop(labels=8477, axis='index', inplace=True)  # remove game with non-representative result

In [12]:
# all missing values have been dealt with by this point
for col in results.columns:
    print(col, len(results[results[col].isnull()]))

Div 0
Date 0
HomeTeam 0
AwayTeam 0
FTHG 0
FTAG 0
FTR 0
HTHG 0
HTAG 0
HTR 0
HS 0
AS 0
HST 0
AST 0
HF 0
AF 0
HC 0
AC 0
HY 0
AY 0
HR 0
AR 0


## Datatypes

In [13]:
# convert float columns to int
for col in results.columns:
    if results[col].dtype == np.float64:
        results[col] = results[col].astype(int)

In [14]:
results.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
0,E1,14/08/10,Aston Villa,West Ham,3,0,H,2,0,H,...,11,2,15,15,16,7,1,2,0,0
1,E1,14/08/10,Blackburn,Everton,1,0,H,1,0,H,...,2,12,19,14,1,3,2,1,0,0
2,E1,14/08/10,Bolton,Fulham,0,0,D,0,0,D,...,9,7,12,13,4,8,1,3,0,0
3,E1,14/08/10,Chelsea,West Brom,6,0,H,2,0,H,...,13,4,10,10,3,1,1,0,0,0
4,E1,14/08/10,Sunderland,Birmingham,2,2,D,1,0,H,...,2,7,13,10,3,6,3,3,1,0


In [15]:
# convert Date column to proper datetime format
results['Date'] = pd.to_datetime(results['Date']) #.dt.strftime('%d/%m/%Y')    

In [16]:
results.sample(5)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
6458,I1,2011-03-13,Catania,Sampdoria,1,0,H,0,0,D,...,2,1,24,21,6,0,4,2,0,1
4451,D1,2013-11-23,Ein Frankfurt,Schalke 04,3,3,D,0,2,A,...,8,9,22,20,5,1,4,0,0,0
5383,D1,2016-12-16,Hoffenheim,Dortmund,2,2,D,2,1,H,...,5,4,22,18,2,4,5,1,0,1
2628,E1,2017-06-05,Hull,Sunderland,0,2,A,0,0,D,...,6,5,10,8,7,4,3,1,0,0
434,E1,2011-09-24,Stoke,Man United,1,1,D,0,1,A,...,8,9,10,3,8,8,1,0,0,0


In [17]:
results.dtypes

Div                 object
Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG                 int64
FTAG                 int64
FTR                 object
HTHG                 int64
HTAG                 int64
HTR                 object
HS                   int64
AS                   int64
HST                  int64
AST                  int64
HF                   int64
AF                   int64
HC                   int64
AC                   int64
HY                   int64
AY                   int64
HR                   int64
AR                   int64
dtype: object

Preprocessing should be finished now.

# Data exploration

In [27]:
results[['FTHG', 'FTAG']].groupby(['FTHG', 'FTAG']).count()

FTHG,FTAG
0,0
0,1
0,2
0,3
0,4
0,5
0,6
0,7
0,8
1,0
