# Football UK Data
Preprocessing the Spanish league match data (divisions SP1 and SP2) downloaded from: https://www.football-data.co.uk/spainm.php

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Raise pandas' display limits (rows, columns, total width, cell width) to the
# same large value so wide frames are rendered with little truncation.
display_limit = 12000
for option_name in ('display.max_rows',
                    'display.max_columns',
                    'display.width',
                    'display.max_colwidth'):
    pd.set_option(option_name, display_limit)

In [2]:
# Absolute, symlink-resolved location of the folder holding the raw input files.
# The relative path is resolved against the current working directory.
raw_data_dir = "../data/football_uk_data"
FILE_PATH = os.path.realpath(raw_data_dir)

In [3]:
# Collect the full path of every data file inside FILE_PATH.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. ".ipynb_checkpoints", ".DS_Store"). Keep only
# visible regular files and sort them, so the files are always loaded in the
# same order and nothing unreadable is handed to the CSV reader.
files = sorted(
    os.path.join(FILE_PATH, entry)
    for entry in os.listdir(FILE_PATH)
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(FILE_PATH, entry))
)

In [4]:
# Subset of the raw columns carried through preprocessing, in output order.
cols_to_keep = [
    "Div", "Date", "Time",
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",
    "HTHG", "HTAG", "HTR",
    "HS", "AS",
    "HST", "AST",
    "HC", "AC",
    "HF", "AF",
    "HY", "AY",
    "HR", "AR",
]

In [5]:
# Read each input file into its own frame, then stack them into a single frame
# with a fresh 0..n-1 index.
season_frames = list(map(pd.read_csv, files))
match_data = pd.concat(season_frames, ignore_index=True)

In [6]:
# Preview the retained columns of the combined frame.
match_data.loc[:, cols_to_keep]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,SP1,09/09/00,,Barcelona,Malaga,2.0,1.0,H,2.0,0.0,H,,,,,,,,,,,,
1,SP1,09/09/00,,La Coruna,Ath Bilbao,2.0,0.0,H,0.0,0.0,D,,,,,,,,,,,,
2,SP1,09/09/00,,Real Madrid,Valencia,2.0,1.0,H,0.0,0.0,D,,,,,,,,,,,,
3,SP1,09/09/00,,Sociedad,Santander,2.0,2.0,D,0.0,0.0,D,,,,,,,,,,,,
4,SP1,09/09/00,,Zaragoza,Espanol,1.0,2.0,A,0.0,0.0,D,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18522,SP2,29/05/2022,19:00,Leganes,Almeria,2.0,2.0,D,2.0,1.0,H,11.0,21.0,3.0,6.0,2.0,9.0,12.0,17.0,5.0,2.0,0.0,0.0
18523,SP2,29/05/2022,19:00,Oviedo,Ibiza,3.0,2.0,H,1.0,0.0,H,15.0,15.0,10.0,6.0,2.0,6.0,9.0,16.0,0.0,4.0,0.0,0.0
18524,SP2,29/05/2022,19:00,Sp Gijon,Las Palmas,0.0,1.0,A,0.0,1.0,A,4.0,11.0,1.0,4.0,1.0,6.0,19.0,11.0,2.0,0.0,0.0,0.0
18525,SP2,29/05/2022,19:00,Tenerife,Cartagena,1.0,2.0,A,0.0,2.0,A,15.0,6.0,6.0,5.0,4.0,1.0,17.0,16.0,5.0,1.0,0.0,0.0


In [7]:
# match_data[cols_to_keep].info()

In [8]:
# Update our datetime values.
# The raw files store dates day-first, with two-digit years in older rows
# ("09/09/00") and four-digit years in newer ones ("29/05/2022"). Letting pandas
# guess the layout reads ambiguous values such as "10/09/00" month-first (or
# fails outright on mixed layouts in newer pandas), so every supported layout is
# parsed with an explicit format and the results are merged.
date_column = match_data['Date']
if pd.api.types.is_datetime64_any_dtype(date_column):
    # Already parsed (the cell is being re-run); nothing to reinterpret.
    parsed_dates = date_column
else:
    raw_dates = date_column.astype(str).str.strip()
    # The ISO layout is accepted as well so that re-running this cell on its
    # own output is harmless.
    date_layouts = ("%d/%m/%Y", "%d/%m/%y", "%Y-%m-%d")
    candidates = [
        pd.to_datetime(raw_dates, format=layout, errors="coerce")
        for layout in date_layouts
    ]
    parsed_dates = candidates[0].fillna(candidates[1]).fillna(candidates[2])

    # Fail loudly instead of silently turning unknown layouts into missing dates.
    unparsed = date_column.notna() & parsed_dates.isna()
    if unparsed.any():
        examples = date_column[unparsed].unique()[:5].tolist()
        raise ValueError(f"Unrecognised date layout in 'Date' column, e.g. {examples}")

# Store the dates as unambiguous ISO text (missing dates stay missing) so they
# can be concatenated with the Time column and parsed again without guessing.
match_data['Date'] = parsed_dates.dt.strftime("%Y-%m-%d")

In [9]:
# Build a full timestamp from the date text and the Time column, then convert
# the date text itself to datetimes. Rows without a Time value get
# DateTime = NaT because text + missing is missing.
date_text = match_data['Date']
timestamp_text = date_text + ' ' + match_data['Time']
match_data['DateTime'] = pd.to_datetime(timestamp_text)
match_data['Date'] = pd.to_datetime(date_text)

In [10]:
# Final column selection: the combined DateTime column is placed right after
# Div, followed by the same columns as the raw selection.
cols_to_keep = ["Div", "DateTime"] + [
    "Date", "Time",
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",
    "HTHG", "HTAG", "HTR",
    "HS", "AS",
    "HST", "AST",
    "HC", "AC",
    "HF", "AF",
    "HY", "AY",
    "HR", "AR",
]

In [11]:
# Preview the final column selection, including the new DateTime column.
match_data.loc[:, cols_to_keep]

Unnamed: 0,Div,DateTime,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
0,SP1,NaT,2000-09-09,,Barcelona,Malaga,2.0,1.0,H,2.0,0.0,H,,,,,,,,,,,,
1,SP1,NaT,2000-09-09,,La Coruna,Ath Bilbao,2.0,0.0,H,0.0,0.0,D,,,,,,,,,,,,
2,SP1,NaT,2000-09-09,,Real Madrid,Valencia,2.0,1.0,H,0.0,0.0,D,,,,,,,,,,,,
3,SP1,NaT,2000-09-09,,Sociedad,Santander,2.0,2.0,D,0.0,0.0,D,,,,,,,,,,,,
4,SP1,NaT,2000-09-09,,Zaragoza,Espanol,1.0,2.0,A,0.0,0.0,D,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18522,SP2,2022-05-29 19:00:00,2022-05-29,19:00,Leganes,Almeria,2.0,2.0,D,2.0,1.0,H,11.0,21.0,3.0,6.0,2.0,9.0,12.0,17.0,5.0,2.0,0.0,0.0
18523,SP2,2022-05-29 19:00:00,2022-05-29,19:00,Oviedo,Ibiza,3.0,2.0,H,1.0,0.0,H,15.0,15.0,10.0,6.0,2.0,6.0,9.0,16.0,0.0,4.0,0.0,0.0
18524,SP2,2022-05-29 19:00:00,2022-05-29,19:00,Sp Gijon,Las Palmas,0.0,1.0,A,0.0,1.0,A,4.0,11.0,1.0,4.0,1.0,6.0,19.0,11.0,2.0,0.0,0.0,0.0
18525,SP2,2022-05-29 19:00:00,2022-05-29,19:00,Tenerife,Cartagena,1.0,2.0,A,0.0,2.0,A,15.0,6.0,6.0,5.0,4.0,1.0,17.0,16.0,5.0,1.0,0.0,0.0


In [12]:
# Keep only the selected columns, in the order listed in cols_to_keep.
match_data_clean = match_data.loc[:, cols_to_keep]


In [13]:
# Persist the cleaned frame as a pickle.
# The output folder is created first, so the write does not fail on a fresh
# checkout where the directory does not exist yet.
output_dir = os.path.realpath("../data/preprocessed/football_uk_data_preprocess")
os.makedirs(output_dir, exist_ok=True)
match_data_clean.to_pickle(os.path.join(output_dir, "match_data_clean.pkl"))