In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

<a id='top'></a>
# European Match Stats & Odds DB Creation

## Contents
 - [Imports](#imports)
 - [Parsing From Data Source](#parsing)
 - [Clean and Cast Data](#data_cleaning)
 - [Create Local Sqlite DB](#create_db)

There are various data sources for football match data - the best dataset I have found is [here](https://www.kaggle.com/hugomathien/soccer) on Kaggle - it has:
 - Match result data for all European Leagues
 - Odds data for matches
 - Intra match stats (time of goal, foul, corner etc)
 - Player and Fifa data
 
It's all conveniently available in a sqlite database, however it is 4 years out of date (stops on 2016) and some of the original data sources that the authoer compiled are no longer available

As a result I want to create as complete a dataset as possible that is up to date

My main focus is simple stats visualisation and a clean dataset to act as a playground for prediction

The main underlying data provider for the above is [football-data](https://www.football-data.co.uk/) - a simple but clean website that has most of the data I'm after

Unfortunately we miss out on the intra match stats (exact times), but we still get the half time goal snapshot which is sufficient for me

As a result the aim is to compile a clean and consistent db of that data (probably sqlite) using python

## 1. Imports

[Back to Top](#top)

In [2]:
# mixture of libs for web scraping, parsing and pandas
from bs4 import BeautifulSoup
import datetime as dt
import io
import json
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import sqlite3
from urllib.request import Request, urlopen
import warnings

pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

<a id="parsing"></a>
## 2. Parse Website to get Data Links

[Back to Top](#top)

First, we will parse the football-data website to find the links to the underlying csvs and order them as follows:
 - A dictionary of country and link to the data page
 - A dictionary of country to dictionary of {season: csv link}
 
Once we have all of these we can then create the same data structure but with csvs loaded into pandas

Then we can combine the dataframes based on the shared columns, decide how to deal with nans and then store in a database format (or flat file)

### 2a. Define the Country Dictionary

In [3]:
# define the site root
SITE_ROOT = 'https://www.football-data.co.uk/'
DATA_ROOT = 'https://www.football-data.co.uk/data.php'

In [4]:
# get the page and parse into soup object
req = Request(DATA_ROOT)
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")

In [5]:
# get all the links on the data page
links = []
for link in soup.findAll('a'):
    l = link.get('href')
    if l != None:
        links.append(l)

In [6]:
# now we need to get the list of links that link to pages for data
# this involves parsing the page for the country name
# from inspection these pages end '*m.php'

# get links that end '*m.php' and are not https links
countries = [x[:-5] for x in links if x[-5:] == 'm.php' and 'https' not in x]

In [7]:
'Countries where we have the data: {}'.format(', '.join(countries))

'Countries where we have the data: england, scotland, germany, italy, spain, france, netherlands, belgium, portugal, turkey, greece'

In [8]:
# form the data links and then zip into a dictionary
country_links = [SITE_ROOT+x+'m.php' for x in countries]
country_dict = dict(zip(countries, country_links))
country_dict

{'england': 'https://www.football-data.co.uk/englandm.php',
 'scotland': 'https://www.football-data.co.uk/scotlandm.php',
 'germany': 'https://www.football-data.co.uk/germanym.php',
 'italy': 'https://www.football-data.co.uk/italym.php',
 'spain': 'https://www.football-data.co.uk/spainm.php',
 'france': 'https://www.football-data.co.uk/francem.php',
 'netherlands': 'https://www.football-data.co.uk/netherlandsm.php',
 'belgium': 'https://www.football-data.co.uk/belgiumm.php',
 'portugal': 'https://www.football-data.co.uk/portugalm.php',
 'turkey': 'https://www.football-data.co.uk/turkeym.php',
 'greece': 'https://www.football-data.co.uk/greecem.php'}

### 2b. Define the Individual CSVs

Now we want to scrape the individual country pages to get the csv links, then order them by season and league

In [9]:
all_links = {}

for country,link in country_dict.items():
    # get the page and parse into soup object
    req = Request(link)
    html_page = urlopen(req)
    soup = BeautifulSoup(html_page, "lxml")
    
    # get all the links on the data page
    csv_links = []
    for url_link in soup.findAll('a'):
        # get the label e.g. 'Premier League'
        label = url_link.contents[0]
        # get the link ref e.g. 'mmz4281/2021/E0.csv'
        l = url_link.get('href')
        # if link not null and is a csv then add it
        if l != None and '.csv' in l:
            csv_links.append([label, l])
            
    all_links[country] = csv_links

Now we can query them and store them in dataframe objects to be joined

In [11]:
output_dfs = []

# for each season / league we have a link for
for country, links in all_links.items():
    for s in links:
        # form the query url
        query_url = SITE_ROOT + s[1]
        # format the season e.g. '19/20' into '2019/2020'
        season = s[1].split("/")[-2]
        if (2000+int(season[:2])) > dt.date.today().year:
            season = '19' + season[:2] + '/' + '19' + season[-2:]
        else:
            season = '20' + season[:2] + '/' + '20' + season[-2:]
        print("Querying url: {} for country: {}, league: {} and season: {}".format(query_url, country, s[0], season))
        # query it
        res = requests.get(query_url)
        # if good response, extract
        if res.status_code == 200:
            output = res.content
            df = pd.read_csv(io.StringIO(output.decode('utf-8', errors='ignore')), parse_dates=True, error_bad_lines=False, warn_bad_lines=False)
            # add columns that define the season and league
            df['Country'] = country
            df['League'] = s[0]
            df['Season'] = season
            # add to the list of output dfs
            output_dfs.append(df)

Querying url: https://www.football-data.co.uk/mmz4281/2021/E0.csv for league: Premier League and season: 2020/2021
Querying url: https://www.football-data.co.uk/mmz4281/2021/E1.csv for league: Championship and season: 2020/2021
Querying url: https://www.football-data.co.uk/mmz4281/2021/E2.csv for league: League 1 and season: 2020/2021
Querying url: https://www.football-data.co.uk/mmz4281/2021/E3.csv for league: League 2 and season: 2020/2021
Querying url: https://www.football-data.co.uk/mmz4281/1920/E0.csv for league: Premier League and season: 2019/2020
Querying url: https://www.football-data.co.uk/mmz4281/1920/E1.csv for league: Championship and season: 2019/2020
Querying url: https://www.football-data.co.uk/mmz4281/1920/E2.csv for league: League 1 and season: 2019/2020
Querying url: https://www.football-data.co.uk/mmz4281/1920/E3.csv for league: League 2 and season: 2019/2020
Querying url: https://www.football-data.co.uk/mmz4281/1920/EC.csv for league: Conference and season: 2019/20

In [12]:
# concat the resulting dataframes together
output = pd.concat(output_dfs)

In [13]:
output.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,country,league,season,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,SJH,SJD,SJA,GBH,GBD,GBA,BSH,BSD,BSA,SBH,SBD,SBA,GB>2.5,GB<2.5,GBAHH,GBAHA,GBAH,LBAHH,LBAHA,LBAH,B365AH,SOH,SOD,SOA,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Attendance,HHW,AHW,HO,AO,HBP,ABP,SYH,SYD,SYA,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 53,Unnamed: 54,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 61,Unnamed: 62,LB,LB.1,LB.2,Unnamed: 30,HFKC,AFKC,HT,AT,Unnamed: 28,Unnamed: 29
0,E0,12/09/2020,12:30,Fulham,Arsenal,0.0,3.0,A,0.0,1.0,A,C Kavanagh,5.0,13.0,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6.0,4.33,1.53,5.5,4.25,1.57,6.0,3.9,1.57,6.16,4.51,1.56,6.5,4.2,1.53,6.5,4.2,1.55,6.55,4.55,1.6,5.94,4.34,1.55,1.72,2.1,1.8,2.13,1.84,2.18,1.76,2.1,1.0,1.93,1.97,1.96,1.96,2.0,1.99,1.93,1.95,5.0,4.0,1.66,5.5,4.0,1.62,5.25,3.9,1.67,5.48,3.98,1.69,5.5,3.8,1.65,5.5,3.9,1.67,5.75,4.2,1.71,5.36,3.93,1.67,2.0,1.8,2.06,1.86,2.1,1.92,2.0,1.84,0.75,2.01,1.89,2.02,1.91,2.13,1.92,2.02,1.87,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,E0,12/09/2020,15:00,Crystal Palace,Southampton,1.0,0.0,H,1.0,0.0,H,Jj Moss,5.0,9.0,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,3.1,3.25,2.37,3.0,3.2,2.45,3.15,2.95,2.4,3.32,3.29,2.4,3.2,3.2,2.35,3.2,3.2,2.4,3.36,3.36,2.5,3.18,3.22,2.39,2.2,1.66,2.34,1.68,2.36,1.73,2.24,1.67,0.25,1.85,2.05,1.88,2.05,1.88,2.07,1.84,2.03,3.0,3.25,2.4,3.0,3.3,2.4,3.05,2.9,2.45,3.09,3.27,2.54,3.1,3.1,2.45,3.1,3.25,2.45,3.25,3.33,2.55,3.08,3.22,2.47,2.2,1.66,2.26,1.72,2.27,1.78,2.18,1.7,0.25,1.78,2.13,1.79,2.17,1.85,2.18,1.79,2.12,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,E0,12/09/2020,17:30,Liverpool,Leeds,4.0,3.0,H,3.0,2.0,H,M Oliver,22.0,6.0,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,1.28,6.0,9.5,1.26,6.25,10.5,1.35,5.0,8.5,1.31,6.25,9.92,1.27,6.0,10.0,1.3,5.75,10.5,1.35,6.5,10.75,1.3,5.96,9.68,1.53,2.5,1.56,2.6,1.56,2.68,1.52,2.53,-1.5,1.95,1.95,1.97,1.95,2.0,2.08,1.9,1.97,1.25,6.0,11.0,1.25,6.25,11.0,1.3,6.0,9.0,1.28,6.34,11.38,1.25,6.0,12.0,1.29,6.0,11.5,1.3,6.75,12.27,1.28,6.16,10.63,1.5,2.62,1.51,2.76,1.53,2.82,1.5,2.62,-1.5,1.85,2.05,1.85,2.08,1.9,2.16,1.84,2.04,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E0,12/09/2020,20:00,West Ham,Newcastle,0.0,2.0,A,0.0,0.0,D,S Attwell,15.0,15.0,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,2.15,3.4,3.4,2.15,3.4,3.4,2.15,3.15,3.4,2.18,3.61,3.5,2.15,3.5,3.4,2.15,3.4,3.6,2.24,3.7,3.6,2.15,3.48,3.42,1.9,1.9,2.0,1.91,2.05,1.95,1.97,1.86,-0.5,2.07,1.72,2.17,1.78,2.17,1.81,2.12,1.75,1.95,3.6,3.75,1.95,3.7,3.75,2.05,3.25,3.75,2.04,3.59,3.92,2.0,3.5,3.8,2.0,3.5,3.9,2.07,3.78,3.99,2.01,3.57,3.79,1.9,1.9,2.0,1.92,2.0,2.05,1.91,1.92,-0.5,2.03,1.87,2.04,1.88,2.09,1.91,2.02,1.86,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,E0,13/09/2020,14:00,West Brom,Leicester,0.0,3.0,A,0.0,0.0,D,A Taylor,7.0,13.0,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,3.8,3.6,1.95,3.7,3.6,2.0,3.85,3.2,2.0,4.0,3.59,2.0,3.8,3.6,1.95,4.0,3.5,1.95,4.0,3.82,2.04,3.87,3.57,1.97,1.9,1.9,2.0,1.91,2.02,2.03,1.92,1.9,0.5,1.91,1.99,1.92,2.0,1.93,2.02,1.88,1.97,3.25,3.4,2.2,3.3,3.4,2.2,3.35,3.0,2.3,3.38,3.38,2.32,3.3,3.3,2.25,3.3,3.3,2.3,3.55,3.5,2.38,3.32,3.33,2.28,2.2,1.66,2.23,1.74,2.28,1.82,2.15,1.73,0.25,1.92,1.98,1.93,1.99,1.95,2.01,1.91,1.97,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


<a id='data_cleaning'></a>
## 3. Data Checking and Cleaning

[Back to Top](#top)

Now we have data on around 200k matches we can:
 - Sense check at least some of the major league data
 - Check the column types and data to see if any columns are useless e.g. too incomplete

[Notes](https://www.football-data.co.uk/notes.txt) are provided for the columns on the site where we pull the data from

### 3a. Cleaning

First, we get rid of any column where more than 99% of the values are nulls

In [14]:
df = output[output.columns[output.isnull().mean() < 0.99]]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208687 entries, 0 to 551
Columns: 164 entries, Div to SYA
dtypes: float64(151), object(13)
memory usage: 262.7+ MB


This removes parsing errors ('unnamed: 43' etc) and columns where very little data is available

In [16]:
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,PSCH,PSCD,PSCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,country,league,season,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,SJH,SJD,SJA,GBH,GBD,GBA,BSH,BSD,BSA,SBH,SBD,SBA,GB>2.5,GB<2.5,GBAHH,GBAHA,GBAH,LBAHH,LBAHA,LBAH,B365AH,SOH,SOD,SOA,Attendance,HHW,AHW,HO,AO,HBP,ABP,SYH,SYD,SYA
0,E0,12/09/2020,12:30,Fulham,Arsenal,0.0,3.0,A,0.0,1.0,A,C Kavanagh,5.0,13.0,2.0,6.0,12.0,12.0,2.0,3.0,2.0,2.0,0.0,0.0,6.0,4.33,1.53,5.5,4.25,1.57,6.0,3.9,1.57,6.16,4.51,1.56,6.5,4.2,1.53,6.5,4.2,1.55,6.55,4.55,1.6,5.94,4.34,1.55,1.72,2.1,1.8,2.13,1.84,2.18,1.76,2.1,1.0,1.93,1.97,1.96,1.96,2.0,1.99,1.93,1.95,5.0,4.0,1.66,5.5,4.0,1.62,5.25,3.9,1.67,5.48,3.98,1.69,5.5,3.8,1.65,5.5,3.9,1.67,5.75,4.2,1.71,5.36,3.93,1.67,2.0,1.8,2.06,1.86,2.1,1.92,2.0,1.84,0.75,2.01,1.89,2.02,1.91,2.13,1.92,2.02,1.87,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,E0,12/09/2020,15:00,Crystal Palace,Southampton,1.0,0.0,H,1.0,0.0,H,Jj Moss,5.0,9.0,3.0,5.0,14.0,11.0,7.0,3.0,2.0,1.0,0.0,0.0,3.1,3.25,2.37,3.0,3.2,2.45,3.15,2.95,2.4,3.32,3.29,2.4,3.2,3.2,2.35,3.2,3.2,2.4,3.36,3.36,2.5,3.18,3.22,2.39,2.2,1.66,2.34,1.68,2.36,1.73,2.24,1.67,0.25,1.85,2.05,1.88,2.05,1.88,2.07,1.84,2.03,3.0,3.25,2.4,3.0,3.3,2.4,3.05,2.9,2.45,3.09,3.27,2.54,3.1,3.1,2.45,3.1,3.25,2.45,3.25,3.33,2.55,3.08,3.22,2.47,2.2,1.66,2.26,1.72,2.27,1.78,2.18,1.7,0.25,1.78,2.13,1.79,2.17,1.85,2.18,1.79,2.12,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,E0,12/09/2020,17:30,Liverpool,Leeds,4.0,3.0,H,3.0,2.0,H,M Oliver,22.0,6.0,6.0,3.0,9.0,6.0,9.0,0.0,1.0,0.0,0.0,0.0,1.28,6.0,9.5,1.26,6.25,10.5,1.35,5.0,8.5,1.31,6.25,9.92,1.27,6.0,10.0,1.3,5.75,10.5,1.35,6.5,10.75,1.3,5.96,9.68,1.53,2.5,1.56,2.6,1.56,2.68,1.52,2.53,-1.5,1.95,1.95,1.97,1.95,2.0,2.08,1.9,1.97,1.25,6.0,11.0,1.25,6.25,11.0,1.3,6.0,9.0,1.28,6.34,11.38,1.25,6.0,12.0,1.29,6.0,11.5,1.3,6.75,12.27,1.28,6.16,10.63,1.5,2.62,1.51,2.76,1.53,2.82,1.5,2.62,-1.5,1.85,2.05,1.85,2.08,1.9,2.16,1.84,2.04,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E0,12/09/2020,20:00,West Ham,Newcastle,0.0,2.0,A,0.0,0.0,D,S Attwell,15.0,15.0,3.0,2.0,13.0,7.0,8.0,7.0,2.0,2.0,0.0,0.0,2.15,3.4,3.4,2.15,3.4,3.4,2.15,3.15,3.4,2.18,3.61,3.5,2.15,3.5,3.4,2.15,3.4,3.6,2.24,3.7,3.6,2.15,3.48,3.42,1.9,1.9,2.0,1.91,2.05,1.95,1.97,1.86,-0.5,2.07,1.72,2.17,1.78,2.17,1.81,2.12,1.75,1.95,3.6,3.75,1.95,3.7,3.75,2.05,3.25,3.75,2.04,3.59,3.92,2.0,3.5,3.8,2.0,3.5,3.9,2.07,3.78,3.99,2.01,3.57,3.79,1.9,1.9,2.0,1.92,2.0,2.05,1.91,1.92,-0.5,2.03,1.87,2.04,1.88,2.09,1.91,2.02,1.86,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,E0,13/09/2020,14:00,West Brom,Leicester,0.0,3.0,A,0.0,0.0,D,A Taylor,7.0,13.0,1.0,7.0,12.0,9.0,2.0,5.0,1.0,1.0,0.0,0.0,3.8,3.6,1.95,3.7,3.6,2.0,3.85,3.2,2.0,4.0,3.59,2.0,3.8,3.6,1.95,4.0,3.5,1.95,4.0,3.82,2.04,3.87,3.57,1.97,1.9,1.9,2.0,1.91,2.02,2.03,1.92,1.9,0.5,1.91,1.99,1.92,2.0,1.93,2.02,1.88,1.97,3.25,3.4,2.2,3.3,3.4,2.2,3.35,3.0,2.3,3.38,3.38,2.32,3.3,3.3,2.25,3.3,3.3,2.3,3.55,3.5,2.38,3.32,3.33,2.28,2.2,1.66,2.23,1.74,2.28,1.82,2.15,1.73,0.25,1.92,1.98,1.93,1.99,1.95,2.01,1.91,1.97,england,Premier League,2020/2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


If we have zero data for the div then we have a parsing error and so we should remove those rows

In [17]:
df[df.Div.isna()][['Country', 'League', 'Season', 'Date']].fillna(0).groupby(['Country', 'League', 'Season']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date
country,league,season,Unnamed: 3_level_1
belgium,Jupiler League,1995/1996,246
belgium,Jupiler League,1996/1997,246
belgium,Jupiler League,1998/1999,246
belgium,Jupiler League,1999/1900,246
england,Championship,2014/2015,1
...,...,...,...
turkey,Futbol Ligi 1,1999/1900,246
turkey,Futbol Ligi 1,2012/2013,1
turkey,Futbol Ligi 1,2013/2014,2
turkey,Futbol Ligi 1,2014/2015,1


In [18]:
df = df[~df['Div'].isna()]

### 3b. Data Types

Now we check the columns for the data types - we can only do this in chunks of 100 cols max

In [19]:
df.iloc[:,:60].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188776 entries, 0 to 305
Data columns (total 60 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Div       188776 non-null  object 
 1   Date      188776 non-null  object 
 2   Time      7470 non-null    object 
 3   HomeTeam  187730 non-null  object 
 4   AwayTeam  187730 non-null  object 
 5   FTHG      188774 non-null  float64
 6   FTAG      188774 non-null  float64
 7   FTR       188774 non-null  object 
 8   HTHG      164747 non-null  float64
 9   HTAG      164747 non-null  float64
 10  HTR       164747 non-null  object 
 11  Referee   53902 non-null   object 
 12  HS        84687 non-null   float64
 13  AS        84690 non-null   float64
 14  HST       83881 non-null   float64
 15  AST       83884 non-null   float64
 16  HF        82593 non-null   float64
 17  AF        82593 non-null   float64
 18  HC        84315 non-null   float64
 19  AC        84315 non-null   float64
 20  HY     

Looks pretty good - most stuff are floats as they are either stats or odds

Just need to change the date to a datetime object

In [20]:
def standardise_dates(d):
    
    if len(d) == len('01/02/2000'):
        return pd.to_datetime(d, format='%d/%m/%Y')
    elif len(d) == len('01/02/20'):
        return pd.to_datetime(d, format='%d/%m/%y')
    else:
        return pd.to_datetime(d)

In [21]:
df['Date'] = df.Date.apply(lambda x: standardise_dates(x))

In [22]:
df['PSCH'] = pd.to_numeric(df['PSCH'])

In [23]:
df.iloc[:,120:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188776 entries, 0 to 305
Data columns (total 44 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   BbAv<2.5    108552 non-null  float64
 1   BbAH        108311 non-null  float64
 2   BbAHh       108310 non-null  object 
 3   BbMxAHH     108310 non-null  float64
 4   BbAvAHH     108310 non-null  float64
 5   BbMxAHA     108308 non-null  float64
 6   BbAvAHA     108308 non-null  float64
 7   LBH         119307 non-null  float64
 8   LBD         119307 non-null  float64
 9   LBA         119310 non-null  float64
 10  SJH         70252 non-null   float64
 11  SJD         70252 non-null   float64
 12  SJA         70252 non-null   float64
 13  GBH         88006 non-null   float64
 14  GBD         88006 non-null   float64
 15  GBA         88006 non-null   float64
 16  BSH         46096 non-null   float64
 17  BSD         46097 non-null   float64
 18  BSA         46096 non-null   float64
 19  SBH  

<a id='create_db'></a>
## 4. Set Data Into Sqlite Database

[Back to Top](#top)

Now we have scraped our data and done some rough data type checking we can set it down as a table

We could just use a flat file but if we want to use quick querying to prevent having to load the whole file into memory each time, then we should use a database structure

In [24]:
conn = sqlite3.connect('../data/match_results.sqlite')

In [25]:
df.to_sql('matches', conn, index=False)

In [26]:
conn.close()

### 4a. Quick Test

We connect and check if we can:
 - Get the data for the EPL for the 2019-2020 season
 - Check the GF and GA
 - Create an end of season league table

In [27]:
conn = sqlite3.connect('../data/match_results.sqlite')

Now we have an open connection, then we check what tables are in the db - we expect one table called 'matches'

In [28]:
tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", conn)
tables

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,matches,matches,2,"CREATE TABLE ""matches"" (\n""Div"" TEXT,\n ""Date..."


Now we can structure a SQL query to get just the data for the EPL for 2019/2020 season

In [29]:
query =  """SELECT HomeTeam, AwayTeam, FTHG, FTAG, FTR
            FROM matches
            WHERE Season='2019/2020' AND Div='E0' """

In [30]:
%timeit df_epl = pd.read_sql(query, conn)

136 ms ± 36.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
df_epl = pd.read_sql(query, conn)

Good start - we have 380 matches (20 teams play each other twice, each match has 2 teams --> 20 * 19 * 2 / 2)

The query takes an average of 106ms so is fast enough for on the fly querying (if we add another 200ms) for data processing then we're good enough

First let's reconstruct the results table - we need to do one for home and away then join

In [33]:
df_epl

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,Liverpool,Norwich,4.0,1.0,H
1,West Ham,Man City,0.0,5.0,A
2,Bournemouth,Sheffield United,1.0,1.0,D
3,Burnley,Southampton,3.0,0.0,H
4,Crystal Palace,Everton,0.0,0.0,D
...,...,...,...,...,...
375,Leicester,Man United,0.0,2.0,A
376,Man City,Norwich,5.0,0.0,H
377,Newcastle,Liverpool,1.0,3.0,A
378,Southampton,Sheffield United,3.0,1.0,H


In [34]:
df_home_epl = df_epl.groupby(['HomeTeam', 'FTR']).agg({'AwayTeam': 'count', 'FTHG': 'sum', 'FTAG': 'sum'})

In [35]:
home_res = {'H': 3, 'A': 0, 'D': 1}
away_res = {'H': 0, 'A': 3, 'D': 1}

In [36]:
df_epl['HomePoints'] = df_epl['FTR'].apply(lambda x: home_res[x])
df_epl['AwayPoints'] = df_epl['FTR'].apply(lambda x: away_res[x])
df_epl

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HomePoints,AwayPoints
0,Liverpool,Norwich,4.0,1.0,H,3,0
1,West Ham,Man City,0.0,5.0,A,0,3
2,Bournemouth,Sheffield United,1.0,1.0,D,1,1
3,Burnley,Southampton,3.0,0.0,H,3,0
4,Crystal Palace,Everton,0.0,0.0,D,1,1
...,...,...,...,...,...,...,...
375,Leicester,Man United,0.0,2.0,A,0,3
376,Man City,Norwich,5.0,0.0,H,3,0
377,Newcastle,Liverpool,1.0,3.0,A,0,3
378,Southampton,Sheffield United,3.0,1.0,H,3,0


Results for teams when playing at home

In [37]:
df_epl_home = df_epl[['HomeTeam', 'FTHG', 'FTAG', 'HomePoints']].groupby('HomeTeam').sum().sort_values('HomePoints', ascending=False)
df_epl_home = df_epl_home.reset_index().rename(columns={'HomeTeam': 'Team', 'FTHG': 'GF', 'FTAG': 'GA', 'HomePoints': 'Points'}).set_index('Team')
df_epl_home.head()

Unnamed: 0_level_0,GF,GA,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Liverpool,52.0,16.0,55
Man City,57.0,13.0,47
Tottenham,36.0,17.0,39
Leicester,35.0,17.0,37
Man United,40.0,17.0,37


Results for teams when playing at away

In [38]:
df_epl_away = df_epl[['AwayTeam', 'FTHG', 'FTAG', 'AwayPoints']].groupby('AwayTeam').sum().sort_values('AwayPoints', ascending=False)
df_epl_away = df_epl_away.reset_index().rename(columns={'AwayTeam': 'Team', 'FTHG': 'GA', 'FTAG': 'GF', 'AwayPoints': 'Points'}).set_index('Team')
df_epl_away.head()

Unnamed: 0_level_0,GA,GF,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Liverpool,17.0,33.0,44
Man City,22.0,45.0,34
Southampton,25.0,30.0,31
Chelsea,38.0,39.0,30
Man United,19.0,26.0,29


Final League Table for the EPL 2019/2020

[From Google](https://www.google.com/search?q=2019+2020+league+table+EPL&oq=2019+2020+league+table+EPL&aqs=chrome..69i57j0.4232j0j1&sourceid=chrome&ie=UTF-8#sie=lg;/g/11fj6snmjm;2;/m/02_tc;st;fp;1;;)

In [39]:
(df_epl_home + df_epl_away).sort_values('Points', ascending=False)

Unnamed: 0_level_0,GA,GF,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Liverpool,33.0,85.0,99
Man City,35.0,102.0,81
Man United,36.0,66.0,66
Chelsea,54.0,69.0,66
Leicester,41.0,67.0,62
Tottenham,47.0,61.0,59
Wolves,40.0,51.0,59
Arsenal,48.0,56.0,56
Burnley,50.0,43.0,54
Sheffield United,39.0,39.0,54
