# Premier League data



First, we'll start exploring stats for a single match.

### Premier League match day stats

All match day results are available at: http://www.premierleague.com/en-gb/matchday/results.html

Clicking the result of a match opens its match report. From the match report page you can navigate to the stats page, which contains all the stats we need.

Let's start exploring with Everton vs. Sunderland on 1st November 2015: http://www.premierleague.com/en-gb/matchday/matches/2015-2016/epl.match-stats.html/everton-vs-sunderland

In [1]:
import re
import locale
import requests
from bs4 import BeautifulSoup

# Switch every locale category to US English so that locale.atoi(), used in
# the cells below, parses numbers with English formatting conventions.
# NOTE(review): 'English_United States.1252' is a Windows-style locale name;
# other platforms presumably need something like 'en_US.UTF-8' - confirm
# before running this notebook elsewhere.
locale.setlocale(locale.LC_ALL, 'English_United States.1252')

'English_United States.1252'

In [2]:
# Download the match stats page and parse its HTML with the lxml parser;
# `stats` is the parsed document that the following cells query.
match_stats_url = 'http://www.premierleague.com/en-gb/matchday/matches/2015-2016/epl.match-stats.html/everton-vs-sunderland'
stats = BeautifulSoup(requests.get(match_stats_url).text, 'lxml')

**Date, Venue, Referee and Attendance**

In [3]:
import re
# The 'fixtureinfo' paragraph is split on '|' into its individual fields;
# the 'Referee: ' and 'Attendance ' labels are stripped so only the values
# remain.
fis = stats.find('p', 'fixtureinfo').get_text().split('|')
fis = [re.sub(r'Referee: |Attendance ', '', i).strip() for i in fis]

# Convert attendance (the last field) to integer
fis[-1] = locale.atoi(fis[-1])

# Add to data: the fields are taken to be date, venue, referee, attendance,
# in that order
data = dict(zip(['date', 'venue', 'referee', 'attendance'], fis))

# Parenthesised form prints the same on Python 2 and also runs on Python 3
print(data)

{'date': u'Sunday 1 November 2015', 'attendance': 36617, 'referee': u'Andre Marriner', 'venue': u'Goodison Park'}


**Home team, away team and score**

In [4]:
# Home team, away team and score each sit in a <td> carrying one of these
# CSS classes; read them in order and store them under matching keys.
css_classes = ['home', 'away', 'countscore']
res = []
for css_class in css_classes:
    res.append(stats.find('td', css_class).get_text().strip())

for key, value in zip(['home_team', 'away_team', 'result'], res):
    data[key] = value

In [5]:
# Goal counts are read from the 'homeScore' / 'awayScore' spans as text...
home_goals = stats.find('span', 'homeScore').get_text().strip()
away_goals = stats.find('span', 'awayScore').get_text().strip()

# ...and stored as integers
data['home_goals'], data['away_goals'] = locale.atoi(home_goals), locale.atoi(away_goals)

# Parenthesised form prints the same on Python 2 and also runs on Python 3
print(data)

{'home_team': u'Everton', 'attendance': 36617, 'away_team': u'Sunderland', 'venue': u'Goodison Park', 'home_goals': 6, 'referee': u'Andre Marriner', 'result': u'6 - 2', 'away_goals': 2, 'date': u'Sunday 1 November 2015'}


**Home goals details, and away goals details**

In [6]:
# For each side, gather the text of every <li> inside the '<side> goals'
# div and store the entries as one comma-joined string.
for loc in ['home', 'away']:
    goals_details = [li.get_text().strip() for li in stats.find('div', '%s goals' % loc).find_all('li')]
    data['%s_goals_details' % loc] = ','.join(goals_details)

# Parenthesised form prints the same on Python 2 and also runs on Python 3
print(data)

{'home_team': u'Everton', 'attendance': 36617, 'home_goals_details': u'Gerard Deulofeu (19),Arouna Kon\xe9 (31, 62, 76),Sebasti\xe1n Coates (55 OG),Romelu Lukaku (60)', 'away_team': u'Sunderland', 'venue': u'Goodison Park', 'home_goals': 6, 'away_goals_details': u'Jermain Defoe (45+4),Steven Fletcher (50)', 'referee': u'Andre Marriner', 'result': u'6 - 2', 'away_goals': 2, 'date': u'Sunday 1 November 2015'}


**Stats!**

In [7]:
# Collect every <div> with the CSS class 'statsTable'; each one wraps a
# <table> that is read in the next cell.
tables = stats.find_all('div', 'statsTable')

In [8]:
for table in tables:
    t = table.find('table')
    # Header cells, skipping the first one (presumably the team-name
    # column, which has no metric - verify against the page markup)
    h = t.select('thead th')[1:]
    # One list of <td> cells per body row
    d = [r.select('td') for r in t.select('tbody tr')]
    # zip(h, *d) transposes the table: each tuple is
    # (header cell, cell from first row, cell from second row)
    for i in zip(h, *d):
        # e.g. a 'Shots On Target' header becomes 'shots_on_target'
        metric = i[0].get_text().strip().lower().replace(' ', '_')
        # First body row is stored as the home team, second as the away team
        data['%s_home_team' % metric] = locale.atoi(i[1].get_text().strip())
        data['%s_away_team' % metric] = locale.atoi(i[2].get_text().strip())

# Parenthesised form prints the same on Python 2 and also runs on Python 3
print(data)

{u'clearances_away_team': 26, 'away_goals': 2, u'fouls_away_team': 5, u'throw_ins_away_team': 17, u'shots_on_target_home_team': 8, u'shots_on_target_away_team': 10, 'away_goals_details': u'Jermain Defoe (45+4),Steven Fletcher (50)', 'result': u'6 - 2', u'assists_away_team': 2, u'offsides_away_team': 2, 'attendance': 36617, u'yellow_cards_away_team': 2, 'home_goals_details': u'Gerard Deulofeu (19),Arouna Kon\xe9 (31, 62, 76),Sebasti\xe1n Coates (55 OG),Romelu Lukaku (60)', u'penalties_away_team': 0, u'crosses_away_team': 13, u'shots_off_target_away_team': 3, u'clearances_home_team': 21, u'throw_ins_home_team': 17, u'handballs_home_team': 0, u'red_cards_home_team': 0, 'home_goals': 6, 'referee': u'Andre Marriner', u'yellow_cards_home_team': 3, u'total_shots_away_team': 13, u'fouls_home_team': 6, u'free_kicks_home_team': 7, 'away_team': u'Sunderland', 'date': u'Sunday 1 November 2015', u'free_kicks_away_team': 9, u'saves_home_team': 7, u'corners_away_team': 1, u'offsides_home_team': 3, 'h

### Let's convert what we've done so far into a function...

In [9]:
def _fetch_soup(url):
    """
    Downloads url and returns its HTML parsed with the lxml parser
    """
    return BeautifulSoup(requests.get(url).text, 'lxml')


def _parse_int(text):
    """
    Converts a figure such as u'44,062' to an int, dropping the comma
    thousands separator, without relying on the process-wide locale.
    Raises ValueError if what remains is not an integer.
    """
    return int(text.strip().replace(',', ''))


def get_stats_for_match(match_url):
    """
    Given a match report url (.../epl.match-report.html/...), from
    premierleague.com, returns a stats dict with all match statistics

    The managers are read from the report page itself. The url of the
    stats page is then derived by replacing 'match-report.html' with
    'match-stats.html', and everything else is read from that page.

    Returned keys: date, venue, referee, attendance, home_manager,
    away_manager, home_team, away_team, result, home_goals, away_goals,
    home_goals_details, away_goals_details, plus '<metric>_home_team'
    and '<metric>_away_team' for every column of the stats tables.
    Attendance, goals and table metrics are ints, the rest are strings.

    No error handling is done: a page lacking an expected element makes
    the corresponding lookup raise (IndexError / AttributeError), and a
    non-numeric figure raises ValueError.
    """
    soup = _fetch_soup(match_url)

    # get managers info: the last link in each team's title column
    mgrs = [
        soup.select('div .teamtitle .homecol')[0].find_all('a')[-1],
        soup.select('div .teamtitle .awaycol')[0].find_all('a')[-1]
    ]

    # everything below comes from the stats page, not the report page
    match_stat_url = match_url.replace('match-report.html', 'match-stats.html')
    soup = _fetch_soup(match_stat_url)

    # get fixture infos: '|'-separated fields with their labels stripped
    fis = soup.find('p', 'fixtureinfo').get_text().split('|')
    fis = [re.sub(r'Referee: |Attendance ', '', i).strip() for i in fis]

    # convert attendance (the last field) to integer
    fis[-1] = _parse_int(fis[-1])

    # add to stats
    stats = dict(zip(['date', 'venue', 'referee', 'attendance'], fis))
    stats['home_manager'] = mgrs[0].get_text().strip()
    stats['away_manager'] = mgrs[1].get_text().strip()

    # get home, away and goals
    css_classes = ['home', 'away', 'countscore']
    res = [soup.find('td', c).get_text().strip() for c in css_classes]

    stats['home_team'], stats['away_team'], stats['result'] = res[0], res[1], res[2]

    for loc in ['home', 'away']:
        goals = soup.find('span', '%sScore' % loc).get_text().strip()
        goals_details = [
            li.get_text().strip()
            for li in soup.find('div', '%s goals' % loc).find_all('li')
        ]
        stats['%s_goals' % loc] = _parse_int(goals)
        stats['%s_goals_details' % loc] = ','.join(goals_details)

    # in depth stats table
    for table in soup.find_all('div', 'statsTable'):
        t = table.find('table')
        # header cells, skipping the first one
        headers = t.select('thead th')[1:]
        # one list of <td> cells per body row
        rows = [r.select('td') for r in t.select('tbody tr')]
        # transpose: (header cell, first-row cell, second-row cell)
        for column in zip(headers, *rows):
            metric = column[0].get_text().strip().lower().replace(' ', '_')
            # first body row is stored as home team, second as away team
            stats['%s_home_team' % metric] = _parse_int(column[1].get_text())
            stats['%s_away_team' % metric] = _parse_int(column[2].get_text())

    return stats

# Function test

In [16]:
# Try the function on a real match-report url (performs live HTTP requests)
get_stats_for_match('http://www.premierleague.com/en-gb/matchday/matches/2015-2016/epl.match-report.html/liverpool-vs-spurs')

{u'assists_away_team': 1,
 u'assists_home_team': 1,
 'attendance': 44062,
 'away_goals': 1,
 'away_goals_details': u'Harry Kane (63)',
 'away_manager': u'Mauricio Pochettino',
 'away_team': u'Spurs',
 u'blocks_away_team': 6,
 u'blocks_home_team': 6,
 u'clearances_away_team': 28,
 u'clearances_home_team': 28,
 u'corners_away_team': 9,
 u'corners_home_team': 12,
 u'crosses_away_team': 10,
 u'crosses_home_team': 13,
 'date': u'Saturday 2 April 2016',
 u'fouls_away_team': 11,
 u'fouls_home_team': 11,
 u'free_kicks_away_team': 10,
 u'free_kicks_home_team': 12,
 u'handballs_away_team': 0,
 u'handballs_home_team': 0,
 'home_goals': 1,
 'home_goals_details': u'Philippe Coutinho (51)',
 'home_manager': u'J\xfcrgen Klopp',
 'home_team': u'Liverpool',
 u'offsides_away_team': 1,
 u'offsides_home_team': 0,
 u'penalties_away_team': 0,
 u'penalties_home_team': 0,
 u'red_cards_away_team': 0,
 u'red_cards_home_team': 0,
 'referee': u'Jonathan Moss',
 'result': u'1 - 1',
 u'saves_away_team': 7,
 u'saves