In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# default_exp data.sportde

In [None]:
# export
from betting.scraping import *
import re
from datetime import datetime
import pandas as pd

In [None]:
import os
import tempfile
from pathlib import Path

# Sport.de
> Standings, points, matchday information

Calculating points and standings from raw (and noisy) match data is too error-prone: it is not rare for matches to be missing from supposedly 'complete' datasets. Accurate information about points, standings and matchdays is therefore needed. Thankfully, sport.de offers a clean HTML representation of this data that is easy to parse.

In [None]:
# export
# Root of the sport.de site; the scraping functions in this notebook append
# the relative links they find to it.
BASE_URL = 'https://www.sport.de'

In [None]:
# Label embedded in the cache entry names used in this notebook
# (here: the Spanish Primera Division).
cache_prefix = 'primera_division'
# Results-and-table page of the 2015-2016 season, used as the running example.
league_example_url = BASE_URL + '/fussball/spanien-primera-division/se18343/2015-2016/ergebnisse-und-tabelle/'

# NOTE(review): `cache` is presumably provided by the `betting.scraping` star
# import and returns the parsed page stored under the given name — confirm.
league_example_bs = cache(league_example_url, f'league_example_{cache_prefix}')

## Season links

From any result page for a league, links to all the seasons can be extracted.

In [None]:
# export
def season_links_from_page(bs):
    """Map short season keys to the absolute results-page URL of each season.

    The season links are read from the `<option>` entries of the first
    `<select class="navigation season-navigation">` element on the page.

    Args:
        bs: parsed result page exposing `find_all` (a BeautifulSoup object in
            this notebook).

    Returns:
        dict such as {'1516': 'https://www.sport.de/.../2015-2016/ergebnisse-und-tabelle/'}.
        The key is built from the last two digits of both years of the season.

    Raises:
        IndexError: if the page has no season navigation element.
    """
    season_select = bs.find_all('select', {'class': 'navigation season-navigation'})[0]
    links_by_season = {}

    for entry in season_select.find_all('option'):
        found = re.match(r'''/fussball/[^/]+/.+/(\d\d\d\d-\d\d\d\d)/ergebnisse-und-tabelle/''', entry['value'])
        if found is not None:
            relative_url, years = found.group(0, 1)
            # Seasons that started before 1950 are ignored.
            if int(years[:4]) >= 1950:
                # 'YYYY-YYYY' -> 'YYYY' made of the last two digits of each year.
                short_key = years[2:4] + years[7:9]
                links_by_season[short_key] = BASE_URL + relative_url
        else:
            # Links with a suffix after the years (e.g. play-off rounds) do not
            # match the pattern; they are reported and skipped.
            print('Unexpected season link: ', entry['value'])

    return links_by_season

In [None]:
# Collect the season links from the example page; options whose link does not
# match the expected pattern are printed and skipped.
season_urls = season_links_from_page(league_example_bs)
season_urls

Unexpected season link:  /fussball/spanien-primera-division/se3061/1986-1987-vorrunde/ergebnisse-und-tabelle/
Unexpected season link:  /fussball/spanien-primera-division/se3120/1986-1987-playoff-13-18/ergebnisse-und-tabelle/
Unexpected season link:  /fussball/spanien-primera-division/se3119/1986-1987-playoff-7-12/ergebnisse-und-tabelle/
Unexpected season link:  /fussball/spanien-primera-division/se3118/1986-1987-playoff-1-6/ergebnisse-und-tabelle/


{'2021': 'https://www.sport.de/fussball/spanien-primera-division/se35880/2020-2021/ergebnisse-und-tabelle/',
 '1920': 'https://www.sport.de/fussball/spanien-primera-division/se31742/2019-2020/ergebnisse-und-tabelle/',
 '1819': 'https://www.sport.de/fussball/spanien-primera-division/se28562/2018-2019/ergebnisse-und-tabelle/',
 '1718': 'https://www.sport.de/fussball/spanien-primera-division/se23902/2017-2018/ergebnisse-und-tabelle/',
 '1617': 'https://www.sport.de/fussball/spanien-primera-division/se20829/2016-2017/ergebnisse-und-tabelle/',
 '1516': 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ergebnisse-und-tabelle/',
 '1415': 'https://www.sport.de/fussball/spanien-primera-division/se15380/2014-2015/ergebnisse-und-tabelle/',
 '1314': 'https://www.sport.de/fussball/spanien-primera-division/se11980/2013-2014/ergebnisse-und-tabelle/',
 '1213': 'https://www.sport.de/fussball/spanien-primera-division/se9034/2012-2013/ergebnisse-und-tabelle/',
 '1112': 'https://ww

## Matchdays

### Links

In [None]:
# export
def get_matchday_links(bs, expected_length=34):
    """Collect the absolute URLs found in the `data` attribute of `<li>` elements.

    Every `<li>` that carries a `data` attribute contributes one link, built by
    appending that attribute's value to `BASE_URL`. List items without the
    attribute are skipped.

    Args:
        bs: parsed season page exposing `find_all` (a BeautifulSoup object in
            this notebook).
        expected_length: number of links the page is expected to yield. Pass
            `None` to skip the check.

    Returns:
        List of absolute URLs, in the order the `<li>` elements are returned by
        `find_all`.

    Raises:
        AssertionError: if `expected_length` is given and does not equal the
            number of links found.
    """
    matchday_links = []
    for li in bs.find_all('li'):
        try:
            relative_link = li['data']
        except KeyError:
            # Only a missing `data` attribute is an expected condition; any
            # other error is a real problem and must not be silenced.
            continue
        matchday_links.append(BASE_URL + relative_link)

    if expected_length is not None:
        assert len(matchday_links)==expected_length, f'Expected: {expected_length}. Actual: {len(matchday_links)}'
    return matchday_links

In [None]:
# expected_length=None skips the length assertion (the default expects 34 links).
matchday_links = get_matchday_links(league_example_bs, expected_length=None)
matchday_links

['https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md1/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md2/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md3/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md4/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md5/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md6/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md7/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/spanien-primera-division/se18343/2015-2016/ro57057/spieltag/md8/ergebnisse-und-tabelle/',
 'https://www.sport.de/f

### Dates

In [None]:
# export
def get_matchday_dates(matchday_bs):
    """Return the distinct dates found on a matchday page, in ascending order.

    The dates are read from every `<div class="match-date">`. Only the text
    before the first space is parsed, using the format `dd.mm.yyyy`.

    Args:
        matchday_bs: parsed matchday page exposing `find_all`.

    Returns:
        Sorted list of unique `datetime` objects (time part 00:00). Empty if
        the page has no `match-date` divs.

    Raises:
        ValueError: if a date text does not match `dd.mm.yyyy`.
    """
    divs = matchday_bs.find_all('div', {'class': 'match-date'})
    unique_dates = {datetime.strptime(div.text.split(' ')[0], '%d.%m.%Y') for div in divs}
    # A set has no defined order; sort so the result is reproducible.
    return sorted(unique_dates)

In [None]:
# The first matchday link of the example season serves as the sample page for
# the parsing functions demonstrated below.
example_matchday_bs = cache(matchday_links[0], f'sportde_matchday_example_{cache_prefix}')
get_matchday_dates(example_matchday_bs)

[datetime.datetime(2015, 8, 23, 0, 0),
 datetime.datetime(2015, 8, 22, 0, 0),
 datetime.datetime(2015, 8, 21, 0, 0),
 datetime.datetime(2015, 8, 24, 0, 0)]

## Matches

In [None]:
# export
def get_match_infos(matchday_bs):
    """
    Returns [[home_team, away_team, home_goals, away_goals], ...]

    Every `<div>` that carries a `position` attribute is read as one match.
    Team names are returned as strings, goals as ints.

    Raises:
        AttributeError: if an expected child element is missing from a match div.
        ValueError: if a goal text cannot be converted to int.
    """
    matches = []
    for div in matchday_bs.find_all('div'):
        if 'position' not in div.attrs:
            continue

        # The attribute filters must be dicts. A set literal such as
        # {'class', 'x'} only works through a fallback in the parser and would
        # also match elements whose class is literally 'class'.
        home_team = div.find('div', {'class': 'team-shortname-home'}).text
        away_team = div.find('div', {'class': 'team-shortname-away'}).text

        # NOTE(review): 'match-result-0' presumably holds the final score of
        # the respective side — confirm against the page markup.
        home_div = div.find('div', {'class': 'match-result match-result-home'})
        home_goals = int(home_div.find('div', {'class': 'match-result match-result-0'}).text)

        away_div = div.find('div', {'class': 'match-result match-result-away'})
        away_goals = int(away_div.find('div', {'class': 'match-result match-result-0'}).text)

        matches.append([home_team, away_team, home_goals, away_goals])

    return matches

In [None]:
# Matches of the example matchday as [home_team, away_team, home_goals, away_goals] rows.
get_match_infos(example_matchday_bs)

[['Málaga', 'FC Sevilla', 0, 0],
 ['Espanyol', 'Getafe', 1, 0],
 ['Deportivo', 'S. Sebastian', 0, 0],
 ['Atlético', 'UD Las Palmas', 1, 0],
 ['Rayo', 'Valencia', 0, 0],
 ['Athletic', 'Barcelona', 0, 1],
 ['Sporting Gijón', 'Real Madrid', 0, 0],
 ['Levante', 'RC Celta', 1, 2],
 ['Betis Sevilla', 'Villarreal', 1, 1],
 ['Granada CF', 'Eibar', 1, 3]]

## Standings

In [None]:
# export
def get_standings(matchday_bs):
    """
    Returns [[rank, team, games_played, wins, draw, lost, total_goals_scored, total_goals_received, points], ...]

    One entry is produced per `<tr class="standing">` row of the page. All
    values are returned as the strings found in the table cells; the goals
    cell (`scored:received`) is split into its two parts.
    """
    def cell_text(row, css_class):
        # Text of the row's first <td> carrying the given class.
        return row.find('td', {'class': css_class}).text

    # Cell classes in the order their values appear at the start of an entry.
    leading_classes = (
        'standing-rank',
        'team-shortname',
        'standing-games_played',
        'standing-win',
        'standing-draw',
        'standing-lost',
    )

    table = []
    for row in matchday_bs.find_all('tr', {'class': 'standing'}):
        entry = [cell_text(row, css_class) for css_class in leading_classes]

        scored, received = cell_text(row, 'standing-goaldiff').split(':')
        entry.extend([scored, received, cell_text(row, 'standing-points')])

        table.append(entry)

    return table

In [None]:
# League table of the example matchday page; every value is returned as a string.
get_standings(example_matchday_bs)

[['1', 'RC Celta', '1', '1', '0', '0', '2', '1', '3'],
 ['2', 'Atlético', '1', '1', '0', '0', '1', '0', '3'],
 ['2', 'Barcelona', '1', '1', '0', '0', '1', '0', '3'],
 ['4', 'Espanyol', '1', '1', '0', '0', '1', '0', '3'],
 ['5', 'Eibar', '1', '1', '0', '0', '3', '1', '3'],
 ['6', 'Betis Sevilla', '1', '0', '1', '0', '1', '1', '1'],
 ['6', 'Villarreal', '1', '0', '1', '0', '1', '1', '1'],
 ['8', 'Deportivo', '1', '0', '1', '0', '0', '0', '1'],
 ['8', 'Málaga', '1', '0', '1', '0', '0', '0', '1'],
 ['8', 'Rayo', '1', '0', '1', '0', '0', '0', '1'],
 ['8', 'Real Madrid', '1', '0', '1', '0', '0', '0', '1'],
 ['8', 'S. Sebastian', '1', '0', '1', '0', '0', '0', '1'],
 ['8', 'FC Sevilla', '1', '0', '1', '0', '0', '0', '1'],
 ['14', 'Valencia', '1', '0', '1', '0', '0', '0', '1'],
 ['15', 'Sporting Gijón', '1', '0', '1', '0', '0', '0', '1'],
 ['16', 'Levante', '1', '0', '0', '1', '1', '2', '0'],
 ['17', 'Athletic', '1', '0', '0', '1', '0', '1', '0'],
 ['17', 'Getafe', '1', '0', '0', '1', '0', '1',

## Scrape entire season

In [None]:
# export
def scrape_season(season, season_link, expected_matchdays=None, do_cache=False, cache_prefix=None):
    """Scrape standings, matchday dates and match results for a whole season.

    The season page is fetched with `get_html`, its matchday links are
    collected with `get_matchday_links`, and every matchday page is parsed with
    `get_matchday_dates`, `get_standings` and `get_match_infos`.

    Args:
        season: season label that is prepended to every returned row.
        season_link: URL of the season's results page.
        expected_matchdays: passed to `get_matchday_links` as `expected_length`;
            `None` skips the length check.
        do_cache: when True, matchday pages are obtained through `cache`
            instead of `get_html`.
        cache_prefix: optional label appended to the cache entry names. When
            omitted, a module-level `cache_prefix` variable is used if one
            exists (as in the notebook); otherwise no label is appended.

    Returns:
        Tuple `(standings, matchdays, games)` of row lists:
        standings rows are `[season, matchday, *standing]`,
        matchdays rows are `[season, matchday, first_date, last_date]`,
        games rows are `[season, matchday, *match_info]`.
        `matchday` is the string captured from the `/md<...>/` part of the link.

    Raises:
        ValueError: if a matchday page contains no match dates.
        AssertionError: if the number of matchday links differs from
            `expected_matchdays`.
    """
    if do_cache and cache_prefix is None:
        # The cache name used to read a notebook-only global, which raised
        # NameError in the exported module. Fall back to that global when it
        # is present so existing notebook calls keep their cache names.
        cache_prefix = globals().get('cache_prefix')

    season_bs = get_html(season_link)
    matchday_links = get_matchday_links(season_bs, expected_length=expected_matchdays)

    standings = []
    matchdays = []
    games = []

    for matchday_link in matchday_links:
        matchday = re.search('/md([^/]+)/', matchday_link).group(1)
        if do_cache:
            cache_name = f'scrape_{season}_{matchday}'
            if cache_prefix is not None:
                cache_name = f'{cache_name}_{cache_prefix}'
            matchday_bs = cache(matchday_link, cache_name)
        else:
            matchday_bs = get_html(matchday_link)

        dates = get_matchday_dates(matchday_bs)
        if not dates:
            # min()/max() would fail with an unhelpful message on an empty list.
            raise ValueError(f'No match dates found on matchday page: {matchday_link}')
        matchdays.append([season, matchday, min(dates), max(dates)])

        standings.extend([season, matchday, *day_standing] for day_standing in get_standings(matchday_bs))
        games.extend([season, matchday, *day_game] for day_game in get_match_infos(matchday_bs))

    return standings, matchdays, games

In [None]:
# Scrape the season stored under key '2021'. With do_cache=True the matchday
# pages are obtained through `cache` instead of `get_html`.
standings, matchdays, games = scrape_season('2021', season_urls['2021'], do_cache=True)

In [None]:
# export
def set_dtypes(df, dtypes):
    """Cast the leading columns of `df` to the given dtypes, by position.

    Args:
        df: DataFrame to convert; it is modified in place. Column labels are
            assumed to be unique.
        dtypes: one dtype per column, in column order. If fewer dtypes than
            columns are given, the remaining columns are left untouched.

    Returns:
        The same DataFrame object, for convenient chaining.

    Raises:
        IndexError: if more dtypes than columns are given.
    """
    for position, dtype in enumerate(dtypes):
        label = df.columns[position]
        # Assign by label: `df.iloc[:, i] = values` writes into the existing
        # column in newer pandas versions and therefore keeps the old dtype.
        df[label] = df.iloc[:, position].astype(dtype)
    return df

def save_season(standings, matchdays, games, league, save_path):
    """Write one scraped season to three feather files inside `save_path`.

    Args:
        standings: rows of `[season, matchday, rank, team, games_played, wins,
            draw, lost, total_goals_scored, total_goals_received, points]`.
        matchdays: rows of `[season, matchday, start_date, end_date]`.
        games: rows of `[season, matchday, home_team, away_team, home_goals,
            away_goals]`.
        league: value stored in the `league` column of all three tables.
        save_path: target directory (an object with `mkdir` and `/`, such as a
            `pathlib.Path`); it is created if missing.

    Returns:
        Tuple `(matchday_path, standings_path, games_path)` of the written files.

    Raises:
        AssertionError: if the matchday rows do not belong to exactly one season.
    """
    save_path.mkdir(parents=True, exist_ok=True)

    # Matchdays: the season found here also names all three output files.
    matchday_columns = ['season', 'matchday', 'start_date', 'end_date']
    matchday_df = pd.DataFrame(matchdays, columns=matchday_columns)
    matchday_df['matchday'] = matchday_df['matchday'].astype(int)
    matchday_df['league'] = league

    season = matchday_df['season'].unique()
    assert len(season)==1, f'Expected DataFrame of only one season, found: {season}'
    season = season[0]

    matchday_path = save_path/f'{season}_matchdays.feather'
    matchday_df.to_feather(matchday_path)

    # Standings: everything except season and team is cast to int.
    standings_columns = [
        'season', 'matchday', 'rank', 'team', 'games_played', 'wins', 'draw',
        'lost', 'total_goals_scored', 'total_goals_received', 'points',
    ]
    standings_dtypes = [str, int, int, str, int, int, int, int, int, int, int]
    standings_df = set_dtypes(pd.DataFrame(standings, columns=standings_columns), standings_dtypes)
    standings_df['league'] = league
    standings_path = save_path/f'{season}_standings.feather'
    standings_df.to_feather(standings_path)

    # Games: matchday and goals are cast to int, the rest to str.
    games_columns = ['season', 'matchday', 'home_team', 'away_team', 'home_goals', 'away_goals']
    games_dtypes = [str, int, str, str, int, int]
    games_df = set_dtypes(pd.DataFrame(games, columns=games_columns), games_dtypes)
    games_df['league'] = league
    games_path = save_path/f'{season}_games.feather'
    games_df.to_feather(games_path)

    return matchday_path, standings_path, games_path

In [None]:
# Smoke test for `save_season`. The files go into a throwaway directory so that
# real season files under ../data/ are never overwritten or deleted; the
# directory and its contents are removed when the block exits.
with tempfile.TemporaryDirectory() as tmp_dir:
    # The scraped example data is from the Spanish Primera Division.
    paths = save_season(standings, matchdays, games, 'primera_division', Path(tmp_dir))

    for path in paths:
        assert path.is_file()