In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp data.sportde

In [None]:
# export
from betting.scraping import *
import re
from datetime import datetime
import pandas as pd

In [None]:
from pathlib import Path
import os

# Sport.de
> Standings, points, matchday information

Calculating points and standings from raw (and noisy) match data is too error-prone: it is not rare for supposedly 'complete' datasets to be missing matches. Accurate information about points, standings and matchdays is therefore needed. Thankfully, sport.de offers a clean HTML representation of this data that is easy to parse.

In [None]:
# export
# Root of the sport.de site; the relative links scraped from its pages are appended to it.
BASE_URL = 'https://www.sport.de'

In [None]:
# Results-and-table page of the 2006-2007 Bundesliga season, used as the running example below.
league_example_url = BASE_URL + '/fussball/deutschland-bundesliga/se580/2006-2007/ergebnisse-und-tabelle'

# NOTE(review): `cache` is not defined in this notebook (presumably it comes from
# betting.scraping). It appears to return the parsed page stored under the given name - confirm.
league_example_bs = cache(league_example_url, 'league_example')

## Season links

From any result page for a league, links to all the seasons can be extracted.

In [None]:
# export
def season_links_from_page(bs):
    """Map short season codes to the absolute URL of each season's result page.

    The links are read from the `<option>` entries of the first
    `<select class="navigation season-navigation">` element of `bs`.
    A season such as `2020-2021` is keyed by the last two digits of both
    years (`'2021'`). Option values that do not look like a Bundesliga
    result-page path are printed and skipped.

    Raises `IndexError` when the page has no season navigation element.
    """
    link_pattern = re.compile(r'''/fussball/deutschland-bundesliga/.+/(\d\d\d\d-\d\d\d\d)/ergebnisse-und-tabelle/''')
    navigation = bs.find_all('select', {'class': 'navigation season-navigation'})[0]

    links_by_season = {}
    for option in navigation.find_all('option'):
        value = option['value']
        found = link_pattern.match(value)
        if found is None:
            # Not a season link: show it so unexpected entries are noticed.
            print(value)
            continue
        years = found.group(1)
        # 'YYYY-YYYY' -> two-digit start year followed by two-digit end year.
        links_by_season[years[2:4] + years[7:9]] = BASE_URL + found.group(0)

    return links_by_season

In [None]:
season_urls = season_links_from_page(league_example_bs)
season_urls

{'2021': 'https://www.sport.de/fussball/deutschland-bundesliga/se35753/2020-2021/ergebnisse-und-tabelle/',
 '1920': 'https://www.sport.de/fussball/deutschland-bundesliga/se31723/2019-2020/ergebnisse-und-tabelle/',
 '1819': 'https://www.sport.de/fussball/deutschland-bundesliga/se28567/2018-2019/ergebnisse-und-tabelle/',
 '1718': 'https://www.sport.de/fussball/deutschland-bundesliga/se23906/2017-2018/ergebnisse-und-tabelle/',
 '1617': 'https://www.sport.de/fussball/deutschland-bundesliga/se20812/2016-2017/ergebnisse-und-tabelle/',
 '1516': 'https://www.sport.de/fussball/deutschland-bundesliga/se18336/2015-2016/ergebnisse-und-tabelle/',
 '1415': 'https://www.sport.de/fussball/deutschland-bundesliga/se15388/2014-2015/ergebnisse-und-tabelle/',
 '1314': 'https://www.sport.de/fussball/deutschland-bundesliga/se11976/2013-2014/ergebnisse-und-tabelle/',
 '1213': 'https://www.sport.de/fussball/deutschland-bundesliga/se9024/2012-2013/ergebnisse-und-tabelle/',
 '1112': 'https://www.sport.de/fussbal

## Matchdays

### Links

In [None]:
# export
def get_matchday_links(bs, expected_length=34):
    """Collect the absolute URLs of all matchday pages linked from a season page.

    Every `<li>` element of `bs` that carries a `data` attribute contributes
    one link: the attribute value appended to `BASE_URL`. Elements without
    that attribute are skipped.

    Parameters
    ----------
    bs : parsed page offering `find_all('li')`.
    expected_length : number of matchdays the page must link to (default 34).
        Pass `None` to skip the check.

    Raises
    ------
    AssertionError
        If the number of links found differs from `expected_length`.
    """
    matchday_links = []
    for li in bs.find_all('li'):
        # Look the attribute up explicitly instead of swallowing every exception,
        # so that genuine errors are not hidden.
        relative_url = li.get('data')
        if relative_url is None:
            continue
        matchday_links.append(BASE_URL + relative_url)

    if expected_length is not None:
        assert len(matchday_links)==expected_length, f'Expected: {expected_length}. Actual: {len(matchday_links)}'
    return matchday_links

In [None]:
matchday_links = get_matchday_links(league_example_bs)
matchday_links

['https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md1/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md2/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md3/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md4/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md5/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md6/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md7/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/2006-2007/ro2479/spieltag/md8/ergebnisse-und-tabelle/',
 'https://www.sport.de/fussball/deutschland-bundesliga/se580/200

### Dates

In [None]:
# export
def get_matchday_dates(matchday_bs):
    """Return the dates shown on a matchday page as a list of `datetime` objects.

    Each `<div class="match-date">` contributes one entry: the text before its
    first space, parsed as `DD.MM.YYYY`. The result keeps page order and may
    contain duplicates.

    Raises `ValueError` when a date string does not match that format.
    """
    date_divs = matchday_bs.find_all('div', {'class': 'match-date'})
    # Only the leading token is the date; anything after the first space is dropped.
    day_strings = [date_div.text.split(' ')[0] for date_div in date_divs]
    return [datetime.strptime(day_string, '%d.%m.%Y') for day_string in day_strings]

In [None]:
# Index 5 is the sixth entry of the matchday link list; this page serves as the example matchday below.
example_matchday_bs = cache(matchday_links[5], 'sportde_matchday_example')
get_matchday_dates(example_matchday_bs)

[datetime.datetime(2006, 9, 29, 0, 0),
 datetime.datetime(2006, 9, 30, 0, 0),
 datetime.datetime(2006, 10, 1, 0, 0)]

## Matches

In [None]:
# export
def get_match_infos(matchday_bs):
    """
    Returns [[home_team, away_team, home_goals, away_goals], ...]

    One entry is produced for every `<div>` of `matchday_bs` that carries a
    `position` attribute. Team names are the texts of the
    `team-shortname-home` / `team-shortname-away` divs; goals are read from
    the `match-result match-result-0` div inside the home / away result div
    and converted to `int`.

    Raises `ValueError` if a goal text is not an integer and `AttributeError`
    if one of the expected divs is missing.
    """
    matches = []
    for div in matchday_bs.find_all('div'):
        # Only divs with a `position` attribute describe a match.
        if 'position' not in div.attrs:
            continue

        # The attribute filter must be a dict ({'class': ...}); a set literal is not
        # an attribute filter and would only match by accident.
        home_team = div.find('div', {'class': 'team-shortname-home'}).text
        away_team = div.find('div', {'class': 'team-shortname-away'}).text

        # NOTE(review): 'match-result-0' is presumably the final score entry - confirm against the page markup.
        home_div = div.find('div', {'class': 'match-result match-result-home'})
        home_goals = int(home_div.find('div', {'class': 'match-result match-result-0'}).text)

        away_div = div.find('div', {'class': 'match-result match-result-away'})
        away_goals = int(away_div.find('div', {'class': 'match-result match-result-0'}).text)

        matches.append([home_team, away_team, home_goals, away_goals])

    return matches

In [None]:
get_match_infos(example_matchday_bs)

[['Dortmund', 'Hannover 96', 2, 2],
 ['SV Werder', "M'gladbach", 3, 0],
 ['Nürnberg', 'Mainz 05', 1, 1],
 ['Bielefeld', 'Cottbus', 3, 1],
 ['Frankfurt', 'Hamburger SV', 2, 2],
 ['Wolfsburg', 'FC Bayern', 1, 0],
 ['Aachen', 'Bochum', 2, 1],
 ['Leverkusen', 'Schalke 04', 3, 1],
 ['Hertha BSC', 'VfB Stuttgart', 2, 2]]

## Standings

In [None]:
# export
def get_standings(matchday_bs):
    """
    Returns [[rank, team, games_played, wins, draw, lost, total_goals_scored, total_goals_received, points], ...]

    One entry per `<tr class="standing">` row, in page order. All values are
    returned as the raw cell texts (strings). The goals cell has the form
    `scored:received` and is split into its two parts.
    """
    def cell_text(row, css_class):
        # Text of the row's <td> with the given class.
        return row.find('td', {'class': css_class}).text

    leading_classes = (
        'standing-rank',
        'team-shortname',
        'standing-games_played',
        'standing-win',
        'standing-draw',
        'standing-lost',
    )

    table = []
    for row in matchday_bs.find_all('tr', {'class': 'standing'}):
        leading = [cell_text(row, css_class) for css_class in leading_classes]
        # The cell with class 'standing-goaldiff' holds 'scored:received'.
        scored, received = cell_text(row, 'standing-goaldiff').split(':')
        points = cell_text(row, 'standing-points')
        table.append([*leading, scored, received, points])

    return table

In [None]:
get_standings(example_matchday_bs)

[['1', 'Hertha BSC', '6', '2', '4', '0', '10', '4', '10'],
 ['2', 'Nürnberg', '6', '2', '4', '0', '7', '3', '10'],
 ['3', 'SV Werder', '6', '3', '1', '2', '12', '9', '10'],
 ['4', 'FC Bayern', '6', '3', '1', '2', '7', '5', '10'],
 ['5', 'Schalke 04', '6', '3', '1', '2', '7', '6', '10'],
 ['6', 'Aachen', '6', '3', '0', '3', '10', '9', '9'],
 ['7', "M'gladbach", '6', '3', '0', '3', '6', '8', '9'],
 ['8', 'Leverkusen', '6', '2', '2', '2', '10', '8', '8'],
 ['9', 'Frankfurt', '6', '1', '5', '0', '8', '6', '8'],
 ['10', 'Dortmund', '6', '2', '2', '2', '7', '7', '8'],
 ['11', 'Cottbus', '6', '2', '2', '2', '7', '8', '8'],
 ['12', 'VfB Stuttgart', '6', '2', '2', '2', '10', '13', '8'],
 ['13', 'Bielefeld', '6', '2', '1', '3', '9', '9', '7'],
 ['14', 'Mainz 05', '6', '1', '4', '1', '6', '7', '7'],
 ['15', 'Wolfsburg', '6', '1', '3', '2', '3', '5', '6'],
 ['16', 'Hamburger SV', '6', '0', '5', '1', '7', '8', '5'],
 ['17', 'Hannover 96', '6', '1', '2', '3', '7', '15', '5'],
 ['18', 'Bochum', '6', 

## Scrape entire season

In [None]:
# export
def scrape_season(season, season_link, do_cache=False):
    """Scrape standings, matchday dates and match results for one season.

    The season page at `season_link` is fetched with `get_html`, and every
    matchday page it links to is then processed in turn.

    Parameters
    ----------
    season : label put in the first column of every returned row.
    season_link : URL of the season's results-and-table page.
    do_cache : when true, matchday pages are obtained through
        `cache(link, f'scrape_{season}_{matchday}')` instead of `get_html(link)`.
        The season page itself is always fetched with `get_html`.

    Returns
    -------
    (standings, matchdays, games) : three lists of rows
        standings : [season, matchday, *row from get_standings]
        matchdays : [season, matchday, earliest date, latest date]
        games     : [season, matchday, *row from get_match_infos]
        `matchday` is the text following `/md` in the matchday URL (a string).
    """
    standings, matchdays, games = [], [], []

    for link in get_matchday_links(get_html(season_link)):
        matchday = re.search('/md([^/]+)/', link).group(1)
        page = cache(link, f'scrape_{season}_{matchday}') if do_cache else get_html(link)

        played_on = get_matchday_dates(page)
        matchdays.append([season, matchday, min(played_on), max(played_on)])

        standings.extend([season, matchday, *entry] for entry in get_standings(page))
        games.extend([season, matchday, *entry] for entry in get_match_infos(page))

    return standings, matchdays, games

In [None]:
# Scrape the season stored under key '2021' in season_urls; with do_cache=True the matchday
# pages are obtained through `cache` rather than `get_html`.
standings, matchdays, games = scrape_season('2021', season_urls['2021'], do_cache=True)

In [None]:
# export
def set_dtypes(df, dtypes):
    """Cast the leading columns of `df` to the given dtypes, by position.

    `dtypes[i]` is applied to the i-th column; columns beyond `len(dtypes)`
    are left untouched. `df` is modified in place and also returned.

    Column labels are assumed to be unique, because the converted column is
    assigned back by label.
    """
    for col, dtype in enumerate(dtypes):
        # Assign by label rather than `df.iloc[:, col] = ...`: since pandas 2.0 the
        # positional form writes into the existing column and keeps its old dtype,
        # which would silently discard the cast.
        df[df.columns[col]] = df.iloc[:, col].astype(dtype)
    return df

def save_season(standings, matchdays, games, league, save_path):
    """Write the three tables of one scraped season to feather files.

    Parameters
    ----------
    standings, matchdays, games : row lists as returned by `scrape_season`.
    league : value stored in an extra `league` column of every table.
    save_path : directory the files are written to; must support the `/`
        path operator.

    Returns
    -------
    (matchday_path, standings_path, games_path) : paths of the files written,
        named `<season>_matchdays.feather`, `<season>_standings.feather` and
        `<season>_games.feather`.

    Raises
    ------
    AssertionError
        If `matchdays` does not contain exactly one distinct season.
    """
    # (column name, dtype) pairs, in column order.
    standings_schema = [
        ('season', str), ('matchday', int), ('rank', int), ('team', str),
        ('games_played', int), ('wins', int), ('draw', int), ('lost', int),
        ('total_goals_scored', int), ('total_goals_received', int), ('points', int),
    ]
    games_schema = [
        ('season', str), ('matchday', int), ('home_team', str),
        ('away_team', str), ('home_goals', int), ('away_goals', int),
    ]

    matchday_df = pd.DataFrame(matchdays, columns=['season', 'matchday', 'start_date', 'end_date'])
    matchday_df['matchday'] = matchday_df['matchday'].astype(int)
    matchday_df['league'] = league

    # The season label used in all file names comes from the matchday rows.
    season = matchday_df['season'].unique()
    assert len(season)==1, f'Expected DataFrame of only one season, found: {season}'
    season = season[0]

    matchday_path = save_path/f'{season}_matchdays.feather'
    matchday_df.to_feather(matchday_path)

    standings_df = pd.DataFrame(standings, columns=[name for name, _ in standings_schema])
    standings_df = set_dtypes(standings_df, [dtype for _, dtype in standings_schema])
    standings_df['league'] = league
    standings_path = save_path/f'{season}_standings.feather'
    standings_df.to_feather(standings_path)

    games_df = pd.DataFrame(games, columns=[name for name, _ in games_schema])
    games_df = set_dtypes(games_df, [dtype for _, dtype in games_schema])
    games_df['league'] = league
    games_path = save_path/f'{season}_games.feather'
    games_df.to_feather(games_path)

    return matchday_path, standings_path, games_path

In [None]:
# Smoke test: write the scraped season to ../data/, check that all three files exist,
# then delete them again so the run leaves nothing behind.
paths = save_season(standings, matchdays, games, 'bundesliga', Path('../data/'))

for path in paths:
    assert path.is_file()
    os.remove(path)