In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from betting.scraping import *

import urllib.request
import re
from dataclasses import dataclass
from datetime import datetime
import pandas as pd
from pathlib import Path

# Scraping Bundesliga data from sport.de

In [3]:
# Root of the site being scraped; the relative links collected below are appended to it.
base_url = 'https://www.sport.de'
# Results-and-table page of one season (2006-2007); the season selector is read from this page.
example_url = base_url + '/fussball/deutschland-bundesliga/se580/2006-2007/ergebnisse-und-tabelle'

In [4]:
# Fetch the example page. `get_html` is not defined in this notebook
# (presumably it comes from the `betting.scraping` star import — confirm).
bs = get_html(example_url)

## Find season links

In [5]:
# Build `seasons_urls`: a mapping from a four-digit season code (e.g. '1213'
# for 2012-2013) to the relative URL of that season's results page, read from
# the options of the season drop-down on the page held in `bs`.
season_selects = bs.find_all('select', {'class': 'navigation season-navigation'})
select_element = season_selects[0]
season_link_re = re.compile(r'''/fussball/deutschland-bundesliga/.+/(\d\d\d\d-\d\d\d\d)/ergebnisse-und-tabelle/''')

seasons_urls = {}
for option in select_element.find_all('option'):
    matched = season_link_re.match(option['value'])
    if matched is not None:
        years = matched.group(1)  # e.g. '2012-2013'
        # Last two digits of the start year followed by those of the end year.
        season_code = years[2:4] + years[7:9]
        seasons_urls[season_code] = matched.group(0)
    else:
        # Option values that are not season links are printed and skipped.
        print(option['value'])

In [6]:
# Display the season-code -> relative-URL mapping built above.
seasons_urls

{'2021': '/fussball/deutschland-bundesliga/se35753/2020-2021/ergebnisse-und-tabelle/',
 '1920': '/fussball/deutschland-bundesliga/se31723/2019-2020/ergebnisse-und-tabelle/',
 '1819': '/fussball/deutschland-bundesliga/se28567/2018-2019/ergebnisse-und-tabelle/',
 '1718': '/fussball/deutschland-bundesliga/se23906/2017-2018/ergebnisse-und-tabelle/',
 '1617': '/fussball/deutschland-bundesliga/se20812/2016-2017/ergebnisse-und-tabelle/',
 '1516': '/fussball/deutschland-bundesliga/se18336/2015-2016/ergebnisse-und-tabelle/',
 '1415': '/fussball/deutschland-bundesliga/se15388/2014-2015/ergebnisse-und-tabelle/',
 '1314': '/fussball/deutschland-bundesliga/se11976/2013-2014/ergebnisse-und-tabelle/',
 '1213': '/fussball/deutschland-bundesliga/se9024/2012-2013/ergebnisse-und-tabelle/',
 '1112': '/fussball/deutschland-bundesliga/se7094/2011-2012/ergebnisse-und-tabelle/',
 '1011': '/fussball/deutschland-bundesliga/se5823/2010-2011/ergebnisse-und-tabelle/',
 '0910': '/fussball/deutschland-bundesliga/se1

## Matchday information

### Matchday links

In [7]:
# Use the 2012-2013 season as a working example and fetch its overview page.
# Note: `season` holds the relative URL here, not the season code.
season = seasons_urls['1213']
bs = get_html(base_url+season)

In [8]:
def get_matchday_links(bs, expected_length=34):
    """Collect the matchday links of a season overview page.

    Every ``<li>`` element that carries a ``data`` attribute contributes the
    value of that attribute; ``<li>`` elements without one are ignored.

    Parameters
    ----------
    bs
        Parsed page exposing ``find_all('li')``; each returned element must
        support ``.get('data')``.
    expected_length : int or None, default 34
        Number of links that must be found. Pass ``None`` to skip the check.

    Returns
    -------
    list
        The ``data`` attribute values in document order.

    Raises
    ------
    AssertionError
        If ``expected_length`` is not ``None`` and a different number of
        links was found.
    """
    matchday_links = []
    for li in bs.find_all('li'):
        # Only a missing attribute is skipped; any other error propagates
        # instead of being silently swallowed.
        link = li.get('data')
        if link is not None:
            matchday_links.append(link)

    # Raised explicitly so the check also runs under ``python -O``.
    if expected_length is not None and len(matchday_links) != expected_length:
        raise AssertionError(f'Expected: {expected_length}. Actual: {len(matchday_links)}')
    return matchday_links

In [9]:
# Extract the matchday links of the example season and preview the first five.
matchday_links = get_matchday_links(bs)
matchday_links[:5]

['/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md1/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md2/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md3/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md4/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md5/ergebnisse-und-tabelle/']

### Dates

In [10]:
# Load the first matchday page of the example season; from here on `bs` refers to this page.
matchday_link = matchday_links[0]
bs = get_html(base_url+matchday_link)

In [11]:
def get_matchday_dates(matchday_bs):
    """Return the match dates listed on a matchday page.

    Each ``<div class="match-date">`` is read; the part of its text before the
    first space is parsed as ``DD.MM.YYYY``.

    Parameters
    ----------
    matchday_bs
        Parsed matchday page exposing ``find_all``.

    Returns
    -------
    list of datetime.datetime
        One entry per ``match-date`` div, in document order (empty if the
        page has none).

    Raises
    ------
    ValueError
        If a date string does not match ``%d.%m.%Y``.
    """
    # Read from the page that was passed in, not from the notebook-global `bs`.
    date_divs = matchday_bs.find_all('div', {'class': 'match-date'})
    return [
        datetime.strptime(div.text.split(' ')[0], '%d.%m.%Y')
        for div in date_divs
    ]

In [12]:
# Parse the match dates of the matchday page loaded above and display them.
dates = get_matchday_dates(bs)
dates

[datetime.datetime(2012, 8, 24, 0, 0),
 datetime.datetime(2012, 8, 25, 0, 0),
 datetime.datetime(2012, 8, 25, 0, 0),
 datetime.datetime(2012, 8, 25, 0, 0),
 datetime.datetime(2012, 8, 26, 0, 0)]

In [13]:
# Show the relative link of the matchday page currently loaded.
matchday_link

'/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md1/ergebnisse-und-tabelle/'

### Matches

In [14]:
def get_match_infos(matchday_bs):
    """Extract teams and goals for every match on a matchday page.

    A match is any ``<div>`` that has a ``position`` attribute. Within it the
    team names come from the ``team-shortname-home`` / ``team-shortname-away``
    divs, and the goals from the ``match-result match-result-0`` div nested in
    the ``match-result match-result-home`` / ``match-result match-result-away``
    divs (presumably the final score — confirm).

    Parameters
    ----------
    matchday_bs
        Parsed matchday page exposing ``find_all``.

    Returns
    -------
    list of list
        One ``[home_team, away_team, home_goals, away_goals]`` entry per
        match, with the goals converted to ``int``.

    Raises
    ------
    AttributeError
        If an expected div is missing (``find`` returned ``None``).
    ValueError
        If a goal text is not an integer.
    """
    match_divs = [div for div in matchday_bs.find_all('div') if 'position' in div.attrs]
    matches = []
    for div in match_divs:
        # Attribute filters are dicts ({'class': ...}); the earlier set
        # literals ({'class', ...}) only worked by accident.
        home_team = div.find('div', {'class': 'team-shortname-home'}).text
        away_team = div.find('div', {'class': 'team-shortname-away'}).text

        home_div = div.find('div', {'class': 'match-result match-result-home'})
        home_goals = int(home_div.find('div', {'class': 'match-result match-result-0'}).text)

        away_div = div.find('div', {'class': 'match-result match-result-away'})
        away_goals = int(away_div.find('div', {'class': 'match-result match-result-0'}).text)

        matches.append([home_team, away_team, home_goals, away_goals])

    return matches

In [15]:
# Extract the match results of the example matchday page and display them.
matches = get_match_infos(bs)
matches

[['Dortmund', 'SV Werder', 2, 1],
 ["M'gladbach", 'Hoffenheim', 2, 1],
 ['SC Freiburg', 'Mainz 05', 1, 1],
 ['Augsburg', 'Düsseldorf', 0, 2],
 ['Hamburger SV', 'Nürnberg', 0, 1],
 ['Gr. Fürth', 'FC Bayern', 0, 3],
 ['Frankfurt', 'Leverkusen', 2, 1],
 ['VfB Stuttgart', 'Wolfsburg', 0, 1],
 ['Hannover 96', 'Schalke 04', 2, 2]]

### Standings

In [16]:
def get_standings(matchday_bs):
    """Read the league table shown on a matchday page.

    Every ``<tr class="standing">`` yields one entry of the form
    ``[rank, team, games_played, wins, draw, lost, total_goals_scored,
    total_goals_received, points]``. All values are the raw cell texts
    (strings); the two goal counts come from splitting the
    ``standing-goaldiff`` cell on ``':'``.

    Parameters
    ----------
    matchday_bs
        Parsed matchday page exposing ``find_all``.

    Returns
    -------
    list of list of str
        One entry per table row, in document order.
    """
    def cell_text(row, css_class):
        # Text of the <td> with the given class inside one table row.
        return row.find('td', {'class': css_class}).text

    leading_classes = (
        'standing-rank',
        'team-shortname',
        'standing-games_played',
        'standing-win',
        'standing-draw',
        'standing-lost',
    )

    table = []
    for row in matchday_bs.find_all('tr', {'class': 'standing'}):
        leading = [cell_text(row, css_class) for css_class in leading_classes]
        # The cell holds "scored:received".
        scored, received = cell_text(row, 'standing-goaldiff').split(':')
        points = cell_text(row, 'standing-points')
        table.append([*leading, scored, received, points])

    return table

In [17]:
# Display the table parsed from the example matchday page.
get_standings(bs)

[['1', 'FC Bayern', '1', '1', '0', '0', '3', '0', '3'],
 ['2', 'Düsseldorf', '1', '1', '0', '0', '2', '0', '3'],
 ['3', "M'gladbach", '1', '1', '0', '0', '2', '1', '3'],
 ['3', 'Dortmund', '1', '1', '0', '0', '2', '1', '3'],
 ['3', 'Frankfurt', '1', '1', '0', '0', '2', '1', '3'],
 ['6', 'Nürnberg', '1', '1', '0', '0', '1', '0', '3'],
 ['6', 'Wolfsburg', '1', '1', '0', '0', '1', '0', '3'],
 ['8', 'Schalke 04', '1', '0', '1', '0', '2', '2', '1'],
 ['8', 'Hannover 96', '1', '0', '1', '0', '2', '2', '1'],
 ['10', 'Mainz 05', '1', '0', '1', '0', '1', '1', '1'],
 ['10', 'SC Freiburg', '1', '0', '1', '0', '1', '1', '1'],
 ['12', 'Hoffenheim', '1', '0', '0', '1', '1', '2', '0'],
 ['12', 'Leverkusen', '1', '0', '0', '1', '1', '2', '0'],
 ['12', 'SV Werder', '1', '0', '0', '1', '1', '2', '0'],
 ['15', 'Hamburger SV', '1', '0', '0', '1', '0', '1', '0'],
 ['15', 'VfB Stuttgart', '1', '0', '0', '1', '0', '1', '0'],
 ['17', 'Augsburg', '1', '0', '0', '1', '0', '2', '0'],
 ['18', 'Gr. Fürth', '1', '0

# Scrape

In [24]:
def scrape_season(season):
    """Scrape every matchday of one season.

    Looks up the season's overview page in the notebook-level ``seasons_urls``
    mapping, collects its matchday links and, for each matchday page, gathers
    the dates, the table and the match results. The matchday number (taken
    from the ``/md<N>/`` part of the link) is printed as progress output.

    Parameters
    ----------
    season : str
        Key into ``seasons_urls`` (e.g. ``'1213'``).

    Returns
    -------
    tuple of (list, list, list)
        ``standings`` with rows ``[season, matchday, *standing]``,
        ``matchdays`` with rows ``[season, matchday, first_date, last_date]``,
        and ``games`` with rows ``[season, matchday, *match]``. ``matchday``
        is kept as the string captured from the link.
    """
    overview_bs = get_html(base_url + seasons_urls[season])

    standings, matchdays, games = [], [], []

    for link in get_matchday_links(overview_bs):
        matchday = re.search('/md([^/]+)/', link).group(1)
        print(matchday)

        page = get_html(base_url + link)

        played_on = get_matchday_dates(page)
        matchdays.append([season, matchday, min(played_on), max(played_on)])

        # Prefix every table row and every match with the season and matchday.
        standings.extend([season, matchday, *entry] for entry in get_standings(page))
        games.extend([season, matchday, *result] for result in get_match_infos(page))

    return standings, matchdays, games

## Save

In [25]:
# Directory (relative to the notebook's working directory) that the feather files are written to.
save_path = Path('../../data/sportde/bundesliga')

In [30]:
def set_dtypes(df, dtypes):
    """Cast the leading columns of ``df`` to ``dtypes``, matched by position.

    ``dtypes[i]`` is applied to the i-th column; columns beyond
    ``len(dtypes)`` are left untouched. ``df`` is modified and also returned.

    Each column is replaced by label (``df[name] = ...``) rather than written
    through ``df.iloc[:, i] = ...``. On current pandas the ``iloc`` form sets
    values in place and keeps the column's existing dtype, so the cast would
    be lost. Column labels are assumed to be unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are cast.
    dtypes : sequence
        Target dtypes, one per leading column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with converted columns.

    Raises
    ------
    IndexError
        If more dtypes than columns are given.
    """
    dtypes = list(dtypes)
    columns = list(df.columns)
    if len(dtypes) > len(columns):
        raise IndexError(
            f'Got {len(dtypes)} dtypes for a frame with only {len(columns)} columns'
        )
    for name, dtype in zip(columns, dtypes):
        df[name] = df[name].astype(dtype)
    return df

def save_season(standings, matchdays, games, league, season=None):
    """Write one season's scraped rows to three feather files.

    Files are written into the notebook-level ``save_path`` directory as
    ``<season>_matchdays.feather``, ``<season>_standings.feather`` and
    ``<season>_games.feather``. Each frame gets an extra ``league`` column.

    Parameters
    ----------
    standings : list of list
        Rows ``[season, matchday, rank, team, games_played, wins, draw, lost,
        total_goals_scored, total_goals_received, points]``.
    matchdays : list of list
        Rows ``[season, matchday, start_date, end_date]``.
    games : list of list
        Rows ``[season, matchday, home_team, away_team, home_goals,
        away_goals]``.
    league
        Value stored in the ``league`` column of every frame.
        NOTE(review): the run cell in this notebook passes the season code
        here — confirm that is intended.
    season : str, optional
        Season code used in the file names. Defaults to the season stored in
        the first ``matchdays`` row, so the function no longer depends on a
        notebook-global ``season`` variable.

    Raises
    ------
    ValueError
        If ``season`` is not given and ``matchdays`` is empty.
    """
    if season is None:
        if not matchdays:
            raise ValueError('Cannot infer the season: `matchdays` is empty and no `season` was given.')
        # Every row starts with the season code it was scraped for.
        season = matchdays[0][0]

    matchday_df = pd.DataFrame(matchdays, columns=['season', 'matchday', 'start_date', 'end_date'])
    matchday_df['matchday'] = matchday_df['matchday'].astype(int)
    matchday_df['league'] = league
    matchday_df.to_feather(save_path/f'{season}_matchdays.feather')

    standings_df = pd.DataFrame(standings, columns=['season', 'matchday', 'rank', 'team', 'games_played', 'wins', 'draw', 'lost', 'total_goals_scored', 'total_goals_received', 'points'])
    standings_df = set_dtypes(standings_df, [str, int, int, str, int, int, int, int, int, int, int])
    standings_df['league'] = league
    standings_df.to_feather(save_path/f'{season}_standings.feather')

    games_df = pd.DataFrame(games, columns=['season', 'matchday', 'home_team', 'away_team', 'home_goals', 'away_goals'])
    games_df = set_dtypes(games_df, [str, int, str, str, int, int])
    games_df['league'] = league
    games_df.to_feather(save_path/f'{season}_games.feather')

## Run

In [28]:
# Season codes to scrape (keys of `seasons_urls`), 2008-2009 through 2019-2020.
seasons = ['0809', '0910', '1011', '1112', '1213', '1314', '1415', '1516', '1617', '1718', '1819', '1920']

In [29]:
# Scrape and save each season in turn, printing the season code as progress.
# NOTE(review): the season code is passed as the `league` argument of
# save_season, so the 'league' column will hold values like '0809' — confirm
# this is intended.
for season in seasons:
    print(season)
    standings, matchdays, games = scrape_season(season)
    save_season(standings, matchdays, games, season)

0809
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
0910
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1011
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1112
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1213
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1314
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1415
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1516
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1617
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1718
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
1819
1
2
3
4
5
6
7
8