In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [30]:
from betting.scraping import *

import urllib.request
import re
from dataclasses import dataclass

# Scraping for Bundesliga data

In [3]:
# Root of the scraped site; the relative links collected below are appended to it.
base_url = 'https://www.sport.de'
# Results-and-table page of one season, used as the entry point for scraping.
example_url = f'{base_url}/fussball/deutschland-bundesliga/se580/2006-2007/ergebnisse-und-tabelle'

In [4]:
# Fetch the example page. `get_html` is not defined in this notebook
# (presumably it comes from the `betting.scraping` star import); its result is
# queried with `find_all` in the cells below.
bs = get_html(example_url)

## Find season links

In [5]:
# The season drop-down lists one <option> per season; its `value` is the
# relative URL of that season's results page.
select_element = bs.find_all('select', {'class': 'navigation season-navigation'})[0]
season_url_pattern = re.compile(r'''/fussball/deutschland-bundesliga/.+/(\d\d\d\d-\d\d\d\d)/ergebnisse-und-tabelle/''')

# Maps a four-digit season key (e.g. '1213' for 2012-2013) to the season URL.
seasons = {}

for option in select_element.find_all('option'):
    match = season_url_pattern.match(option['value'])
    if match is None:
        # Not a season link: show it for inspection and move on.
        print(option['value'])
        continue
    url, season = match.group(0, 1)
    # 'YYYY-YYYY' -> last two digits of the start year + last two of the end year.
    season = season[2:4] + season[7:9]
    seasons[season] = url

In [6]:
# Display the collected season-key -> URL mapping.
seasons

{'2021': '/fussball/deutschland-bundesliga/se35753/2020-2021/ergebnisse-und-tabelle/',
 '1920': '/fussball/deutschland-bundesliga/se31723/2019-2020/ergebnisse-und-tabelle/',
 '1819': '/fussball/deutschland-bundesliga/se28567/2018-2019/ergebnisse-und-tabelle/',
 '1718': '/fussball/deutschland-bundesliga/se23906/2017-2018/ergebnisse-und-tabelle/',
 '1617': '/fussball/deutschland-bundesliga/se20812/2016-2017/ergebnisse-und-tabelle/',
 '1516': '/fussball/deutschland-bundesliga/se18336/2015-2016/ergebnisse-und-tabelle/',
 '1415': '/fussball/deutschland-bundesliga/se15388/2014-2015/ergebnisse-und-tabelle/',
 '1314': '/fussball/deutschland-bundesliga/se11976/2013-2014/ergebnisse-und-tabelle/',
 '1213': '/fussball/deutschland-bundesliga/se9024/2012-2013/ergebnisse-und-tabelle/',
 '1112': '/fussball/deutschland-bundesliga/se7094/2011-2012/ergebnisse-und-tabelle/',
 '1011': '/fussball/deutschland-bundesliga/se5823/2010-2011/ergebnisse-und-tabelle/',
 '0910': '/fussball/deutschland-bundesliga/se1

## Matchday information

### Matchday links

In [7]:
# Pick the 2012-2013 season and load its overview page.
# Note: this rebinds the notebook-level names `season` and `bs`.
season = seasons['1213']
bs = get_html(base_url+season)

In [8]:
def get_matchday_links(bs, expected_length=34):
    """Collect the matchday links found on a season page.

    Every ``<li>`` element that carries a ``data`` attribute contributes that
    attribute's value; ``<li>`` elements without one are skipped.

    Parameters
    ----------
    bs : parsed page exposing ``find_all`` (e.g. a BeautifulSoup object)
    expected_length : int or None
        Number of links the page is expected to contain. Pass ``None`` to
        skip the sanity check.

    Returns
    -------
    list of str
        The ``data`` attribute values in document order.

    Raises
    ------
    AssertionError
        If ``expected_length`` is given and the number of links differs.
    """
    matchday_links = []
    for li in bs.find_all('li'):
        try:
            matchday_links.append(li['data'])
        except KeyError:
            # Only a missing `data` attribute is expected here; any other
            # error points at a real problem and must not be hidden.
            continue

    # Raised explicitly so the check still runs when assertions are disabled.
    if expected_length is not None and len(matchday_links) != expected_length:
        raise AssertionError(f'Expected: {expected_length}. Actual: {len(matchday_links)}')
    return matchday_links

In [9]:
# Collect the matchday links of the loaded season page and preview the first five.
matchday_links = get_matchday_links(bs)
matchday_links[:5]

['/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md1/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md2/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md3/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md4/ergebnisse-und-tabelle/',
 '/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md5/ergebnisse-und-tabelle/']

### Dates

In [10]:
# Load the first matchday page.
# Note: `bs` is rebound again and now refers to the matchday page, not the season page.
matchday_link = matchday_links[0]
bs = get_html(base_url+matchday_link)

In [11]:
def get_matchday_dates(matchday_bs):
    """Return the date strings listed on a matchday page.

    Parameters
    ----------
    matchday_bs : parsed matchday page exposing ``find_all``

    Returns
    -------
    list of str
        For every ``<div class="match-date">``, the part of its text before
        the first space (e.g. ``'24.08.2012'``), in document order.
    """
    # Query the page that was passed in, not whatever the notebook-level
    # `bs` happens to point at when the function is called.
    divs = matchday_bs.find_all('div', {'class': 'match-date'})
    dates = [div.text.split(' ')[0] for div in divs]
    return dates

In [12]:
# Try the date extraction on the currently loaded matchday page.
get_matchday_dates(bs)

['24.08.2012', '25.08.2012', '25.08.2012', '25.08.2012', '26.08.2012']

In [13]:
# Show which matchday link the loaded page was fetched from.
matchday_link

'/fussball/deutschland-bundesliga/se9024/2012-2013/ro29872/spieltag/md1/ergebnisse-und-tabelle/'

### Matches

In [79]:
@dataclass
class Match:
    """One match: the two teams and the number of goals each of them scored."""
    home_team: str  # name of the home team
    away_team: str  # name of the away team
    home_goals: int  # goals scored by the home team
    away_goals: int  # goals scored by the away team

In [80]:
# Candidate match containers: every <div> that carries a `position` attribute.
divs = [div for div in bs.find_all('div') if 'position' in div.attrs]
len(divs)

9

In [81]:
def _extract_goals(match_div, side):
    """Return the goals of `side` ('home' or 'away') read from one match container."""
    side_div = match_div.find('div', {'class': f'match-result match-result-{side}'})
    # NOTE(review): 'match-result-0' is presumably the final-score slot;
    # confirm against the page markup.
    return int(side_div.find('div', {'class': 'match-result match-result-0'}).text)


def get_match_infos(matchday_bs):
    """Extract every match (teams and score) from a matchday page.

    Parameters
    ----------
    matchday_bs : parsed matchday page exposing ``find_all``

    Returns
    -------
    list of Match
        One entry per ``<div>`` that carries a ``position`` attribute, in
        document order.
    """
    # Query the page that was passed in, not the notebook-level `bs`.
    match_divs = [div for div in matchday_bs.find_all('div') if 'position' in div.attrs]

    matches = []
    for div in match_divs:
        # The attribute filter must be a dict. A set literal such as
        # {'class', 'team-shortname-home'} is interpreted differently by the
        # parser and also accepts elements whose class is literally "class".
        home_team = div.find('div', {'class': 'team-shortname-home'}).text
        away_team = div.find('div', {'class': 'team-shortname-away'}).text

        home_goals = _extract_goals(div, 'home')
        away_goals = _extract_goals(div, 'away')

        matches.append(Match(home_team, away_team, home_goals, away_goals))

    return matches

In [83]:
# Parse the matches of the currently loaded matchday page and display them.
matches = get_match_infos(bs)
matches

[Match(home_team='Dortmund', away_team='SV Werder', home_goals=2, away_goals=1),
 Match(home_team="M'gladbach", away_team='Hoffenheim', home_goals=2, away_goals=1),
 Match(home_team='SC Freiburg', away_team='Mainz 05', home_goals=1, away_goals=1),
 Match(home_team='Augsburg', away_team='Düsseldorf', home_goals=0, away_goals=2),
 Match(home_team='Hamburger SV', away_team='Nürnberg', home_goals=0, away_goals=1),
 Match(home_team='Gr. Fürth', away_team='FC Bayern', home_goals=0, away_goals=3),
 Match(home_team='Frankfurt', away_team='Leverkusen', home_goals=2, away_goals=1),
 Match(home_team='VfB Stuttgart', away_team='Wolfsburg', home_goals=0, away_goals=1),
 Match(home_team='Hannover 96', away_team='Schalke 04', home_goals=2, away_goals=2)]

### Standings