In [53]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

In [54]:
# League identifiers to scrape, spelled exactly as they appear inside weltfussball.de URLs.
# Written one per line and split on whitespace, which yields a plain list of strings.
leagues = """
    fra-ligue-1
    eng-premier-league
    esp-primera-division
    ita-serie-a
    bundesliga
    mex-primera-division
    bra-serie-a
    arg-primera-division
    2-bundesliga
    eng-championship
    por-primeira-liga
    chn-super-league
    aus-a-league
    ita-serie-b
""".split()

In [55]:
# Season identifiers, as they appear in the URL, for a given league:
def get_season(league):
    """Return the list of season identifiers to scrape for ``league``.

    Two leagues ('chn-super-league' and 'bra-serie-a') use single-year
    identifiers; every other league uses 'YYYY-YYYY' identifiers.

    Parameters
    ----------
    league : str
        League identifier as used in the URL.

    Returns
    -------
    list of str
        Season identifiers, oldest first.
    """
    single_year_leagues = ('chn-super-league', 'bra-serie-a')
    if league not in single_year_leagues:
        return ['2016-2017', '2017-2018', '2018-2019', '2019-2020']
    return ['2017', '2018', '2019']

In [56]:
# Function that requests the first match-day page of a league season:
def get_path(league, season, timeout=30):
    """Download the first match-day ("spieltag/1") page of a league season.

    Despite its name, this function returns the HTTP response of the request,
    not a URL or path.

    Parameters
    ----------
    league : str
        League identifier as used in the URL.
    season : str
        Season identifier as used in the URL.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole scrape indefinitely.

    Returns
    -------
    requests.Response
        The response for the match-day page. The status code is not checked.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    if league == 'mex-primera-division':
        # This league's URL carries an extra "clausura" part after the season.
        url = f'https://www.weltfussball.de/spielplan/mex-primera-division-{season}-clausura-spieltag/1/'
    else:
        url = f'https://www.weltfussball.de/spielplan/{league}-{season}-spieltag/1/'
    return requests.get(url, timeout=timeout)

In [57]:
# Function that scrapes the results of a match:
def get_title_data(match):
    """Extract the match facts encoded in the ``<title>`` text of a match page.

    The parsing assumes titles shaped like
    ``"Home - Away 2:1 (League 2016/2017, 3. Spieltag)"``. Titles that do not
    fit this shape are skipped.

    Parameters
    ----------
    match : object
        Parsed page offering ``find_all('title')`` (a BeautifulSoup document
        in this notebook). Each returned element is iterated and the
        ``.string`` of every child is parsed.

    Returns
    -------
    list of list
        One ``[home_team, away_team, home_score, away_score, league, season,
        matchday]`` row per parsable title; scores and matchday are ints.
    """
    # The score sits at the end of the "Away 2:1 " fragment. Matching whole
    # digit runs (instead of fixed character offsets) keeps scores >= 10 intact.
    score_pattern = re.compile(r'\s(\d+):(\d+)\s*$')
    # "League 2016/2017" or "League 2017": the season is the trailing year token.
    season_pattern = re.compile(r'^(.*\S)\s+(\d{4}(?:[/-]\d{4})?)$')

    results = []
    for title_tag in match.find_all('title'):
        for child in title_tag:
            text = child.string
            if text is None:
                # Nothing to parse; re.split would raise on None.
                continue
            parts = re.split('[(,]', text)
            try:
                teams = parts[0].split(' - ')
                home_team = teams[0]
                away_and_score = teams[1]
                score = score_pattern.search(away_and_score)
                if score is None:
                    # No numeric score at the end (e.g. a placeholder result).
                    continue
                away_team = away_and_score[:score.start()]
                home_score = int(score.group(1))
                away_score = int(score.group(2))

                league_and_season = parts[1]
                season_match = season_pattern.match(league_and_season)
                if season_match is not None:
                    league, season = season_match.groups()
                else:
                    # Fall back to the fixed-width layout ("... YYYY/YYYY").
                    league = league_and_season[:-10]
                    season = league_and_season[-9:]

                matchday = int(parts[2].split('.')[0].strip())
            except (IndexError, ValueError):
                # The title does not follow the expected layout: skip it.
                continue
            results.append([home_team, away_team, home_score, away_score,
                            league, season, matchday])
    return results

In [58]:
# Function that computes how long the match was drawn, led by the home team and led by the away team:
def lead(text):
    """Compute the minutes a match spent drawn, with home leading and with away leading.

    Every match is treated as lasting 93 minutes: 92 nominal minutes plus one
    extra minute that is credited to whichever state was active across the
    45-minute mark.

    Parameters
    ----------
    text : list of str
        ``repr()`` strings of the table cells of a match page, in page order.
        An entry containing ``' : '`` is read as the running score after a
        goal, and the entry two positions later as the minute of that goal.
        The entry ``"'keine'"`` at index 6 marks a match without goals.

    Returns
    -------
    list of int
        ``[draw_minutes, home_lead_minutes, away_lead_minutes]``.

    Raises
    ------
    ValueError
        If a score entry contains no number or a goal minute is not numeric.
    IndexError
        If a score entry is not followed by a minute entry two positions later.
    """
    half_time = 45  # last minute of the first half
    full_time = 92  # nominal match length; one half-time minute is added on top

    def first_int(fragment):
        # Read the whole leading number so that scores >= 10 are not truncated
        # to their first digit.
        found = re.search(r'\d+', fragment)
        if found is None:
            raise ValueError(f'no goal count found in {fragment!r}')
        return int(found.group())

    minutes = {'draw': 0, 'home': 0, 'away': 0}
    state = 'draw'
    last_minute = 0
    last_index = len(text) - 1

    for i, entry in enumerate(text):
        if i == 6 and entry == '\'keine\'':
            # No goals: the whole match was a draw.
            return [full_time + 1, 0, 0]

        if ' : ' in entry:
            score_parts = entry.split(':')
            goal_home = first_int(score_parts[0])
            goal_away = first_int(score_parts[1])
            minute = int(text[i + 2].split('.')[0].strip('\''))

            # Credit the time since the previous goal to the state that held until now.
            minutes[state] += minute - last_minute
            if minute > half_time and last_minute <= half_time:
                minutes[state] += 1
            last_minute = minute

            if goal_home == goal_away:
                state = 'draw'
            elif goal_home > goal_away:
                state = 'home'
            else:
                state = 'away'
        elif i == last_index:
            # Closing segment: from the last goal (or kick-off) to the final whistle.
            minutes[state] += full_time - last_minute
            if last_minute <= half_time:
                # This segment spans half-time, so it receives the extra minute
                # as well; otherwise such matches would total 92 instead of 93.
                minutes[state] += 1

    return [minutes['draw'], minutes['home'], minutes['away']]

In [59]:
# German month name -> two-digit month number:
def get_month(string):
    """Translate a German month name into its zero-padded month number.

    Parameters
    ----------
    string : str
        Month name, e.g. 'Januar' or 'Dezember'.

    Returns
    -------
    str or None
        '01' ... '12' for a known month name, otherwise None.
    """
    month_table = (
        ('Januar', '01'),
        ('Februar', '02'),
        ('März', '03'),
        ('April', '04'),
        ('Mai', '05'),
        ('Juni', '06'),
        ('Juli', '07'),
        ('August', '08'),
        ('September', '09'),
        ('Oktober', '10'),
        ('November', '11'),
        ('Dezember', '12'),
    )
    for name, number in month_table:
        if string == name:
            return number
    return None

In [60]:
# Function that converts scraped dates data into date format:
from datetime import date

def get_date(date_list):
    """Convert scraped date strings into ``datetime.date`` objects.

    Each entry is split at its first comma; the part after it is split on
    whitespace into day, month name and year tokens. Quotes, brackets and dots
    around the day and the quote after the year are stripped before conversion.

    Parameters
    ----------
    date_list : list of str
        Scraped date strings containing a comma followed by day, German month
        name and year.

    Returns
    -------
    list of datetime.date
        One date per input entry, in input order.
    """
    token_rows = [entry.split(',')[1].split() for entry in date_list]
    converted = []
    for tokens in token_rows:
        year = int(tokens[2].strip('\''))
        month = int(get_month(tokens[1]))
        day = int(tokens[0].strip('[\'.]'))
        converted.append(date(year, month, day))
    return converted

In [61]:
# Splits a list of [draw, home, away] triples into three parallel lists:
def get_time(lead_list):
    """Separate leading-time triples into one list per component.

    Parameters
    ----------
    lead_list : iterable
        Entries indexable at positions 0, 1 and 2 (draw, home and away time).

    Returns
    -------
    tuple of list
        ``(draw, home, away)``, each holding the respective component of every
        entry in input order.
    """
    columns = ([], [], [])
    for entry in lead_list:
        for position, column in enumerate(columns):
            column.append(entry[position])
    return columns

In [62]:
# Empty data frame with the title-derived columns; all scraped match data is added to it:
weltfussball = pd.DataFrame(
    data=[],
    columns=[
        'home_team',
        'away_team',
        'home_score',
        'away_score',
        'league',
        'season',
        'Spieltag',
    ],
)

In [63]:
# Scrapes data from weltfussball.de:
# Walks every league and season, follows each round listed in the season page's
# "runde" selector, and downloads every linked match report ("spielbericht").
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from blocking the scrape forever


def _scrape_match(report_href):
    """Download one match report and return its data as a DataFrame, or None to skip it.

    Parameters
    ----------
    report_href : str
        Site-relative link to the match report.

    Returns
    -------
    pandas.DataFrame or None
        The rows produced by ``get_title_data`` extended by ``date``,
        ``draw_time``, ``home_lead`` and ``away_lead``. None when the page has
        no usable result box, the result is the '-:-' placeholder, or the page
        title cannot be parsed.
    """
    response = requests.get(f'''https://www.weltfussball.de{report_href}''', timeout=REQUEST_TIMEOUT)
    match = BeautifulSoup(response.content, 'html.parser')

    result_box = match.find('div', attrs={'class': 'resultat'})
    if result_box is None:
        return None
    score_strings = list(result_box.stripped_strings)
    # '-:-' is presumably the placeholder for a match without a result.
    # The match is handled once, not once per string found in the result box.
    if not score_strings or any('-:-' in score for score in score_strings):
        return None

    title_data = get_title_data(match)
    if not title_data:
        print(f'Skipping {report_href}: the page title could not be parsed')
        return None

    # repr() of every table cell, in page order, as expected by lead() and get_date().
    cell_strings = [repr(cell)
                    for table in match.find_all('table', attrs={'class': 'standard_tabelle'})
                    for cell in table.stripped_strings]

    frame = pd.DataFrame(title_data, columns = ['home_team', 'away_team', 'home_score', 'away_score', 'league', 'season', 'Spieltag'])
    # The second cell of the page's tables is used as the match date.
    frame['date'] = get_date([cell_strings[1]])[0]
    draw_time, home_lead, away_lead = lead(cell_strings)
    frame['draw_time'] = draw_time
    frame['home_lead'] = home_lead
    frame['away_lead'] = away_lead
    return frame


match_frames = []
for league in leagues:
    for season in get_season(league):
        season_page = BeautifulSoup(get_path(league, season).content, 'html.parser')
        for round_select in season_page.find_all('select', attrs={'name': 'runde'}):
            for round_option in round_select.find_all('option'):
                round_response = requests.get(f'''https://www.weltfussball.de/{round_option['value']}''', timeout=REQUEST_TIMEOUT)
                round_page = BeautifulSoup(round_response.content, 'html.parser')
                fixtures = round_page.find(attrs={'class': 'standard_tabelle'})
                if fixtures is None:
                    # Round page without a fixtures table: nothing to follow.
                    continue
                for link in fixtures.find_all('a'):
                    href = link.get('href', '')
                    if 'spielbericht' not in href:
                        continue
                    match_frame = _scrape_match(href)
                    if match_frame is not None:
                        match_frames.append(match_frame)

# DataFrame.append no longer exists in pandas 2.x and copied the whole frame on
# every call; collect the per-match frames and concatenate them once instead.
if match_frames:
    pieces = match_frames if weltfussball.empty else [weltfussball] + match_frames
    weltfussball = pd.concat(pieces, ignore_index=True)

In [66]:
# Write the collected match data to a CSV file in the working directory:
weltfussball.to_csv(path_or_buf='weltfussball_data.csv')