In [1]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
from urllib.parse import urljoin
import pandas as pd

# Index range for all 34 matchdays each season (1..34; the upper bound 35 is exclusive)
matchday_ind_ls = range(1, 35)

# Index range for all season start years from 1994 (94/95) to 2022 (22/23);
# the upper bound 2023 is exclusive
season_ind_ls = range(1994, 2023)

# Index range for all 18 clubs (1..18); used as 1-based table row positions
# in the XPath queries of the clubs overview
clubs_ind_ls = range(1,19)


# Base URL of the Transfermarkt website; request paths are joined onto it
BASE_URL = 'https://www.transfermarkt.de/'

# Browser-like User-Agent header that is passed to every requests.get call
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}


def CLUBS_OVERVIEW_URL(season, timeout=30):
    """
    Requests the Bundesliga overview page listing all clubs of a specific season

    Args:
        season (int): Season start year in the form yyyy (e.g. 2022 for 2022/23)
        timeout (float): Seconds to wait for the server before the request is aborted.
            Defaults to 30.

    Returns:
        response object: Provides methods and attributes to access the data returned by the HTTP request

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout` seconds
    """
    url = urljoin(BASE_URL, f'bundesliga/startseite/wettbewerb/L1/plus/?saison_id={season}')
    # Without an explicit timeout requests waits indefinitely on an unresponsive
    # server, which would stall the whole scraping loop.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Echo the requested URL so the scraping progress is visible in the cell output
    print(url)
    return response

# Overview of all match results for a given season and matchday
def MATCH_RESULTS_URL(season, matchday, timeout=30):
    """
    Requests the page with all match results of a specific season and a specific matchday
    
    Args:
        season (int): Season start year in the form yyyy (e.g. 2022 for 2022/23)
        matchday (int): Matchday index; the notebook iterates over 1 to 34
        timeout (float): Seconds to wait for the server before the request is aborted.
            Defaults to 30.

    Returns:
        response object: Provides methods and attributes to access the data returned by the HTTP request

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout` seconds
    """
    url = urljoin(BASE_URL, f'bundesliga/spieltag/wettbewerb/L1/plus/?saison_id={season}&spieltag={matchday}')
    # Without an explicit timeout requests waits indefinitely on an unresponsive
    # server, which would stall the whole scraping loop.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Echo the requested URL so the scraping progress is visible in the cell output
    print(url)
    return response



def dict_to_df(col_name_ls, value_ls):
    """
    Builds a DataFrame from a list of column names and a parallel list of column values
    
    Args:
        col_name_ls (list[str]): Column names, in the desired column order
        value_ls (list[list]): One list of values per column; value_ls[i] holds the
            values of the column named col_name_ls[i]. Surplus entries are ignored.

    Returns:
        df (DataFrame): DataFrame with one column per entry of col_name_ls

    Raises:
        IndexError: If value_ls has fewer entries than col_name_ls
    """
    # Named `columns` instead of `dict` so the builtin type is not shadowed.
    # Indexing (rather than zip) keeps the IndexError on a too-short value_ls.
    columns = {name: value_ls[position] for position, name in enumerate(col_name_ls)}
    return pd.DataFrame(columns)

# Clubs overview across all seasons

In [None]:
# Loop through all seasons and scrape the clubs overview table of each one.
# The per-season frames are collected in a list and concatenated once at the
# end, instead of re-copying the accumulated frame on every iteration.
season_frames = []
for season in season_ind_ls:
    response = CLUBS_OVERVIEW_URL(season)
    soup = BeautifulSoup(response.content, 'lxml')

    # Parse the page into an lxml tree once per season; the XPath lookups
    # below all query this same tree.
    dom = etree.HTML(str(soup))

    # Container of the clubs table, shared by the BeautifulSoup lookups below
    clubs_table = soup.find('div', id='yw1')

    # Clubnames
    clubs_ls = [
        club.text.strip()
        for club in clubs_table.find_all('td', class_='hauptlink no-border-links')
    ]

    # The number of players in the pool (only links with non-empty text carry a value)
    players_count_ls = [
        item.text.strip()
        for player_count in clubs_table.find_all('td', class_='zentriert')
        for item in player_count.find_all('a')
        if item.text != ''
    ]

    # Average age of all players within a club (4th cell of each table row)
    players_avg_age_ls = [
        dom.xpath(f"//*[@id='yw1']/table/tbody/tr[{row}]/td[4]")[0].text.strip()
        for row in clubs_ind_ls
    ]

    # Number of legionaries within a club (5th cell of each table row)
    legionaries_ls = [
        dom.xpath(f"//*[@id='yw1']/table/tbody/tr[{row}]/td[5]")[0].text.strip()
        for row in clubs_ind_ls
    ]

    # Average market value of each club (6th cell of each table row)
    avg_market_value_ls = [
        dom.xpath(f"//*[@id='yw1']/table/tbody/tr[{row}]/td[6]")[0].text.strip()
        for row in clubs_ind_ls
    ]

    # Total market value of each club
    total_market_values_ls = [
        link.text.strip()
        for cell in soup.find('table', class_='items').find_all('td', class_="rechts")
        for link in cell.find_all('a')
    ]

    # Write all Information to a Dataframe
    col_name_ls = ['CLUB_NAME', 'PLAYERS_COUNT', 'PLAYERS_AVG_AGE', 'LEGIONARIES_COUNT', 'AVG_MARKET_VALUE', 'TOTAL_MARKET_VALUE']
    value_ls = [clubs_ls, players_count_ls, players_avg_age_ls, legionaries_ls, avg_market_value_ls, total_market_values_ls]
    df = dict_to_df(col_name_ls=col_name_ls, value_ls=value_ls)

    # add season information as column to dataframe
    df['season'] = f"{season}/{season+1}"

    season_frames.append(df)

# Concat all dataframes of each season into one dataframe
# (pd.concat rejects an empty list, hence the fallback to an empty frame)
df_clubs_overview = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [None]:
# Display the clubs overview collected over all scraped seasons
df_clubs_overview

# Retrieve matchday data

In [47]:
import re
def extract_betting_percentages(word: str):
    """Collect every '<number> %' value in `word` as a float.

    Numbers use a decimal comma in the input (e.g. '8,3 %'), which is
    converted to a decimal point before parsing. The values are returned in
    order of appearance; on a matchday page these are the tips for home win,
    remis and away win.
    """
    percentages = []
    # Each hit captures the digits/commas directly followed by ' %'
    for hit in re.finditer(r'([\d,]+) %', word):
        percentages.append(float(hit.group(1).replace(',', '.')))
    return percentages

In [48]:
def extract_club_standings(word: str, pattern: str, home: bool):
    """Split `word` into club name and standing using a two-group `pattern`.

    For a home team the standing is expected in group 1 and the club in
    group 2; for an away team the groups are the other way round. Returns
    the tuple (club, number), both as strings, or (None, None) after
    printing a notice when `pattern` does not match at the start of `word`.
    """
    found = re.match(pattern, word)

    # Guard clause: nothing to extract when the pattern does not apply
    if found is None:
        print("No match found.")
        return None, None

    first, second = found.group(1), found.group(2)
    return (second, first) if home else (first, second)

In [49]:
def get_matchday_data(season: int, matchday: int):
    """Retrieve matchday data for given season and matchday

    Downloads the match results page and builds one row per match.

    Args:
        season (int): Season start year in the form yyyy
        matchday (int): Index of the matchday within the season

    Returns:
        DataFrame: Columns DATE, WEEKDAY, MONTH, SEASON, MATCHDAY, HOME_TEAM,
        PLACE_HOME_TEAM, AWAY_TEAM, PLACE_AWAY_TEAM, WIN_PERC_HOME, REMIS_PERC,
        WIN_PERC_AWAY, HOME_GOALS, AWAY_GOALS, RESULT, REFEREE (in that order).

    Raises:
        ValueError: If the number of scraped results, referees or dates does
            not equal the number of scraped matches (raised by pandas on
            column assignment).
    """

    response = MATCH_RESULTS_URL(season, matchday)
    soup = BeautifulSoup(response.content, 'lxml')

    # retrieve dates for matchdays: every dd.mm.yyyy occurrence in a link text
    date_pattern = r'\b\d{2}\.\d{2}\.\d{4}\b'
    dates = [
        date
        for anchor in soup.find_all('a')
        for date in re.findall(date_pattern, anchor.text.strip())
    ]

    # retrieve tipps for matchdays
    tipps = [
        tuple(extract_betting_percentages(row.text.strip()))
        for row in soup.find_all('tr', class_='no-border tm-user-tendenz')
    ]

    # retrieve all home teams for given matchday; text looks like "(7.) Club"
    home_pattern = r'^\((\d+)\.\)\s+(.+)$'
    home = [
        extract_club_standings(cell.text.strip(), home_pattern, home=True)
        for cell in soup.find_all('td', class_='rechts hauptlink no-border-rechts hide-for-small spieltagsansicht-vereinsname')
    ]

    # retrieve all guest teams for given matchday; text looks like "Club (1.)"
    away_pattern = r'^(.+)\s+\((\d+)\.\)$'
    away = []
    for cell in soup.find_all('td', class_='hauptlink no-border-links no-border-rechts hide-for-small spieltagsansicht-vereinsname'):
        club, number = extract_club_standings(cell.text.strip(), away_pattern, home=False)
        # extract_club_standings yields (None, None) when the text does not
        # match; only strip tab characters from a club name that was found.
        if club is not None:
            club = club.replace('\t', '')
        away.append((club, number))

    # create dataframe with match data
    columns = ["HOME_TEAM", "PLACE_HOME_TEAM", "AWAY_TEAM", "PLACE_AWAY_TEAM","WIN_PERC_HOME", 
               "REMIS_PERC", "WIN_PERC_AWAY"]
    matches = [t1 + t2 + t3 for t1, t2, t3 in zip(home, away, tipps)]
    df = pd.DataFrame(matches, columns=columns)

    # retrieve match results ("home:away"), splitting each score only once
    split_scores = [
        span.text.strip().split(':')
        for span in soup.find_all('span', class_='matchresult finished')
    ]
    df['HOME_GOALS'] = [int(parts[0]) for parts in split_scores]
    df['AWAY_GOALS'] = [int(parts[1]) for parts in split_scores]
    # Plain comprehension instead of a row-wise DataFrame.apply: same labels,
    # and it also yields a proper (empty) column for a frame without rows.
    df['RESULT'] = [
        'HOME_WIN' if home_goals > away_goals
        else 'AWAY_WIN' if home_goals < away_goals
        else 'DRAW'
        for home_goals, away_goals in zip(df['HOME_GOALS'], df['AWAY_GOALS'])
    ]

    # retrieve referee for each match
    referees = soup.find_all('a', href = re.compile(r'.*profil/schiedsrichter/.*'))
    df['REFEREE'] = [referee.text.strip() for referee in referees]

    df['DATE'] = dates
    df['DATE'] = pd.to_datetime(df['DATE'], format='%d.%m.%Y')
    df['WEEKDAY'] = df.DATE.dt.weekday  # Monday=0 ... Sunday=6
    df['MONTH'] = df.DATE.dt.month
    df['SEASON'] = season
    df['MATCHDAY'] = matchday

    # reorder columns: date/season information first, remaining columns in creation order
    reorder = ['DATE', 'WEEKDAY', 'MONTH', 'SEASON', 'MATCHDAY']
    df = df[reorder + [col for col in df.columns if col not in reorder]]
    return df

In [50]:
# Index range for all 34 matchdays each season
matchday_ind_ls = range(1, 35)

# Season start years to scrape: currently only 2022 (season 2022/23).
# NOTE: this reassigns the season range defined at the top of the notebook.
season_ind_ls = range(2022, 2023)

# Collect one frame per matchday and concatenate once at the end, instead of
# re-copying the accumulated frame on every iteration.
matchday_frames = []
for season in season_ind_ls:
    for matchday in matchday_ind_ls:
        matchday_frames.append(get_matchday_data(season, matchday))

# pd.concat rejects an empty list, hence the fallback to an empty frame
df = pd.concat(matchday_frames) if matchday_frames else pd.DataFrame()

df.head()

https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=1
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=2
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=3
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=4
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=5
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=6
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=7
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=8
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=9
https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=10
https://www.transfermarkt.de/bundesliga/spieltag/

Unnamed: 0,DATE,WEEKDAY,MONTH,SEASON,MATCHDAY,HOME_TEAM,PLACE_HOME_TEAM,AWAY_TEAM,PLACE_AWAY_TEAM,WIN_PERC_HOME,REMIS_PERC,WIN_PERC_AWAY,HOME_GOALS,AWAY_GOALS,RESULT,REFEREE
0,2022-08-05,4,8,2022,1,E. Frankfurt,7,Bayern München,1,8.3,15.1,76.5,1,6,AWAY_WIN,Deniz Aytekin
1,2022-08-06,5,8,2022,1,Union Berlin,4,Hertha BSC,18,77.0,17.1,5.9,3,1,HOME_WIN,Marco Fritz
2,2022-08-06,5,8,2022,1,Bor. M'gladbach,10,TSG Hoffenheim,12,69.6,22.5,7.9,3,1,HOME_WIN,Daniel Siebert
3,2022-08-06,5,8,2022,1,VfL Wolfsburg,8,Werder Bremen,13,75.1,17.6,7.3,2,2,DRAW,Sascha Stegemann
4,2022-08-06,5,8,2022,1,VfL Bochum,14,1.FSV Mainz 05,9,23.2,37.9,38.9,1,2,AWAY_WIN,Felix Zwayer


In [51]:
# Sanity check: the scraped frame should hold 9 rows for every (season, matchday) pair
# (presumably 9 matches per matchday, i.e. 18 clubs / 2 — TODO confirm)
len(season_ind_ls) * len(matchday_ind_ls) * 9 == df.shape[0]

True