In [144]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
from urllib.parse import urljoin
import pandas as pd

# Matchday indices 1..34 (the range end 35 is exclusive): all 34 matchdays of a season
matchday_ind_ls = range(1, 35)

# Season start years from 1994 (season 94/95) to 2022 (season 22/23); the range end 2023 is exclusive
season_ind_ls = range(1994, 2023)

# Row indices 1..18, one per club; used as 1-based table row positions in the XPath lookups
clubs_ind_ls = range(1,19)


# Base URL of the Transfermarkt website; relative page paths are joined onto it with urljoin
BASE_URL = 'https://www.transfermarkt.de/'

# HTTP headers sent with every request: a browser-like User-Agent string
# (presumably needed so the site serves the regular page to the scraper - verify)
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}


def CLUBS_OVERVIEW_URL(season, timeout=30):
    """
    Requests the overview page of all clubs for a specific season and prints the requested url

    Args:
        season (int): Season start year in form of yyyy (e.g. 2022 for season 22/23)
        timeout (float or tuple): Seconds to wait for the server before giving up;
            passed unchanged to requests.get. Defaults to 30.

    Returns:
        response object: Provides methods and attributes to access the data returned by the HTTP request

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout`
    """
    url = urljoin(BASE_URL, f'bundesliga/startseite/wettbewerb/L1/plus/?saison_id={season}')
    # Without an explicit timeout requests waits indefinitely on a stalled connection,
    # which would freeze the whole season loop
    response = requests.get(url, headers=headers, timeout=timeout)
    print(url)
    return response

# Overview of all match results for a given season and matchday
def MATCH_RESULTS_URL(season, matchday, timeout=30):
    """
    Requests the page with all match results of a specific season and matchday and prints the requested url
    
    Args:
        season (int): Season start year in form of yyyy (e.g. 2022 for season 22/23)
        matchday (int): Matchday index from 1 to 34
        timeout (float or tuple): Seconds to wait for the server before giving up;
            passed unchanged to requests.get. Defaults to 30.

    Returns:
        response object: Provides methods and attributes to access the data returned by the HTTP request

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout`
    """
    url = urljoin(BASE_URL, f'bundesliga/spieltag/wettbewerb/L1/plus/?saison_id={season}&spieltag={matchday}')
    # Without an explicit timeout requests waits indefinitely on a stalled connection
    response = requests.get(url, headers=headers, timeout=timeout)
    print(url)
    return response



def dict_to_df(col_name_ls, value_ls):
    """
    Pairs each column name with the value list at the same position and builds a DataFrame from the result

    Args:
        col_name_ls (list[str]): list of column names
        value_ls (list[list]): list of lists with the values for each column;
            value_ls[i] becomes the column named col_name_ls[i]

    Returns:
        df (DataFrame): one column per entry of col_name_ls, in the given order

    Raises:
        IndexError: If value_ls has fewer entries than col_name_ls
    """
    # Index into value_ls (instead of zip) so that a too-short value_ls still raises
    # rather than silently dropping columns; the local name avoids shadowing the built-in dict
    columns = {name: value_ls[pos] for pos, name in enumerate(col_name_ls)}
    return pd.DataFrame(columns)

# Clubs overview across all seasons

In [145]:
# Scrape the clubs overview table of every season and combine the results into one dataframe

def _overview_column(dom, col):
    """Returns the stripped text of table column `col` (1-based) for every club row of the 'yw1' table"""
    return [
        dom.xpath(f"//*[@id='yw1']/table/tbody/tr[{row}]/td[{col}]")[0].text.strip()
        for row in clubs_ind_ls
    ]


# One dataframe per season; concatenated once after the loop instead of growing a dataframe per iteration
season_frames = []
for season in season_ind_ls:
    response = CLUBS_OVERVIEW_URL(season)
    soup = BeautifulSoup(response.content, 'lxml')

    # Container of the overview table, used for the BeautifulSoup lookups
    overview_div = soup.find('div', id='yw1')
    # Parse the page once per season for the XPath lookups (it does not change between lookups)
    dom = etree.HTML(str(soup))

    # Clubnames
    clubs_ls = [
        cell.text.strip()
        for cell in overview_div.find_all('td', class_='hauptlink no-border-links')
    ]

    # The number of players in the pool (non-empty link texts inside the centered cells)
    players_count_ls = [
        link.text.strip()
        for cell in overview_div.find_all('td', class_='zentriert')
        for link in cell.find_all('a')
        if link.text != ''
    ]

    # Average age of all players within a club (4th table column)
    players_avg_age_ls = _overview_column(dom, 4)

    # Number of legionaries within a club (5th table column)
    legionaries_ls = _overview_column(dom, 5)

    # Average market value of each club (6th table column)
    avg_market_value_ls = _overview_column(dom, 6)

    # Total market value of each club (link texts inside the right-aligned cells)
    total_market_values_ls = [
        link.text.strip()
        for cell in soup.find('table', class_='items').find_all('td', class_="rechts")
        for link in cell.find_all('a')
    ]

    # Write all Information to a Dataframe
    col_name_ls = ['CLUB_NAME', 'PLAYERS_COUNT', 'PLAYERS_AVG_AGE', 'LEGIONARIES_COUNT', 'AVG_MARKET_VALUE', 'TOTAL_MARKET_VALUE']
    value_ls = [clubs_ls, players_count_ls, players_avg_age_ls, legionaries_ls, avg_market_value_ls, total_market_values_ls]
    df = dict_to_df(col_name_ls=col_name_ls, value_ls=value_ls)

    # add season information as column to dataframe
    df['season'] = f"{season}/{season+1}"

    season_frames.append(df)

# Concat all dataframes of each season into one dataframe (the per-season row index is kept as before)
df_clubs_overview = pd.concat(season_frames) if season_frames else pd.DataFrame()

https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1994
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1995
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1996
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1997
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1998
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1999
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2000
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2001
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2002
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2003
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2004
https://www.transfermarkt.de/bundesliga/startseite/wet

In [146]:
# Display the combined clubs overview of all scraped seasons
df_clubs_overview

Unnamed: 0,CLUB_NAME,PLAYERS_COUNT,PLAYERS_AVG_AGE,LEGIONARIES_COUNT,AVG_MARKET_VALUE,TOTAL_MARKET_VALUE,season
0,FC Schalke 04,28,254,7,-,-,1994/1995
1,SV Werder Bremen,26,286,5,-,-,1994/1995
2,1.FC Kaiserslautern,26,273,5,-,-,1994/1995
3,Hamburger SV,29,264,9,-,-,1994/1995
4,Bayer 05 Uerdingen,24,251,6,-,-,1994/1995
...,...,...,...,...,...,...,...
13,1.FC Köln,39,249,15,"2,97 Mio. €","115,65 Mio. €",2022/2023
14,Hertha BSC,41,254,21,"2,29 Mio. €","94,00 Mio. €",2022/2023
15,SV Werder Bremen,33,247,10,"2,71 Mio. €","89,50 Mio. €",2022/2023
16,FC Schalke 04,43,264,22,"2,04 Mio. €","87,78 Mio. €",2022/2023


# Retrieve matchday data

In [147]:
# Fetch the match results page for season 2022 (22/23), matchday 34, and parse it with the lxml parser
response = MATCH_RESULTS_URL(2022, 34)
soup = BeautifulSoup(response.content, 'lxml')

https://www.transfermarkt.de/bundesliga/spieltag/wettbewerb/L1/plus/?saison_id=2022&spieltag=34


In [148]:
# All user-tendency rows of the matchday page, reduced to their stripped text
match_tipps = soup.find_all('tr', class_='no-border tm-user-tendenz')
tipp_list = [tendency_row.text.strip() for tendency_row in match_tipps]

In [149]:
# Inspect the raw tendency strings collected from the matchday page
tipp_list

['93,9 %\n\n\n4,6 %\n\n\n1,5 %',
 '83,5 %\n\n\n7,8 %\n\n\n8,7 %',
 '86,8 %\n\n\n10,0 %\n\n\n3,2 %',
 '5,8 %\n\n\n11,5 %\n\n\n82,6 %',
 '68,9 %\n\n\n22,7 %\n\n\n8,3 %',
 '31,4 %\n\n\n37,1 %\n\n\n31,5 %',
 '93,6 %\n\n\n3,6 %\n\n\n2,8 %',
 '13,0 %\n\n\n16,5 %\n\n\n70,5 %',
 '70,9 %\n\n\n21,0 %\n\n\n8,0 %']

In [150]:
import re
def extract_betting_percentages(word: str):
    """Returns every percentage found in `word` as a float, in order of appearance.

    A percentage is a run of digits/commas followed by ' %'; commas are replaced
    by dots before the float conversion. For a tendency row this yields the values
    for win home team, remis and win away team.
    """
    percentages = []
    for hit in re.finditer(r'([\d,]+) %', word):
        # German decimal comma -> decimal point
        percentages.append(float(hit.group(1).replace(',', '.')))
    return percentages

In [151]:
# Parse every raw tendency string into a (win home, remis, win away) tuple;
# the three-way unpacking fails if a row does not contain exactly three percentages
tipps = [
    (win_perc_home, remis_perc, win_perc_away)
    for win_perc_home, remis_perc, win_perc_away in map(extract_betting_percentages, tipp_list)
]
tipps

[(93.9, 4.6, 1.5),
 (83.5, 7.8, 8.7),
 (86.8, 10.0, 3.2),
 (5.8, 11.5, 82.6),
 (68.9, 22.7, 8.3),
 (31.4, 37.1, 31.5),
 (93.6, 3.6, 2.8),
 (13.0, 16.5, 70.5),
 (70.9, 21.0, 8.0)]

In [152]:
# Collect the stripped cell text of every home team on the given matchday
home_teams = soup.find_all('td', class_='rechts hauptlink no-border-rechts hide-for-small spieltagsansicht-vereinsname')
home_list = [home_cell.text.strip() for home_cell in home_teams]

In [153]:
# Inspect the raw home-team strings collected from the matchday page
home_list

['(1.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tBor. Dortmund',
 '(3.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tRB Leipzig',
 '(4.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tUnion Berlin',
 '(10.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t1.FC Köln',
 "(11.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tBor. M'gladbach",
 '(8.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tE. Frankfurt',
 '(7.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tVfL Wolfsburg',
 '(16.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tVfL Bochum',
 '(15.)\xa0\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tVfB Stuttgart']

In [154]:
def extract_club_standings(word: str, pattern: str, home: bool):
    """Extracts club and standing from given word

    Args:
        word (str): Raw cell text containing a club name and its standing
        pattern (str): Regular expression with two capture groups, matched at the start of `word`
        home (bool): True if group 1 is the standing and group 2 the club,
            False if group 1 is the club and group 2 the standing

    Returns:
        tuple: (number, club) as strings, the club name with surrounding whitespace removed;
            (None, None) if the pattern does not match (a notice is printed in that case)
    """
    match = re.match(pattern, word)
    if match is None:
        print("No match found.")
        return None, None

    if home:
        number, club = match.group(1), match.group(2)
    else:
        club, number = match.group(1), match.group(2)

    # A greedy club group can swallow the tabs between club name and standing;
    # strip them here so that callers get a clean name
    if club is not None:
        club = club.strip()
    return number, club

In [155]:
# Standing in parentheses first, then whitespace, then the club name
pattern = r'^\((\d+)\.\)\s+(.+)$'

# Parse each raw home-team string and store it as (club, standing)
parsed_home = (extract_club_standings(raw_team, pattern, home=True) for raw_team in home_list)
home = [(club, number) for number, club in parsed_home]
home

[('Bor. Dortmund', '1'),
 ('RB Leipzig', '3'),
 ('Union Berlin', '4'),
 ('1.FC Köln', '10'),
 ("Bor. M'gladbach", '11'),
 ('E. Frankfurt', '8'),
 ('VfL Wolfsburg', '7'),
 ('VfL Bochum', '16'),
 ('VfB Stuttgart', '15')]

In [156]:
# Collect the stripped cell text of every guest team on the given matchday
teams_away = soup.find_all('td', class_='hauptlink no-border-links no-border-rechts hide-for-small spieltagsansicht-vereinsname')
away_list = [away_cell.text.strip() for away_cell in teams_away]
away_list

['1.FSV Mainz 05\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(9.)',
 'FC Schalke 04\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(17.)',
 'Werder Bremen\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(12.)',
 'Bayern München\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(2.)',
 'FC Augsburg\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(14.)',
 'SC Freiburg\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(5.)',
 'Hertha BSC\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(18.)',
 'B. Leverkusen\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(6.)',
 'TSG Hoffenheim\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\xa0(13.)']

In [157]:
# Club name first, then whitespace, then the standing in parentheses
pattern = r'^(.+)\s+\((\d+)\.\)$'

# Parse each raw guest-team string and store it as (club, standing);
# tab characters are removed from the club name
parsed_away = (extract_club_standings(raw_team, pattern, home=False) for raw_team in away_list)
away = [(club.replace('\t', ''), number) for number, club in parsed_away]
away

[('1.FSV Mainz 05', '9'),
 ('FC Schalke 04', '17'),
 ('Werder Bremen', '12'),
 ('Bayern München', '2'),
 ('FC Augsburg', '14'),
 ('SC Freiburg', '5'),
 ('Hertha BSC', '18'),
 ('B. Leverkusen', '6'),
 ('TSG Hoffenheim', '13')]

In [158]:
# Build one row per match by joining the home tuple, the away tuple and the tendency tuple,
# then load the rows into a dataframe
columns = [
    "HOME_TEAM",
    "PLACE_HOME_TEAM",
    "AWAY_TEAM",
    "PLACE_AWAY_TEAM",
    "WIN_PERC_HOME",
    "REMIS_PERC",
    "WIN_PERC_AWAY",
]
matches = [
    (*home_part, *away_part, *tipp_part)
    for home_part, away_part, tipp_part in zip(home, away, tipps)
]
df = pd.DataFrame(matches, columns=columns)
df.head()

Unnamed: 0,HOME_TEAM,PLACE_HOME_TEAM,AWAY_TEAM,PLACE_AWAY_TEAM,WIN_PERC_HOME,REMIS_PERC,WIN_PERC_AWAY
0,Bor. Dortmund,1,1.FSV Mainz 05,9,93.9,4.6,1.5
1,RB Leipzig,3,FC Schalke 04,17,83.5,7.8,8.7
2,Union Berlin,4,Werder Bremen,12,86.8,10.0,3.2
3,1.FC Köln,10,Bayern München,2,5.8,11.5,82.6
4,Bor. M'gladbach,11,FC Augsburg,14,68.9,22.7,8.3
