In [41]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
from urllib.parse import urljoin
import pandas as pd

# Matchday indices 1..34 (the range end is exclusive): one entry per matchday of a season
matchday_ind_ls = range(1, 35)

# Season start years 1994..2022, i.e. the seasons 1994/95 through 2022/23
season_ind_ls = range(1994, 2023)

# Row indices 1..18, one per club; used as 1-based row positions in the XPath queries
# that read the clubs overview table
clubs_ind_ls = range(1,19)


# Base URL of the Transfermarkt website; page paths are joined onto it with urljoin
BASE_URL = 'https://www.transfermarkt.de/'

# Request headers with a browser-like User-Agent string; passed to requests.get
# by the URL helper functions
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}


def CLUBS_OVERVIEW_URL(season, timeout=30):
    """
    Fetches the Bundesliga overview page that lists all clubs of one season.

    The requested URL is printed after the request returns, which gives a simple
    progress log when the function is called in a loop over many seasons.

    Args:
        season (int): Starting year of the season in the form yyyy (e.g. 1994 for 94/95)
        timeout (float | tuple): Seconds to wait for the server, forwarded to requests.get.
            Without a timeout a stalled connection would block the notebook indefinitely.

    Returns:
        response object: Provides methods and attributes to access the data returned by the
        HTTP request. The status code is NOT checked here; callers must handle error pages.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout` seconds
    """
    url = urljoin(BASE_URL, f'bundesliga/startseite/wettbewerb/L1/plus/?saison_id={season}')
    response = requests.get(url, headers=headers, timeout=timeout)
    print(url)
    return response

# Overview of all match results for each season and matchday
def MATCH_RESULTS_URL(season, matchday, timeout=30):
    """
    Fetches the Bundesliga page with all match results of one matchday in one season.

    The requested URL is printed after the request returns, which gives a simple
    progress log when the function is called in a loop.

    Args:
        season (int): Starting year of the season in the form yyyy (e.g. 1994 for 94/95)
        matchday (int): Matchday index from 1 to 34 (a season has 34 matchdays)
        timeout (float | tuple): Seconds to wait for the server, forwarded to requests.get.
            Without a timeout a stalled connection would block the notebook indefinitely.

    Returns:
        response object: Provides methods and attributes to access the data returned by the
        HTTP request. The status code is NOT checked here; callers must handle error pages.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within `timeout` seconds
    """
    url = urljoin(BASE_URL, f'bundesliga/spieltag/wettbewerb/L1/plus/?saison_id={season}&spieltag={matchday}')
    response = requests.get(url, headers=headers, timeout=timeout)
    print(url)
    return response



def dict_to_df(col_name_ls, value_ls):
    """
    Builds a DataFrame from two parallel lists: column names and column values.

    The entry at position i of `value_ls` becomes the data of the column named by
    the entry at position i of `col_name_ls`.

    Args:
        col_name_ls (list[str]): list of column names
        value_ls (list[list]): one list of values per column, in the same order as col_name_ls

    Returns:
        df (DataFrame): DataFrame with one column per entry of col_name_ls
    """
    # Pair every column name with the value list found at the same position
    columns = {name: value_ls[position] for position, name in enumerate(col_name_ls)}
    return pd.DataFrame(columns)

# Clubs overview across all seasons

In [42]:
# Scrape the clubs overview table of every season and combine the results into one DataFrame
def _clubs_table_column(dom, column):
    """Returns the stripped cell text of td[`column`] (1-based) for every club row of table 'yw1'."""
    return [
        dom.xpath(f"//*[@id='yw1']/table/tbody/tr[{row}]/td[{column}]")[0].text.strip()
        for row in clubs_ind_ls
    ]


# One DataFrame per season; they are concatenated once after the loop, because
# calling pd.concat inside the loop re-copies all previously collected rows each time
season_frames = []
for season in season_ind_ls:
    response = CLUBS_OVERVIEW_URL(season)
    soup = BeautifulSoup(response.content, 'lxml')

    # The clubs table lives inside the div with id 'yw1'; fail with a clear message
    # instead of an AttributeError on None when the page has no such container
    clubs_table = soup.find('div', id='yw1')
    if clubs_table is None:
        raise ValueError(
            f"Clubs table (div#yw1) not found for season {season} "
            f"(HTTP status {response.status_code})"
        )

    # Clubnames
    clubs_ls = [
        club.text.strip()
        for club in clubs_table.find_all('td', class_='hauptlink no-border-links')
    ]

    # The number of players in the pool (only links with non-empty text carry the count)
    players_count_ls = [
        item.text.strip()
        for cell in clubs_table.find_all('td', class_='zentriert')
        for item in cell.find_all('a')
        if item.text != ''
    ]

    # Parse the page once per season for the XPath lookups below
    dom = etree.HTML(str(soup))

    # Average age of all players within a club
    players_avg_age_ls = _clubs_table_column(dom, 4)

    # Number of legionaries within a club
    legionaries_ls = _clubs_table_column(dom, 5)

    # Average market value of each club
    avg_market_value_ls = _clubs_table_column(dom, 6)

    # Total market value of each club
    items_table = soup.find('table', class_='items')
    if items_table is None:
        raise ValueError(f"Table with class 'items' not found for season {season}")
    total_market_values_ls = [
        link.text.strip()
        for cell in items_table.find_all('td', class_="rechts")
        for link in cell.find_all('a')
    ]

    # Write all Information to a Dataframe
    col_name_ls = ['CLUB_NAME', 'PLAYERS_COUNT', 'PLAYERS_AVG_AGE', 'LEGIONARIES_COUNT', 'AVG_MARKET_VALUE', 'TOTAL_MARKET_VALUE']
    value_ls = [clubs_ls, players_count_ls, players_avg_age_ls, legionaries_ls, avg_market_value_ls, total_market_values_ls]
    df = dict_to_df(col_name_ls=col_name_ls, value_ls=value_ls)

    # add season information as column to dataframe
    df['season'] = f"{season}/{season+1}"

    season_frames.append(df)

# Concat all dataframes of each season into one dataframe; each season keeps its own
# 0-based row index, so index values repeat across seasons
df_clubs_overview = pd.concat(season_frames) if season_frames else pd.DataFrame()

https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1994
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1995
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1996
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1997
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1998
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=1999
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2000
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2001
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2002
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2003
https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2004
https://www.transfermarkt.de/bundesliga/startseite/wet

In [43]:
# Show the combined clubs overview of all scraped seasons
df_clubs_overview

Unnamed: 0,CLUB_NAME,PLAYERS_COUNT,PLAYERS_AVG_AGE,LEGIONARIES_COUNT,AVG_MARKET_VALUE,TOTAL_MARKET_VALUE,season
0,FC Schalke 04,28,254,7,-,-,1994/1995
1,SV Werder Bremen,26,286,5,-,-,1994/1995
2,1.FC Kaiserslautern,26,273,5,-,-,1994/1995
3,Hamburger SV,29,264,9,-,-,1994/1995
4,Bayer 05 Uerdingen,24,251,6,-,-,1994/1995
...,...,...,...,...,...,...,...
13,1.FC Köln,34,248,11,"3,00 Mio. €","101,93 Mio. €",2022/2023
14,Hertha BSC,29,261,16,"3,18 Mio. €","92,35 Mio. €",2022/2023
15,SV Werder Bremen,26,254,9,"3,10 Mio. €","80,55 Mio. €",2022/2023
16,FC Schalke 04,32,273,18,"1,89 Mio. €","60,40 Mio. €",2022/2023
