In [1]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scrape club expenditures from Transfermarkt

In [2]:
def get_club_expenditure(league: str, season: str, timeout: float = 30) -> None:
    """Download one league/season expenditure table and store it as CSV.

    The transfermarkt income/expenditure page for ``league`` and ``season`` is
    fetched, the second HTML table on that page is tidied up, and the result
    is written to ``data/expenditure/{league}/{season}.csv``. Missing
    directories are created.

    Parameters
    ----------
    league : str
        One of ``'premier-league'``, ``'laliga'``, ``'bundesliga'``,
        ``'serie-a'``, ``'primeira-liga'`` or ``'ligue-1'``.
    season : str
        Value inserted for both ``saison_id`` and ``saison_id_bis`` in the
        URL, e.g. ``'2008'``. It is also used as the CSV file name.
    timeout : float, optional
        Seconds to wait on the server before the request is abandoned.

    Raises
    ------
    ValueError
        If ``league`` is not one of the supported leagues.
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    # One URL template per supported league; `{season}` is filled in below.
    # NOTE(review): primeira-liga is fetched from the .com host while every
    # other league uses .us - confirm both hosts report the same figures.
    url_templates = {
        'premier-league': 'https://www.transfermarkt.us/premier-league/einnahmenausgaben/wettbewerb/GB1/plus/1?ids=a&sa=&saison_id={season}&saison_id_bis={season}&nat=&pos=&altersklasse=&w_s=&leihe=&intern=0',
        'laliga': 'https://www.transfermarkt.us/laliga/einnahmenausgaben/wettbewerb/ES1/ids/a/sa//saison_id/{season}/saison_id_bis/{season}/nat/0/pos//w_s//intern/0/plus/1',
        'bundesliga': 'https://www.transfermarkt.us/bundesliga/einnahmenausgaben/wettbewerb/L1/ids/a/sa//saison_id/{season}/saison_id_bis/{season}/nat/0/pos//w_s//intern/0/plus/1',
        'serie-a': 'https://www.transfermarkt.us/serie-a/einnahmenausgaben/wettbewerb/IT1/ids/a/sa//saison_id/{season}/saison_id_bis/{season}/nat/0/pos//w_s//intern/0/plus/1',
        'primeira-liga': 'https://www.transfermarkt.com/liga-portugal/einnahmenausgaben/wettbewerb/PO1/ids/a/sa//saison_id/{season}/saison_id_bis/{season}/nat/0/pos//w_s//intern/0/plus/1',
        'ligue-1': 'https://www.transfermarkt.us/ligue-1/einnahmenausgaben/wettbewerb/FR1/plus/1?ids=a&sa=&saison_id={season}&saison_id_bis={season}&nat=&pos=&altersklasse=&w_s=&leihe=&intern=0',
    }

    # Validate explicitly: an `assert` is removed when Python runs with -O,
    # which would let an unknown league through with an empty URL.
    if league not in url_templates:
        raise ValueError(
            f"Unsupported league {league!r}; expected one of {sorted(url_templates)}"
        )

    url = url_templates[league].format(season=season)
    print(f'Scraping for the season {season} and league {league}')
    print(url)

    # The request is sent with a browser-like User-Agent header.
    headers = {'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error page instead of letting the HTML parser choke on it.
    response.raise_for_status()

    # The second table on the page is used. After removing "#" and "Club" all
    # values are moved one column to the right, then "Club.1" is dropped and
    # "Club.2" becomes "Club".
    # NOTE(review): presumably this realigns headers with their data - confirm
    # against the live page if the site layout changes.
    tables = pd.read_html(StringIO(response.text), header=0)
    expenditure = (
        tables[1]
        .drop(columns=["#", "Club"])
        .shift(1, axis=1)
        .drop(columns=["Club.1"])
        .rename(columns={"Club.2": "Club"})
    )

    # No pause is made after the request; callers looping over many pages
    # must throttle themselves if needed.
    out_dir = f"data/expenditure/{league}"
    os.makedirs(out_dir, exist_ok=True)
    expenditure.to_csv(f"{out_dir}/{season}.csv")

In [4]:
# Fetch and save the expenditure table of every supported league
# for each season from 2008 through 2016.
for i in range(2008, 2017):
    season = str(i)
    for league in (
        'premier-league',
        "laliga",
        "bundesliga",
        "serie-a",
        "primeira-liga",
        "ligue-1",
    ):
        get_club_expenditure(league=league, season=season)

Scraping for the season 2008 and league premier-league
https://www.transfermarkt.us/premier-league/einnahmenausgaben/wettbewerb/GB1/plus/1?ids=a&sa=&saison_id=2008&saison_id_bis=2008&nat=&pos=&altersklasse=&w_s=&leihe=&intern=0
Scraping for the season 2008 and league laliga
https://www.transfermarkt.us/laliga/einnahmenausgaben/wettbewerb/ES1/ids/a/sa//saison_id/2008/saison_id_bis/2008/nat/0/pos//w_s//intern/0/plus/1
Scraping for the season 2008 and league bundesliga
https://www.transfermarkt.us/bundesliga/einnahmenausgaben/wettbewerb/L1/ids/a/sa//saison_id/2008/saison_id_bis/2008/nat/0/pos//w_s//intern/0/plus/1
Scraping for the season 2008 and league serie-a
https://www.transfermarkt.us/serie-a/einnahmenausgaben/wettbewerb/IT1/ids/a/sa//saison_id/2008/saison_id_bis/2008/nat/0/pos//w_s//intern/0/plus/1
Scraping for the season 2008 and league primeira-liga
https://www.transfermarkt.com/liga-portugal/einnahmenausgaben/wettbewerb/PO1/ids/a/sa//saison_id/2008/saison_id_bis/2008/nat/0/pos//w