# Scraping data from Transfermarkt
The following section outlines the steps taken to scrape all necessary data from the Transfermarkt website.

### Import all dependencies
We must first import the necessary Python packages and libraries needed for the web scraping process.

In [1]:
import re
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# Ask pandas to use its HTML representation when displaying objects in the notebook.
pd.set_option('notebook_repr_html', True)

Now let's define the HTTP request parameters (request headers, league IDs, and base URL) needed for scraping Transfermarkt.

**NOTE: The User-Agent string below identifies as a Mac browser, but it is only a request header, so it will work regardless of your computer specifications.**

In [None]:
# Browser-like User-Agent header sent with the Transfermarkt requests below.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# Transfermarkt competition IDs; each one is appended to `league_page` to build a league URL.
league = ['GB1','FR1','L1','IT1','ES1']
# Alternative set of competition IDs, currently disabled.
# NOTE(review): the FBref `link_arr` later in this notebook (MLS, Primeira Liga, Eredivisie,
# Championship, Serie A, Liga MX) and the saved outputs resemble this disabled list rather
# than the active one above — confirm which set of leagues is intended.
#league = ['MLS1', 'PO1', 'NL1', 'GB2', 'BRA1', 'MEX1']
# Base URL of a league page; a competition ID from `league` is concatenated onto it.
league_page = "https://www.transfermarkt.com/jumplist/startseite/wettbewerb/"

The `league` list specifies the unique IDs of the soccer leagues; each ID is appended to the base URL to form the GET requests sent to the Transfermarkt website.

We have chosen to only include Europe's **top 5 leagues** based on [country coefficient](https://www.uefa.com/nationalassociations/uefarankings/country/#/yr/2023).

### Retrieving club and player information

Below you will find two functions to return football clubs and information on individual players within each club.

In [None]:
def get_club_details(tr_tag):
    """Return ``(club_link, club_name)`` taken from the first anchor in a table row.

    Parameters
    ----------
    tr_tag
        A parsed table row exposing ``find_all('a')``; the first anchor found
        supplies the ``href`` (link) and ``title`` (name) attributes.

    Returns
    -------
    tuple
        ``(href, title)`` of the first anchor.

    Raises
    ------
    IndexError
        If the row contains no anchor.
    """
    first_anchor = tr_tag.find_all('a')[0]
    return (first_anchor['href'], first_anchor['title'])

def get_players_club(player, club_name=None):
    """Return ``(player_link, player_name, club_name)`` for one player anchor.

    Parameters
    ----------
    player
        A parsed anchor element; its ``href`` attribute is the player link and
        its text is the player name.
    club_name : str, optional
        Name of the club the player belongs to. When omitted, the value is read
        from a variable called ``club_name`` in the notebook's global scope,
        which is how the original one-argument version worked.

    Returns
    -------
    tuple
        ``(href, text, club_name)``.

    Raises
    ------
    NameError
        If ``club_name`` is omitted and no global of that name exists.
    """
    if club_name is None:
        # Backward-compatible fallback: earlier callers relied on a loop
        # variable named `club_name` living in the notebook namespace.
        try:
            club_name = globals()['club_name']
        except KeyError:
            raise NameError("name 'club_name' is not defined") from None
    return (player['href'], player.get_text(), club_name)

#### Step 1: Retrieve all clubs from specified leagues in 'league' array.

In [None]:
# Collect one (club_link, club_name) tuple per table row of every league page.
clubs_list = []
for league_id in league:
    league_url = league_page + league_id
    page = requests.get(league_url, headers=headers)
    soup = bs(page.content, 'html.parser')
    # The rows of the second <tbody> on the page are the ones parsed as clubs.
    club_rows = soup.find_all('tbody')[1].find_all('tr')
    clubs_list.extend(get_club_details(club_row) for club_row in club_rows)
print('All the clubs were uploaded')

All the clubs were uploaded


#### Step 2: Retrieve all basic player information for each player within the specified clubs.

In [None]:
url_site = "https://www.transfermarkt.com"
player_list = []
# NOTE: the loop variable must stay named `club_name`, because
# get_players_club(player) picks the club name up from notebook scope.
for club_link, club_name in clubs_list:
    page = requests.get(url_site + club_link, headers=headers)
    soup = bs(page.content, 'html.parser')
    squad_body = soup.find_all('tbody')[1]
    name_cells = squad_body.find_all('td', {"class": "hauptlink"})
    # Only every second "hauptlink" cell is used, starting with the first one.
    for cell_index in range(0, len(name_cells), 2):
        player = name_cells[cell_index].find_all('a')[0]
        player_list.append(get_players_club(player))
print('All the players were uploaded')

All the players were uploaded


The function below extracts each player's Transfermarkt ID and URL name slug from the player's profile link, and pairs them with the player's name and club.

In [None]:
from ast import Pass
def get_profil_detail(link=None, name=None, club=None):
    """Return ``(player_id, name, club, name_slug)`` for one player.

    Parameters
    ----------
    link : str, optional
        Relative profile link. It is split on ``"/"``; element 4 becomes the
        player ID and element 1 the name slug.
    name : str, optional
        Player name, passed through unchanged.
    club : str, optional
        Club name, passed through unchanged.

    Any argument left as ``None`` is read from a global variable of the same
    name, which keeps the original zero-argument call style working.

    Returns
    -------
    tuple
        ``(link_parts[4], name, club, link_parts[1])``.

    Raises
    ------
    NameError
        If an omitted value is not defined in the notebook's global scope.
    IndexError
        If ``link`` has fewer than five ``"/"``-separated parts.
    """
    values = {'link': link, 'name': name, 'club': club}
    scope = globals()
    for key, value in values.items():
        if value is None:
            # Backward-compatible fallback to the notebook-level loop variables.
            if key not in scope:
                raise NameError(f"name '{key}' is not defined")
            values[key] = scope[key]

    # Split once instead of once per extracted field.
    link_parts = values['link'].split("/")
    return (link_parts[4], values['name'], values['club'], link_parts[1])

#### Step 3: Retrieve in-depth information for each player within the specified clubs.

In [None]:
# Build one (id, name, club, slug) record per scraped player.
total = str(len(player_list))
player_details = []
for counter, (link, name, club) in enumerate(player_list, start=1):
    # `link`, `name` and `club` are bound at notebook scope on purpose:
    # get_profil_detail() reads them from there when called without arguments.
    player_details.append(get_profil_detail())
    print(f"Player {counter} out of {total} uploaded.")
print("Basic player details were uploaded")

In [None]:
# Spot-check a single record.
# NOTE(review): the index is hard-coded, so this raises IndexError whenever
# fewer than 2857 players were collected in the previous step.
player_details[2856]

('863033', 'Márcio Silva', 'Coritiba Foot Ball Club', 'marcio-silva')

The function below parses a player's injury history (season, injury, start and end dates, days out, and games missed) from the player's Transfermarkt injuries page.

In [None]:
def get_injuries_details(soup):
    """Extract a player's injury records from a parsed injuries page.

    Parameters
    ----------
    soup
        Parsed page exposing ``find_all``. The rows of its first ``<tbody>``
        are read as injury records.

    Returns
    -------
    list of tuple
        One ``(season, injury, start_date, end_date, time_out, games_missed)``
        tuple per row. ``time_out`` is the first whitespace-separated token of
        the fifth cell (``''`` when that cell is blank). ``games_missed`` is the
        integer ``0`` when the sixth cell is ``'-'``, otherwise the cell text.
        An empty list is returned when the page shows the
        ``'No entries available'`` marker or contains no ``<tbody>``.
    """
    empty_markers = soup.find_all('span', {"class": "empty"})
    if empty_markers and empty_markers[0].get_text() == 'No entries available':
        return []

    # An explicit emptiness check replaces the former bare `except:`, which
    # would also have hidden unrelated errors.
    table_bodies = soup.find_all('tbody')
    if not table_bodies:
        return []

    injuries_list = []
    for tr_tag in table_bodies[0].find_all('tr'):
        # Look the cells up once per row instead of once per field.
        cells = tr_tag.find_all('td')
        if len(cells) < 6:
            # Not a complete injury record (e.g. a spacer row); skip it
            # rather than failing the whole player.
            continue

        season, injury, start_date, end_date, time_out, games_missed = (
            cell.get_text() for cell in cells[:6]
        )
        # Keep only the leading token of the duration text; tolerate a blank cell.
        duration_tokens = time_out.split()
        days_out = duration_tokens[0] if duration_tokens else ''
        games_missed = 0 if games_missed == '-' else games_missed
        injuries_list.append(
            (season, injury, start_date, end_date, days_out, games_missed)
        )
    return injuries_list

#### Step 4: Retrieve the injury history for each player within the specified clubs.

In [None]:
# NOTE: this rebinds `player_list` (until now the Step 2 list of 3-tuples) to a
# list of 4-tuples, so Step 3 cannot be re-run after this cell without first
# re-running Step 2.
player_list = []
total_players = len(player_details)
removed = 0

# Request header is the same for every player, so it is built once up front
# rather than on each loop iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

for counter, (Id, name, club, name_link) in enumerate(player_details, start=1):
    # Injuries page URL is built from the player's name slug and numeric ID.
    page1 = requests.get("https://www.transfermarkt.com/{}/verletzungen/spieler/{}".format(name_link,Id),headers=headers)
    soup = bs(page1.content, 'html.parser')
    injury_details = get_injuries_details(soup)

    print(f'Player {counter} out of {total_players} uploaded.')

    player_list.append((Id, name, club, injury_details))

print('Player appearance and injury details were uploaded')
# Persist one row per player; the injury list is stored in a single column.
df_player = pd.DataFrame(player_list)
df_player.to_csv('transfermarkt_data_2022.csv')

In [None]:
# Rebuild the frame from `player_list` (the same construction used at the end
# of the previous cell) and display it as the cell output.
df_player = pd.DataFrame(player_list)
df_player

Unnamed: 0,0,1,2,3
0,126630,Sean Johnson,New York City FC,"[(18/19, Shoulder Injury, Aug 1, 2018, Aug 15,..."
1,644782,Luis Barraza,New York City FC,[]
2,297422,Cody Mizell,New York City FC,[]
3,277318,Thiago Martins,New York City FC,"[(21/22, Bruise, Aug 20, 2021, Aug 29, 2021, 9..."
4,147462,Alexander Callens,New York City FC,"[(18/19, Muscle Injury, Apr 1, 2019, Apr 18, 2..."
...,...,...,...,...
3548,370858,Antonio Figueroa,Querétaro FC,[]
3549,395164,José Angulo,Querétaro FC,"[(21/22, Pubitis, Oct 25, 2021, Nov 18, 2021, ..."
3550,68562,Ariel Nahuelpán,Querétaro FC,"[(20/21, Calf Injury, May 20, 2021, Jun 10, 20..."
3551,403747,Jonathan Dos Santos,Querétaro FC,"[(20/21, Torn muscle bundle, Oct 16, 2020, Nov..."


# Scraping data from FBRef
The following section outlines the steps taken to scrape all necessary data from the FBref website.

#### Step 1: Define all columns that we want to extract from FBReference.

In [None]:
# Column names to pull from each FBref table. Every entry is matched against a
# cell's `data-stat` attribute by get_frame().
# The lists whose name ends in "2" omit the player-identity columns
# (player, nationality, position, squad, age, birth_year).
# In the code visible in this notebook, get_outfield_data() uses `stats` plus
# shooting2, passing2, passing_types2, defense2, possession2 and misc2; the
# remaining lists are not referenced.
# NOTE(review): `stats` names the club column "team" while the other lists use
# "squad" — confirm this matches the data-stat attributes on each page.

# Standard stats (category "stats")
stats = ["player","nationality","position","team","age","birth_year","games","games_starts","minutes","cards_yellow","cards_red"]
stats3 = ["players_used","possession","games","games_starts","minutes","goals","assists","cards_yellow","cards_red"] 
# Goalkeeping (category "keepers")
keepers = ["player","nationality","position","squad","age","birth_year"]
# Shooting (category "shooting")
shooting = ["player","nationality","position","squad","age","birth_year","minutes_90s","shots_total","shots_total_per90"]
shooting2 = ["minutes_90s","goals","shots","shots_free_kicks"]
shooting3 = ["goals","pens_made","pens_att","shots_total","shots_free_kicks"]
# Passing (category "passing")
passing = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes_completed","passes","passes_total_distance","passes_short","passes_medium","passes_long"]
passing2 = ["passes_completed","passes","passes_total_distance","passes_progressive_distance","passes_short","passes_medium","passes_long"]
# Pass types (category "passing_types")
passing_types = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_switches","crosses","corner_kicks"]
passing_types2 = ["passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_switches","crosses","corner_kicks"]
# Defensive actions (category "defense")
defense = ["player","nationality","position","squad","age","birth_year","minutes_90s","tackles","tackles_won","dribble_tackles","dribbled_past","blocks","interceptions","clearances","errors"]
defense2 = ["tackles","tackles_won","dribble_tackles","dribbled_past","blocks","interceptions","clearances","errors"]
# Possession (category "possession")
possession = ["player","nationality","position","squad","age","birth_year","minutes_90s","touches","touches_live_ball","dribbles","miscontrols","dispossessed","passes_received"]
possession2 = ["touches","touches_live_ball","dribbles","miscontrols","dispossessed","passes_received"]
# Playing time (category "playingtime")
playingtime = ["player","nationality","position","squad","age","birth_year","minutes_90s","games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs"]
playingtime2 = ["games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs"]
# Miscellaneous (category "misc")
misc = ["player","nationality","position","squad","age","birth_year","minutes_90s","cards_yellow","cards_red","fouls","fouled","ball_recoveries","aerials_won","aerials_lost"]
misc2 = ["cards_yellow","cards_red","fouls","fouled","ball_recoveries","aerials_won","aerials_lost"]

#### Step 2: Define all functions needed for scraping.

In [None]:
import time
def get_tables(url, timeout=None):
    """Download ``url`` and return the second ``<tbody>`` element on the page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. The default ``None`` keeps the previous
        behaviour of waiting indefinitely; pass a number of seconds to bound
        the wait.

    Returns
    -------
    The second ``<tbody>`` found in the parsed document, which callers treat
    as the player table.

    Raises
    ------
    IndexError
        If the page contains fewer than two ``<tbody>`` elements.
    """
    res = requests.get(url, timeout=timeout)
    # Strip HTML comment delimiters so markup wrapped in comments is parsed as
    # regular elements (works around comments breaking the parsing).
    markup = re.sub("<!--|-->", "", res.text)
    soup = bs(markup, 'lxml')
    # `find_all` is the current Beautiful Soup name for the legacy `findAll`.
    all_tables = soup.find_all("tbody")
    if len(all_tables) < 2:
        raise IndexError(
            f"expected at least 2 <tbody> elements at {url}, found {len(all_tables)}"
        )
    return all_tables[1]

def get_frame(features, player_table):
    """Build a DataFrame with one row per player row of ``player_table``.

    Parameters
    ----------
    features : iterable of str
        ``data-stat`` names to extract; each becomes a column, in this order.
    player_table
        Parsed table body exposing ``find_all('tr')``. Only rows that contain
        a ``<th scope="row">`` are read; other rows are skipped.

    Returns
    -------
    pandas.DataFrame
        Identity columns (player, nationality, position, team, squad, age,
        birth_year) are kept as stripped text. Every other column is
        converted to ``float`` after removing thousands separators. Blank or
        missing cells become ``'0'`` for text columns and ``0.0`` for numeric
        ones, so all columns keep the same length.

    Raises
    ------
    ValueError
        If a non-identity cell holds text that cannot be parsed as a number.
    """
    # Columns that must stay as text; "squad" is included because the other
    # feature lists in this notebook use it where `stats` uses "team".
    text_columns = {'player', 'nationality', 'position', 'team', 'squad', 'age', 'birth_year'}

    pre_df_player = {}
    for row in player_table.find_all('tr'):
        if row.find('th', {"scope": "row"}) is None:
            continue

        for f in features:
            cell = row.find("td", {"data-stat": f})
            # A missing cell is treated like a blank one instead of raising.
            text = cell.text.strip() if cell is not None else ''
            if text == '':
                text = '0'
            if f not in text_columns:
                text = float(text.replace(',', ''))
            pre_df_player.setdefault(f, []).append(text)

    # Height/weight scraping from individual player pages was sketched here
    # previously as commented-out code; it was never enabled.
    return pd.DataFrame.from_dict(pre_df_player)

def frame_for_category(category,top,end,features):
    """Fetch one stats category page and return the requested columns.

    The page address is the concatenation ``top + category + end``. The table
    returned by ``get_tables`` for that address is handed to ``get_frame``
    together with ``features``, and the resulting DataFrame is returned.
    """
    return get_frame(features, get_tables(top + category + end))

def get_outfield_data(top, end):
    """Scrape every outfield stats category for one league into a single frame.

    ``top`` and ``end`` are the URL prefix and suffix placed around each
    category name. The per-category frames are joined side by side, and any
    column name that appears more than once keeps only its first occurrence.
    """
    # (category slug, columns to extract), in the order the frames are joined.
    category_features = (
        ('stats', stats),
        ('shooting', shooting2),
        ('passing', passing2),
        ('passing_types', passing_types2),
        ('defense', defense2),
        ('possession', possession2),
        ('misc', misc2),
    )
    frames = [
        frame_for_category(category, top, end, wanted)
        for category, wanted in category_features
    ]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]


#### Step 3: Run scraper with corresponding URL for up-to-date player data.

In [None]:
# (URL prefix, URL suffix) for each league; the stats category name is
# inserted between the two parts when a page is requested.
link_arr = [['https://fbref.com/en/comps/22/','/Major-League-Soccer-Stats'],
            ['https://fbref.com/en/comps/32/','/Primeira-Liga-Stats'],
            ['https://fbref.com/en/comps/23/','/Eredivisie-Stats'],
            ['https://fbref.com/en/comps/10/','/Championship-Stats'],
            ['https://fbref.com/en/comps/24/','/Serie-A-Stats'],
            ['https://fbref.com/en/comps/31/','/Liga-MX-Stats']]

# Scrape every league and stack the results. Previously each iteration
# overwrote `df_outfield`, so only the last league in `link_arr` was saved.
league_frames = [get_outfield_data(top, end) for top, end in link_arr]
df_outfield = pd.concat(league_frames, ignore_index=True)

df_outfield.to_csv('fbref_data_2022.csv', sep=';')