# Scraping data from Transfermarkt
The following section outlines the steps taken to scrape all necessary data from the Transfermarkt website.

### Import all dependencies
We must first import the necessary Python packages and libraries needed for the web scraping process.

In [None]:
import re
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

# Render DataFrames as HTML tables in the notebook. The fully qualified option
# name is used because abbreviated names can stop resolving to a single option
# when pandas adds similarly named options.
pd.set_option('display.notebook_repr_html', True)

Now let's import the necessary HTTP request parameters for scraping from Transfermarkt. 

**NOTE: This will work regardless of your computer specifications.**

In [None]:
# Browser-like User-Agent sent with every Transfermarkt request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
# Alternative league selection, currently disabled:
# league = ['GB1', 'FR1', 'L1', 'IT1', 'ES1']
# League identifiers appended to `league_page` to build each league URL.
league = [
    'MLS1',
    'PO1',
    'NL1',
    'GB2',
    'BRA1',
    'MEX1',
]
# URL prefix; one league identifier from `league` is appended per request.
league_page = "https://www.transfermarkt.com/jumplist/startseite/wettbewerb/"

The `league` list specifies the unique IDs of the soccer leagues that are used for GET requests on the Transfermarkt website. 

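The commented-out list contains Europe's **top 5 leagues** based on [country coefficient](https://www.uefa.com/nationalassociations/uefarankings/country/#/yr/2023). The active list instead covers six further leagues: Major League Soccer, the Primeira Liga, the Eredivisie, the English Championship, the Brazilian Série A and Liga MX.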

### Retrieving club and player information

Below you will find two functions to return football clubs and information on individual players within each club.

In [None]:
def get_club_details(tr_tag):
    """Return ``(club_link, club_name)`` read from the first anchor of a table row.

    Args:
        tr_tag: Row element exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A 2-tuple with the anchor's ``href`` and ``title`` attribute values.

    Raises:
        IndexError: If the row contains no anchor.
        KeyError: If the anchor lacks an ``href`` or ``title`` attribute.
    """
    first_anchor = tr_tag.find_all('a')[0]
    return first_anchor['href'], first_anchor['title']

def get_players_club(player, club_name=None):
    """Return ``(player_link, player_name, club_name)`` for one player anchor.

    Args:
        player: Anchor-like object supporting ``player['href']`` and
            ``player.get_text()``.
        club_name: Club the player belongs to. When omitted, the module-level
            variable ``club_name`` is used, so existing one-argument calls
            keep their behaviour.

    Returns:
        A 3-tuple of the anchor's ``href``, its text and the club name.

    Raises:
        NameError: If ``club_name`` is omitted and no module-level
            ``club_name`` is defined.
    """
    if club_name is None:
        # Backward-compatible fallback: the parameter shadows the global of the
        # same name, so the global has to be looked up explicitly.
        try:
            club_name = globals()['club_name']
        except KeyError:
            raise NameError("name 'club_name' is not defined") from None
    return player['href'], player.get_text(), club_name


#### Step 1: Retrieve all clubs from specified leagues in 'league' array.

In [None]:
# Collect one (club_link, club_name) tuple per club row across all leagues.
clubs_list = []
for league_id in league:
    # Download and parse the landing page of this league.
    page = requests.get(league_page + league_id, headers=headers)
    soup = bs(page.content, 'html.parser')
    # Club rows are read from the second <tbody> element on the page.
    tr_container = soup.find_all('tbody')[1].find_all('tr')
    clubs_list.extend(get_club_details(tr_tag) for tr_tag in tr_container)
print('All the clubs were uploaded')

All the clubs were uploaded


#### Step 2: Retrieve all basic player information for each player within the specified clubs.

In [None]:
# Base URL that the relative club links stored in clubs_list are appended to.
url_site = "https://www.transfermarkt.com"
# Collects one tuple per player, as returned by get_players_club().
player_list = []
# NOTE: get_players_club(player) is called with the player only, so it picks up
# `club_name` from module scope — the loop variable must keep this exact name.
for club_link,club_name in clubs_list:
    page = requests.get(url_site + club_link,headers = headers)
    soup = bs(page.content, 'html.parser')
    # Player cells are read from the second <tbody> element of the club page.
    tbody_container = soup.find_all('tbody')[1]
    players_td = tbody_container.find_all('td', {"class":"hauptlink"})
    # Only every other "hauptlink" cell is used — presumably the name cells
    # alternate with another cell of the same class; TODO confirm against the page.
    for p in players_td[::2]:
        # The first anchor in the cell supplies the player's link and name.
        player = p.find_all('a')[0]
        player_list.append(get_players_club(player))
print('All the players were uploaded')


All the players were uploaded


The function below extracts each player's Transfermarkt ID and URL slug from their profile link.

In [None]:
from ast import Pass
def get_profil_detail(link=None, name=None, club=None):
    """Return ``(player_id, name, club, slug)`` derived from a player link.

    The link is split on ``/``; the fifth segment is used as the player id and
    the second segment as the URL slug (a link shaped like
    ``/<slug>/<...>/<...>/<id>``).

    Args:
        link: Relative player link. Defaults to the module-level ``link``.
        name: Player name. Defaults to the module-level ``name``.
        club: Club name. Defaults to the module-level ``club``.

    Returns:
        A 4-tuple ``(player_id, name, club, slug)``.

    Raises:
        NameError: If an argument is omitted and the matching module-level
            variable does not exist.
        IndexError: If the link has fewer than five ``/``-separated segments.
    """
    # Backward-compatible fallback: zero-argument calls keep reading the
    # loop variables that the calling cell leaves in module scope.
    scope = globals()
    try:
        link = scope['link'] if link is None else link
        name = scope['name'] if name is None else name
        club = scope['club'] if club is None else club
    except KeyError as missing:
        raise NameError(f"name {missing.args[0]!r} is not defined") from None
    parts = link.split("/")
    return parts[4], name, club, parts[1]

#### Step 3: Retrieve in-depth information for each player within the specified clubs.

In [None]:
# 1-based progress counter used only for the status messages below.
counter = 1
total = str(len(player_list))
# Collects one tuple per player, as returned by get_profil_detail().
player_details = []
# NOTE: get_profil_detail() is called without arguments, so it picks up `link`,
# `name` and `club` from module scope — the loop variables must keep these names.
for link,name,club in player_list:
      player_details.append(get_profil_detail())
      # NOTE(review): `count` is assigned but never read in this cell.
      count = str(counter)
      print(f"Player {counter} out of {total} uploaded.")
      counter += 1
print("Basic player details were uploaded")

Player 1 out of 3553 uploaded.
Player 2 out of 3553 uploaded.
Player 3 out of 3553 uploaded.
Player 4 out of 3553 uploaded.
Player 5 out of 3553 uploaded.
Player 6 out of 3553 uploaded.
Player 7 out of 3553 uploaded.
Player 8 out of 3553 uploaded.
Player 9 out of 3553 uploaded.
Player 10 out of 3553 uploaded.
Player 11 out of 3553 uploaded.
Player 12 out of 3553 uploaded.
Player 13 out of 3553 uploaded.
Player 14 out of 3553 uploaded.
Player 15 out of 3553 uploaded.
Player 16 out of 3553 uploaded.
Player 17 out of 3553 uploaded.
Player 18 out of 3553 uploaded.
Player 19 out of 3553 uploaded.
Player 20 out of 3553 uploaded.
Player 21 out of 3553 uploaded.
Player 22 out of 3553 uploaded.
Player 23 out of 3553 uploaded.
Player 24 out of 3553 uploaded.
Player 25 out of 3553 uploaded.
Player 26 out of 3553 uploaded.
Player 27 out of 3553 uploaded.
Player 28 out of 3553 uploaded.
Player 29 out of 3553 uploaded.
Player 30 out of 3553 uploaded.
Player 31 out of 3553 uploaded.
Player 32 out of 

In [None]:
# Inspect a single entry of player_details to check the tuple layout
# produced by get_profil_detail().
player_details[2856]

('863033', 'Márcio Silva', 'Coritiba Foot Ball Club', 'marcio-silva')

The function below retrieves each player's injury history.

In [None]:
def get_injuries_details(soup):
    """Parse an injury-history page into a list of injury tuples.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A list of ``(season, injury, start_date, end_date, time_out,
        games_missed)`` tuples, one per row of the first ``<tbody>`` that has
        at least six cells. ``time_out`` is the first whitespace-separated
        token of the fifth cell. ``games_missed`` is the integer ``0`` when the
        cell text is ``'-'`` and the raw cell text otherwise. An empty list is
        returned when the page reports ``'No entries available'`` or contains
        no ``<tbody>``.

    Raises:
        IndexError: If the fifth cell of a row contains only whitespace.
    """
    empty_markers = soup.find_all('span', {"class": "empty"})
    if empty_markers and empty_markers[0].get_text() == 'No entries available':
        return []

    tables = soup.find_all('tbody')
    if not tables:
        return []

    injuries_list = []
    for tr_tag in tables[0].find_all('tr'):
        # Look the cells up once per row instead of once per field.
        cells = tr_tag.find_all('td')
        if len(cells) < 6:
            # Rows without the six expected cells cannot be an injury record.
            continue
        season, injury, start_date, end_date, time_out, games_missed = (
            cell.get_text() for cell in cells[:6]
        )
        if games_missed == '-':
            games_missed = 0
        injuries_list.append(
            (season, injury, start_date, end_date, time_out.split()[0], games_missed)
        )
    return injuries_list

#### Step 4: Retrieve past appearance and injury history for each player within the specified clubs.

In [None]:
# NOTE: this rebinds `player_list`; from here on it holds
# (Id, name, club, injury_details) tuples rather than the link tuples of step 2.
player_list = []
total_players = len(player_details)

# Built once: neither the request headers nor the URL template change per player.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
injury_url = "https://www.transfermarkt.com/{}/verletzungen/spieler/{}"

for counter, (Id, name, club, name_link) in enumerate(player_details, start=1):
    # Download and parse the injury-history page of this player.
    page1 = requests.get(injury_url.format(name_link, Id), headers=headers)
    soup = bs(page1.content, 'html.parser')
    injury_details = get_injuries_details(soup)

    print(f'Player {counter} out of {total_players} uploaded.')

    player_list.append((Id, name, club, injury_details))

print('Player appearance and injury details were uploaded')
# Persist the result; the default index column is kept in the CSV.
df_player = pd.DataFrame(player_list)
df_player.to_csv('transfermarkt_data.csv')

Player 1 out of 3553 uploaded.
Player 2 out of 3553 uploaded.
Player 3 out of 3553 uploaded.
Player 4 out of 3553 uploaded.
Player 5 out of 3553 uploaded.
Player 6 out of 3553 uploaded.
Player 7 out of 3553 uploaded.
Player 8 out of 3553 uploaded.
Player 9 out of 3553 uploaded.
Player 10 out of 3553 uploaded.
Player 11 out of 3553 uploaded.
Player 12 out of 3553 uploaded.
Player 13 out of 3553 uploaded.
Player 14 out of 3553 uploaded.
Player 15 out of 3553 uploaded.
Player 16 out of 3553 uploaded.
Player 17 out of 3553 uploaded.
Player 18 out of 3553 uploaded.
Player 19 out of 3553 uploaded.
Player 20 out of 3553 uploaded.
Player 21 out of 3553 uploaded.
Player 22 out of 3553 uploaded.
Player 23 out of 3553 uploaded.
Player 24 out of 3553 uploaded.
Player 25 out of 3553 uploaded.
Player 26 out of 3553 uploaded.
Player 27 out of 3553 uploaded.
Player 28 out of 3553 uploaded.
Player 29 out of 3553 uploaded.
Player 30 out of 3553 uploaded.
Player 31 out of 3553 uploaded.
Player 32 out of 

In [None]:
# Rebuild the DataFrame from player_list and display it as the cell result.
df_player = pd.DataFrame(player_list)
df_player

Unnamed: 0,0,1,2,3
0,126630,Sean Johnson,New York City FC,"[(18/19, Shoulder Injury, Aug 1, 2018, Aug 15,..."
1,644782,Luis Barraza,New York City FC,[]
2,297422,Cody Mizell,New York City FC,[]
3,277318,Thiago Martins,New York City FC,"[(21/22, Bruise, Aug 20, 2021, Aug 29, 2021, 9..."
4,147462,Alexander Callens,New York City FC,"[(18/19, Muscle Injury, Apr 1, 2019, Apr 18, 2..."
...,...,...,...,...
3548,370858,Antonio Figueroa,Querétaro FC,[]
3549,395164,José Angulo,Querétaro FC,"[(21/22, Pubitis, Oct 25, 2021, Nov 18, 2021, ..."
3550,68562,Ariel Nahuelpán,Querétaro FC,"[(20/21, Calf Injury, May 20, 2021, Jun 10, 20..."
3551,403747,Jonathan Dos Santos,Querétaro FC,"[(20/21, Torn muscle bundle, Oct 16, 2020, Nov..."


In [None]:
from google.colab import files

# Write the frame to a local CSV file (a different file name than the
# 'transfermarkt_data.csv' written by the scraping cell) ...
df_player.to_csv('transfermarkt.csv')
# ... and hand that file to the google.colab `files` helper for download.
files.download('transfermarkt.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Scraping data from FBRef
The following section outlines the steps taken to scrape all necessary data from the FBReference website.

### Import all dependencies

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import sys, getopt
import csv

#### Step 1: Define all columns that we want to extract from FBReference.

In [None]:
# Each list below names the "data-stat" cell attributes that get_frame() looks up.
# get_outfield_data() passes only `stats` and the lists ending in "2" (except
# `playingtime2`); the remaining lists are not referenced by the code shown here.

# Standard stats (category "stats"); note this list uses "team" where the
# other identity lists use "squad".
stats = ["player","nationality","position","team","age","birth_year","games","games_starts","minutes","cards_yellow","cards_red"]
stats3 = ["players_used","possession","games","games_starts","minutes","goals","assists","cards_yellow","cards_red"] 
# Goalkeeping (category "keepers")
keepers = ["player","nationality","position","squad","age","birth_year"]
# Shooting (category "shooting")
shooting = ["player","nationality","position","squad","age","birth_year","minutes_90s","shots_total","shots_total_per90"]
shooting2 = ["minutes_90s","goals","shots","shots_free_kicks"]
shooting3 = ["goals","pens_made","pens_att","shots_total","shots_free_kicks"]
# Passing (category "passing")
passing = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes_completed","passes","passes_total_distance","passes_short","passes_medium","passes_long"]
passing2 = ["passes_completed","passes","passes_total_distance","passes_progressive_distance","passes_short","passes_medium","passes_long"]
# Pass types (category "passing_types")
passing_types = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_switches","crosses","corner_kicks"]
passing_types2 = ["passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_switches","crosses","corner_kicks"]
# Defensive actions (category "defense")
defense = ["player","nationality","position","squad","age","birth_year","minutes_90s","tackles","tackles_won","dribble_tackles","dribbled_past","blocks","interceptions","clearances","errors"]
defense2 = ["tackles","tackles_won","dribble_tackles","dribbled_past","blocks","interceptions","clearances","errors"]
# Possession (category "possession")
possession = ["player","nationality","position","squad","age","birth_year","minutes_90s","touches","touches_live_ball","dribbles","miscontrols","dispossessed","passes_received"]
possession2 = ["touches","touches_live_ball","dribbles","miscontrols","dispossessed","passes_received"]
# Playing time (category "playingtime")
playingtime = ["player","nationality","position","squad","age","birth_year","minutes_90s","games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs"]
playingtime2 = ["games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs"]
# Miscellaneous (category "misc")
misc = ["player","nationality","position","squad","age","birth_year","minutes_90s","cards_yellow","cards_red","fouls","fouled","ball_recoveries","aerials_won","aerials_lost"]
misc2 = ["cards_yellow","cards_red","fouls","fouled","ball_recoveries","aerials_won","aerials_lost"]

#### Step 2: Define all functions needed for scraping.

In [None]:
import time
def get_tables(url):
    """Download a stats page and return its second ``<tbody>`` element.

    HTML comment delimiters are removed before parsing so that markup wrapped
    in comments is seen by the parser as well.

    Args:
        url: Address of the page to download.

    Returns:
        The second ``<tbody>`` element found in the page.

    Raises:
        IndexError: If the response holds fewer than two ``<tbody>`` elements,
            e.g. when the site answers with an "Access denied" page instead of
            the statistics. The message includes the URL and HTTP status.
    """
    res = requests.get(url)
    # Drop "<!--" and "-->" so commented-out tables become regular markup.
    soup = BeautifulSoup(re.sub("<!--|-->", "", res.text), 'lxml')
    all_tables = soup.find_all("tbody")
    if len(all_tables) < 2:
        # Same exception type as plain indexing would raise, but with context.
        raise IndexError(
            f"expected at least 2 <tbody> elements at {url}, "
            f"found {len(all_tables)} (HTTP status {res.status_code})"
        )
    return all_tables[1]

def get_frame(features, player_table):
    """Build a DataFrame with one row per player row of a stats table body.

    Args:
        features: ``data-stat`` attribute names to extract; they become the
            DataFrame columns in this order.
        player_table: Table body exposing a BeautifulSoup-style
            ``find_all('tr')``.

    Returns:
        A DataFrame with one column per feature. The identity columns
        (player, nationality, position, team, squad, age, birth_year) hold the
        stripped cell text; every other column is parsed as ``float`` after
        removing commas. Blank cells are stored as ``'0'`` (identity columns)
        or ``0.0`` (numeric columns). Rows without a ``<th scope="row">`` cell
        are skipped.

    Raises:
        AttributeError: If a player row has no ``<td>`` for a requested feature.
        ValueError: If a numeric cell cannot be parsed as ``float``.
    """
    # 'squad' is included next to 'team' because several of the notebook's
    # feature lists name the club column "squad"; it must stay text, not float.
    text_features = {
        'player', 'nationality', 'position', 'team', 'squad', 'age', 'birth_year',
    }
    columns = {}
    for row in player_table.find_all('tr'):
        # Only rows carrying a row-header cell are processed.
        if row.find('th', {"scope": "row"}) is None:
            continue
        for f in features:
            text = row.find("td", {"data-stat": f}).text.strip() or '0'
            value = text if f in text_features else float(text.replace(',', ''))
            columns.setdefault(f, []).append(value)
    # NOTE: a disabled prototype that fetched each player's height and weight
    # from their profile page used to live here; no such lookup is performed.
    return pd.DataFrame.from_dict(columns)

def frame_for_category(category,top,end,features):
    """Scrape one stats category page and return it as a DataFrame.

    The page URL is the concatenation ``top + category + end``; the table body
    returned by ``get_tables`` is converted with ``get_frame`` using
    ``features`` as the columns to extract.
    """
    return get_frame(features, get_tables(top + category + end))

def get_outfield_data(top, end):
    """Scrape all outfield stat categories of one league into a single frame.

    Each category page is fetched via ``frame_for_category``; the resulting
    frames are joined side by side and, where a column name occurs more than
    once, only its first occurrence is kept.
    """
    # (category, feature list) pairs, in the order their columns are joined.
    category_features = (
        ('stats', stats),
        ('shooting', shooting2),
        ('passing', passing2),
        ('passing_types', passing_types2),
        ('defense', defense2),
        ('possession', possession2),
        ('misc', misc2),
    )
    frames = [
        frame_for_category(category, top, end, wanted)
        for category, wanted in category_features
    ]
    combined = pd.concat(frames, axis=1)
    first_occurrence = ~combined.columns.duplicated()
    return combined.loc[:, first_occurrence]


#### Step 3: Run scraper with corresponding URL for up-to-date player data.

In [None]:
# One [url_prefix, url_suffix] pair per league; the stats category name is
# inserted between the two parts when the page URL is built.
link_arr = [['https://fbref.com/en/comps/22/','/Major-League-Soccer-Stats'],
            ['https://fbref.com/en/comps/32/','/Primeira-Liga-Stats'],
            ['https://fbref.com/en/comps/23/','/Eredivisie-Stats'],
            ['https://fbref.com/en/comps/10/','/Championship-Stats'],
            ['https://fbref.com/en/comps/24/','/Serie-A-Stats'],
            ['https://fbref.com/en/comps/31/','/Liga-MX-Stats']]

# Keep the frame of every league. Assigning df_outfield inside the loop would
# overwrite it on each pass and leave only the last league in the export.
# NOTE(review): a previous run of this cell was answered with a Cloudflare
# "Access denied" page, so requests may need to be spaced out — confirm.
league_frames = [get_outfield_data(top, end) for top, end in link_arr]
df_outfield = pd.concat(league_frames, ignore_index=True)

df_outfield.to_csv('fbrefdata2022.csv', sep=';')
print(df_outfield.head())

<!DOCTYPE html>
<html><body><p>[if lt IE 7]&gt;  &lt;![endif]
[if IE 7]&gt;     &lt;![endif]
[if IE 8]&gt;     &lt;![endif]
[if gt IE 8]&gt;&gt;  &lt;![endif]
</p>
<title>Access denied | fbref.com used Cloudflare to restrict access</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/cdn-cgi/styles/main.css" id="cf_styles-css" rel="stylesheet"/>
<script>
(function(){if(document.addEventListener&&window.XMLHttpRequest&&JSON&&JSON.stringify){var e=function(a){var c=document.getElementById("error-feedback-survey"),d=document.getElementById("error-feedback-success"),b=new XMLHttpRequest;a={event:"feedback clicked",properties:{errorCode:1015,helpful:a,version:1}};b.open("POST","https://sparrow.cloudflare.com/api/v1/event");b.setRequestHeader("Content-Type","ap

IndexError: ignored

# Creating final dataframe

### Importing data
Rather than having to constantly re-run the scraper (which takes about an hour), we have created a download link that allows us to directly import the final Transfermarkt and FBref dataframe.


In [None]:
import pandas as pd
# CSV export links of the previously scraped Transfermarkt and FBref tables.
injury_csv_url = 'https://docs.google.com/spreadsheets/d/1TwWSYiSJviNgr16MH7TLhf53hKHkPhKD1gn261zhJOU/export?format=csv&gid=2070040166'
outfield_csv_url = 'https://docs.google.com/spreadsheets/d/17TDSYYY4TEAEHPxTPP87Pn4WKlI2UGChtnRmozz887s/export?format=csv&gid=1112887342'

df_injury = pd.read_csv(injury_csv_url)
# The FBref export is read with ';' as the field separator.
df_outfield = pd.read_csv(outfield_csv_url, sep=';')
df_injury.head()

Unnamed: 0,name,club,injuries
0,Ederson,Manchester City,"[('20/21', 'Virus Infection', 'Dec 27, 2020', ..."
1,Stefan Ortega,Manchester City,"[('21/22', 'Corona virus', 'Dec 17, 2021', 'De..."
2,Scott Carson,Manchester City,"[('20/21', 'Virus Infection', 'Jan 6, 2021', '..."
3,Rúben Dias,Manchester City,"[('21/22', 'Knock', 'Mar 3, 2022', 'Apr 11, 20..."
4,Aymeric Laporte,Manchester City,"[('22/23', 'Knee Surgery', 'Jun 30, 2022', 'Oc..."


### Dropping duplicates

In [None]:
# Keep only the first row for every player name in each table.
df_outfield = df_outfield.drop_duplicates(subset=['player'], keep='first')
df_injury = df_injury.drop_duplicates(subset=['name'], keep='first')
# Print the per-name counts of both tables.
print(df_outfield['player'].value_counts())
print(df_injury['name'].value_counts())

### Merging dataframes

We must now merge the Transfermarkt dataframe (containing the injury data) with the FBRef dataframe (containing in-game statistics).

In [None]:
# Give the FBref columns the same names as their Transfermarkt counterparts.
df_outfield = df_outfield.rename(columns={"player": "name", "team": "club"})
# Inner join on the player name: only players present in both tables remain.
df_injury = df_injury.merge(df_outfield, on='name', how='inner')
df_injury

Unnamed: 0,age,games,games_starts,minutes,cards_yellow,cards_red,minutes_90s,goals,shots,shots_free_kicks,...,ball_recoveries,aerials_won,aerials_lost,n_injuries,n_severe_injuries,currently_injured,position_DF,position_FW,position_GK,position_MF
0,29.273973,14.0,14.0,1260.0,0.0,0.0,14.0,0.0,0.0,0.0,...,19.0,3.0,0.0,6,0,0,0,0,1,0
1,25.534247,12.0,9.0,873.0,0.0,0.0,9.7,0.0,5.0,0.0,...,41.0,26.0,13.0,4,0,0,1,0,0,0
2,28.498630,4.0,3.0,286.0,0.0,0.0,3.2,0.0,7.0,0.0,...,11.0,11.0,4.0,14,4,0,1,0,0,0
3,28.495890,9.0,8.0,731.0,1.0,0.0,8.1,0.0,5.0,0.0,...,37.0,10.0,6.0,15,1,0,1,0,0,0
4,27.767123,8.0,7.0,571.0,0.0,0.0,6.3,0.0,3.0,0.0,...,24.0,13.0,10.0,10,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1969,23.594521,8.0,8.0,514.0,0.0,0.0,5.7,1.0,11.0,0.0,...,23.0,4.0,5.0,3,1,0,0,1,0,1
1970,32.978082,12.0,3.0,392.0,3.0,0.0,4.4,1.0,2.0,0.0,...,16.0,3.0,3.0,2,0,0,1,0,0,1
1971,26.739726,12.0,9.0,730.0,2.0,1.0,8.1,1.0,9.0,0.0,...,27.0,8.0,15.0,4,0,0,0,1,0,0
1972,25.660274,13.0,4.0,512.0,2.0,0.0,5.7,2.0,12.0,0.0,...,13.0,16.0,19.0,5,2,0,0,1,0,0


# Data Cleansing

In this section, we will outline the steps we took to clean the data obtained from Transfermarkt / FBRef and make it usable to train our model.

### Unpacking arrays

There are certain columns containing arrays of tuples (historical data) that cannot be used as features.

Therefore, we must unpack the values within these arrays to make usable features for the model.

In [None]:
import ast
import numpy as np
# Parse the stringified injury lists into Python lists. Values that are already
# parsed are left untouched, so re-running this cell no longer fails.
df_injury['injuries'] = df_injury['injuries'].apply(
    lambda arr: ast.literal_eval(arr) if isinstance(arr, str) else arr
)
# Total number of injuries suffered.
df_injury['n_injuries'] = df_injury['injuries'].apply(len)
# Number of severe injuries, i.e. entries with more than 60 days missed.
df_injury['n_severe_injuries'] = df_injury['injuries'].apply(
    lambda arr: sum(1 for x in arr if int(x[4]) > 60)
)

### Target column

In the cell below we create our target variable column.

In [None]:
from datetime import datetime
def _injury_is_open(end_date):
    """Return True when an injury has no end date yet or ends in the future."""
    return end_date == '-' or datetime.strptime(end_date, "%b %d, %Y") > datetime.now()


# Target column: number of injury entries per player that are still open.
df_injury['currently_injured'] = df_injury['injuries'].apply(
    lambda arr: sum(1 for x in arr if _injury_is_open(x[3]))
)

### Datatype conversions, non-usable features and dummy variables

In [None]:
# Players who have not set foot on the field have 0s in about 90% of the columns.
# We remove these rows from the dataset (total of 24, 1 injured).
df_injury = df_injury[df_injury['minutes_90s'] != 0]

# Drop non-usable features (categorical / non-numerical).
# "club_x" / "club_y" are the two club columns left over from the merge.
# NOTE(review): an in-place drop on the filtered frame above may trigger pandas'
# SettingWithCopyWarning — confirm.
colums_to_drop = ["name", "club_x", "club_y", "Unnamed: 0", "birth_year", "nationality", "injuries"]
df_injury.drop(columns = colums_to_drop, inplace = True)

# Convert age to a float64 column: characters 0-1 are read as years and
# characters 3-5 as days (divided by 365).
# Assumes age strings look like "YY-DDD" — TODO confirm.
df_injury['age'] = df_injury['age'].apply(lambda row : float(row[0:2])+(float(row[3:6])/365))

# Alternate position: the text after the first comma, or '' when there is none.
df_injury['alt_position'] = df_injury['position'].apply(lambda row: row.split(',')[1] if ',' in row else '') 
# Primary position: the first two characters of the original value.
df_injury['position'] = df_injury['position'].apply(lambda row: row[0:2])

# Dummy variables for role data; the alternate-position dummies for DF, MF and
# FW are then added onto the matching primary-position dummies.
df_injury = pd.get_dummies(df_injury, columns=['position', 'alt_position'])
df_injury['position_DF'] = df_injury['position_DF'] + df_injury['alt_position_DF']
df_injury['position_MF'] = df_injury['position_MF'] + df_injury['alt_position_MF']
df_injury['position_FW'] = df_injury['position_FW'] + df_injury['alt_position_FW']

# Fix the column name that carries a trailing comma, then parse its values as
# floats after stripping commas.
# Assumes the column holds strings at this point — TODO confirm.
df_injury = df_injury.rename(columns={'aerials_lost,': 'aerials_lost'})
df_injury['aerials_lost'] = df_injury['aerials_lost'].apply(lambda row: float(row.replace(',', '')))
# NOTE(review): this expression is not the last statement of the cell, so its
# result is presumably not displayed.
df_injury.head()

# Drop the alternate-position dummy columns now that they have been folded in.
colums_to_drop = ["alt_position_", "alt_position_DF", "alt_position_MF", "alt_position_FW"]
df_injury.drop(columns = colums_to_drop, inplace = True)

# Put the season totals on a per-90-minutes basis.
# NOTE(review): the columns are selected by position (7:43), so this depends on
# the current column order — verify whenever upstream columns change.
season_tot_cols = list(df_injury.columns[7:43])
season_tot_cols.extend(['cards_yellow', 'cards_red'])

for col in season_tot_cols:
    # Divide by minutes_90s where it is non-zero; otherwise store 0.
    df_injury[col] = np.where(df_injury['minutes_90s'] != 0, df_injury[col] / df_injury['minutes_90s'], 0)

# Engineered features.
# NOTE(review): assumes 'minutes', 'games' and 'games_starts' were not among the
# columns rescaled above — confirm.
df_injury['minutes_per_appearance'] = df_injury['minutes'] / df_injury['games']
df_injury['game_starts_percent'] = df_injury['games_starts'] / df_injury['games']

# Drop the seasonal cumulative columns.
colums_to_drop = ["games", "games_starts", "minutes", "minutes_90s"]
df_injury.drop(columns = colums_to_drop, inplace = True)

In [None]:
# Show the columns of the final dataset together with their dtypes.
df_injury.dtypes

age                            float64
games                          float64
games_starts                   float64
minutes                        float64
cards_yellow                   float64
cards_red                      float64
minutes_90s                    float64
goals                          float64
shots                          float64
shots_free_kicks               float64
passes_completed               float64
passes                         float64
passes_total_distance          float64
passes_progressive_distance    float64
passes_short                   float64
passes_medium                  float64
passes_long                    float64
passes_live                    float64
passes_dead                    float64
passes_free_kicks              float64
through_balls                  float64
passes_switches                float64
crosses                        float64
corner_kicks                   float64
tackles                        float64
tackles_won              