## Metis Project 2 - Predicting Market Values of Soccer Players in the Top 5 European Leagues Using Linear Regression

### Creating a Dataframe of Player Stats (Features):

#### 1. Standard Stats - League, Position, Age, Matches Played, Starts, Goals, Assists, Yellow Cards, Red Cards
#### 2. Shooting Stats - Total Shots, Shots on Target
#### 3. Passing Stats - Passes Completed, Passes Attempted, Key Passes (passes that directly lead to a shot), Completed Passes into Penalty Area
#### 4. Possession Stats - Number of Players Dribbled Past, Dribbles into Penalty Area


In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time, os
import requests

<u> 2017-2018 Stats URLs </u>

Standard Stats URL: https://fbref.com/en/comps/Big5/2017-2018/stats/players/2017-2018-Big-5-European-Leagues-Stats  
Shooting Stats URL: https://fbref.com/en/comps/Big5/2017-2018/shooting/players/2017-2018-Big-5-European-Leagues-Stats  
Passing Stats URL: https://fbref.com/en/comps/Big5/2017-2018/passing/players/2017-2018-Big-5-European-Leagues-Stats  
Goal & Shot Creating Stats URL: https://fbref.com/en/comps/Big5/2017-2018/gca/players/2017-2018-Big-5-European-Leagues-Stats  
Possession Stats URL: https://fbref.com/en/comps/Big5/2017-2018/possession/players/2017-2018-Big-5-European-Leagues-Stats

### 1. Standard Stats - League, Position, Age, Matches Played, Starts, Goals, Assists, Yellow Cards, Red Cards

In [2]:
def standard_table(URL):
    '''
    input: URL of FBREF website containing soccer player stats
    output: dataframe of the standard stats for all senior players in the top 5 European leagues 
    '''

    # URL html for market value table read in beautiful soup
    page = requests.get(URL).text
    soup = BeautifulSoup(page)
    
    # Finding standard stats table
    standard_table = soup.find('table', id='stats_standard')

    # Put player names in list
    players = [header for header in standard_table.find_all('td', {'class':'left','data-stat':'player'})]
    player_names = [name.text for th in players for name in th.find_all('a')]

    # Put player stats in list
    player_row = [row for row in standard_table.find_all('tr', class_= lambda x: x != 'thead')]

    player_stats = {}

    for player in player_row[2:]: 
        player_stats_list = []
        
        player_stats_list.append(player.find('td',{'data-stat':'comp_level'}).text)        
        player_stats_list.append(player.find('td',{'data-stat':'position'}).text)
        player_stats_list.append(player.find('td',{'data-stat':'age'}).text)
        player_stats_list.append(int(player.find('td',{'data-stat':'games'}).text))
        player_stats_list.append(int(player.find('td',{'data-stat':'games_starts'}).text))
        player_stats_list.append(float(player.find('td',{'data-stat':'minutes_90s'}).text))
        player_stats_list.append(int(player.find('td',{'data-stat':'goals'}).text))
        player_stats_list.append(int(player.find('td',{'data-stat':'assists'}).text))
        player_stats_list.append(int(player.find('td',{'data-stat':'cards_yellow'}).text))
        player_stats_list.append(int(player.find('td',{'data-stat':'cards_red'}).text))
        
        player_stats[player] = player_stats_list
        
    player_stats_list_all = [stats for stats in player_stats.values()]

    # Creating dataframe for players and stats
    player_stats_df = pd.DataFrame(player_stats_list_all)
    player_stats_df.columns = ['League', 'Position', 'Age', 'Matches Played', 'Starts', '90s Played',
       'Goals', 'Assists', 'Yellow Cards', 'Red Cards']
    player_stats_df.index = player_names

    return player_stats_df

In [3]:
# def standard_table(URL):
#     '''
#     input: URL of FBREF website containing soccer player stats
#     output: dataframe of the standard stats for all senior players in the top 5 European leagues 
#     '''

#     # URL html for market value table read in beautiful soup
#     page = requests.get(URL).text
#     soup = BeautifulSoup(page)
    
#     # Finding standard stats table
#     standard_table = soup.find('table', id='stats_standard')

#     # Put player names in list
#     players = [header for header in standard_table.find_all('td', {'class':'left','data-stat':'player'})]
#     player_names = [name.text for th in players for name in th.find_all('a')]

#     # Put player stats in list
#     player_row = [row for row in standard_table.find_all('tr', class_= lambda x: x != 'thead')]

#     # Organize each player's stats in a dictionary form and place them into a master dictionary (nested dictionaries)
#     player_stats = {}

#     for player in player_row[2:]: 
#         player_stats_columns = {}
        
#         player_stats_columns['League'] = player.find('td',{'data-stat':'comp_level'}).text        
#         player_stats_columns['Position'] = player.find('td',{'data-stat':'position'}).text
#         player_stats_columns['Age'] = player.find('td',{'data-stat':'age'}).text
#         player_stats_columns['Matches Played'] = int(player.find('td',{'data-stat':'games'}).text)
#         player_stats_columns['Starts'] = int(player.find('td',{'data-stat':'games_starts'}).text)
#         player_stats_columns['90s Played'] = float(player.find('td',{'data-stat':'minutes_90s'}).text)
#         player_stats_columns['Goals'] = int(player.find('td',{'data-stat':'goals'}).text)
#         player_stats_columns['Assists'] = int(player.find('td',{'data-stat':'assists'}).text)
#         player_stats_columns['Yellow Cards'] = int(player.find('td',{'data-stat':'cards_yellow'}).text)
#         player_stats_columns['Red Cards'] = int(player.find('td',{'data-stat':'cards_red'}).text)
        
#         player_name = player.find('td', {'class':'left','data-stat':'player'}).text
#         player_stats[player_name] = player_stats_columns

#     # Creating dataframe for players and stats
#     player_stats_df = pd.DataFrame(player_stats).T

#     return player_stats_df

In [4]:
player_stats_stand_17_18 = standard_table('https://fbref.com/en/comps/Big5/2017-2018/stats/players/2017-2018-Big-5-European-Leagues-Stats')

In [5]:
player_stats_stand_17_18.loc[['Kylian Mbappé']]

Unnamed: 0,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
Kylian Mbappé,fr Ligue 1,FW,18,27,24,23.3,13,9,2,0
Kylian Mbappé,fr Ligue 1,"FW,MF",18,1,1,0.8,0,0,0,0


In [6]:
def convert_league(string):
    if len(string) >= 1:
        return ' '.join(string.split(' ')[1:])
    else:
        return np.nan

def convert_int(string):
    if len(string) >= 1:
        return int(string)
    else:
        return np.nan

In [7]:
player_stats_stand_17_18['League'] = player_stats_stand_17_18['League'].apply(convert_league)
player_stats_stand_17_18['Age'] = player_stats_stand_17_18['Age'].apply(convert_int)
player_stats_stand_17_18.dropna(inplace=True) # only one player with NaN for Age so simply drop those rows

In [8]:
player_stats_stand_17_18.columns

Index(['League', 'Position', 'Age', 'Matches Played', 'Starts', '90s Played',
       'Goals', 'Assists', 'Yellow Cards', 'Red Cards'],
      dtype='object')

In [9]:
player_stats_stand_17_18.reset_index(inplace=True) 
player_stats_stand_17_18.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards']
player_stats_stand_17_18

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
0,Patrick van Aanholt,Premier League,DF,26.0,28,25,24.3,5,1,7,0
1,Rolando Aarons,Serie A,"MF,FW",21.0,11,6,5.7,0,0,0,0
2,Rolando Aarons,Premier League,"FW,MF",21.0,4,1,1.5,0,0,0,0
3,Ignazio Abate,Serie A,DF,30.0,17,11,11.7,1,0,3,0
4,Aymen Abdennour,Ligue 1,DF,27.0,8,6,5.6,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...
2681,Robert Žulj,Bundesliga,"MF,DF",25.0,5,1,2.3,0,0,1,0
2682,Bongani Zungu,Ligue 1,MF,24.0,26,24,23.2,1,0,5,0
2683,David Zurutuza,La Liga,MF,31.0,31,26,24.8,0,2,2,0
2684,Filip Đuričić,Serie A,"FW,MF",25.0,15,13,10.8,0,1,1,0


### 1.1 Create the same standard stats dataframes for 2018-2019, 2019-2020, and 2020-2021 seasons

In [10]:
player_stats_stand_18_19 = standard_table('https://fbref.com/en/comps/Big5/2018-2019/stats/players/2018-2019-Big-5-European-Leagues-Stats')

In [11]:
player_stats_stand_18_19['League'] = player_stats_stand_18_19['League'].apply(convert_league)
player_stats_stand_18_19['Age'] = player_stats_stand_18_19['Age'].apply(convert_int)
player_stats_stand_18_19.dropna(inplace=True) # only one player with NaN for Age so simply drop those rows

In [12]:
player_stats_stand_18_19.reset_index(inplace=True)
player_stats_stand_18_19.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards']
player_stats_stand_18_19

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
0,Ismael Aaneba,Ligue 1,DF,19.0,3,3,2.3,0,0,3,0
1,Patrick van Aanholt,Premier League,DF,27.0,36,36,35.5,3,2,3,0
2,Ignazio Abate,Serie A,DF,31.0,19,15,14.5,0,0,4,0
3,Issah Abbas,Bundesliga,FW,19.0,1,0,0.1,0,0,0,0
4,Yunis Abdelhamid,Ligue 1,DF,30.0,38,38,38.0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...
2652,Ervin Zukanović,Serie A,DF,31.0,25,22,21.8,0,0,5,0
2653,Bongani Zungu,Ligue 1,MF,25.0,5,3,2.6,0,0,2,0
2654,David Zurutuza,La Liga,"MF,FW",32.0,21,13,13.6,2,0,2,0
2655,Filip Đorđević,Serie A,FW,30.0,13,6,7.0,1,1,1,0


In [13]:
player_stats_stand_19_20 = standard_table('https://fbref.com/en/comps/Big5/2019-2020/stats/players/2019-2020-Big-5-European-Leagues-Stats')

In [14]:
player_stats_stand_19_20['League'] = player_stats_stand_19_20['League'].apply(convert_league)
player_stats_stand_19_20['Age'] = player_stats_stand_19_20['Age'].apply(convert_int)

In [15]:
player_stats_stand_19_20[player_stats_stand_19_20['Age'].isnull()]

Unnamed: 0,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
Miguel Atienza,La Liga,MF,,3,0,0.3,0,0,0,0
Andrea Ghezzi,Serie A,"DF,MF",,5,0,0.4,0,0,0,0
Jordan Thomas,Premier League,FW,,1,0,0.0,0,0,0,0
Luke Thomas,Premier League,DF,,3,3,3.0,0,1,0,0
Stefano Turati,Serie A,GK,,2,2,2.0,0,0,0,0


In [16]:
player_stats_stand_19_20.dropna(inplace=True) # only two forwards/midfielders with NaN for Age so simply drop those rows

In [17]:
player_stats_stand_19_20.reset_index(inplace=True)
player_stats_stand_19_20.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards']
player_stats_stand_19_20

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
0,Patrick van Aanholt,Premier League,DF,28.0,29,29,27.9,3,2,0,0
1,Max Aarons,Premier League,DF,19.0,36,36,36.0,0,1,7,0
2,Yunis Abdelhamid,Ligue 1,DF,31.0,28,28,28.0,3,0,0,0
3,Suleiman Abdullahi,Bundesliga,"FW,MF",22.0,6,1,1.7,1,0,1,0
4,Mehdi Abeid,Ligue 1,MF,26.0,25,20,20.2,1,2,5,0
...,...,...,...,...,...,...,...,...,...,...,...
2722,Bongani Zungu,Ligue 1,MF,26.0,21,10,11.0,1,0,4,0
2723,Szymon Żurkowski,Serie A,MF,21.0,2,0,0.1,0,0,0,0
2724,David Zurutuza,La Liga,MF,33.0,5,2,1.7,0,0,0,0
2725,Martin Ødegaard,La Liga,MF,20.0,31,29,28.1,4,6,3,0


In [18]:
player_stats_stand_20_21 = standard_table('https://fbref.com/en/comps/Big5/2020-2021/stats/players/2010-2021-Big-5-European-Leagues-Stats')

In [19]:
player_stats_stand_20_21['League'] = player_stats_stand_20_21['League'].apply(convert_league)
player_stats_stand_20_21['Age'] = player_stats_stand_20_21['Age'].apply(lambda string: int(string[0:2]) if len(string[0:2])==2 else np.nan)

In [20]:
player_stats_stand_20_21[player_stats_stand_20_21['Age'].isnull()]

Unnamed: 0,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
Miguel Atienza,La Liga,"MF,FW",,2,1,1.2,0,0,1,0
Jonatan Carmona Alamo,La Liga,MF,,1,1,0.8,0,0,0,0
Eren Dinkci,Bundesliga,FW,,1,0,0.1,1,0,0,0
Dani Plomer,La Liga,FW,,1,0,0.5,0,0,0,0
Luke Thomas,Premier League,DF,,4,4,3.0,0,0,1,0


In [21]:
player_stats_stand_20_21.dropna(inplace=True)

In [22]:
player_stats_stand_20_21.reset_index(inplace=True)
player_stats_stand_20_21.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards']
player_stats_stand_20_21

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
0,Ismael Aaneba,Ligue 1,"FW,DF",21.0,2,0,0.1,0,0,0,0
1,Patrick van Aanholt,Premier League,DF,30.0,10,9,9.2,0,1,1,0
2,Issah Abbas,Bundesliga,"DF,FW",22.0,2,0,0.2,0,0,0,0
3,Yunis Abdelhamid,Ligue 1,DF,33.0,19,19,19.0,2,0,1,0
4,Mehdi Abeid,Ligue 1,MF,28.0,17,14,13.7,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...
2489,Igor Zubeldia,La Liga,DF,23.0,12,10,10.7,0,0,3,0
2490,Steven Zuber,Bundesliga,DF,29.0,9,5,5.0,0,1,1,0
2491,Martín Zubimendi,La Liga,MF,21.0,17,8,9.8,0,0,3,0
2492,Martin Ødegaard,La Liga,"MF,FW",22.0,7,3,2.6,0,0,0,0


### 1.2 Combine the dataframes and sum the numerical stats (for age, take the most recent value)

In [23]:
player_stats_stand_dup = pd.concat([player_stats_stand_17_18, player_stats_stand_18_19, player_stats_stand_19_20, player_stats_stand_20_21]).sort_values(by=['Player Name','Age'])
player_stats_stand_dup.tail(15)

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
2295,Łukasz Skorupski,Serie A,GK,27.0,38,38,38.0,0,0,1,0
2338,Łukasz Skorupski,Serie A,GK,28.0,37,37,37.0,0,0,2,0
2166,Łukasz Skorupski,Serie A,GK,29.0,11,11,11.0,0,0,0,0
2409,Łukasz Teodorczyk,Serie A,"FW,MF",27.0,16,2,4.4,1,0,1,0
2459,Łukasz Teodorczyk,Serie A,FW,28.0,14,1,2.5,0,1,1,0
2580,Šime Vrsaljko,La Liga,"DF,MF",25.0,21,19,18.9,0,4,7,0
2549,Šime Vrsaljko,Serie A,DF,26.0,10,8,8.1,0,2,2,0
2622,Šime Vrsaljko,La Liga,DF,27.0,5,4,4.0,0,1,3,0
2399,Šime Vrsaljko,La Liga,DF,29.0,2,2,1.6,0,0,0,0
454,Žan Celar,Serie A,FW,19.0,1,0,0.1,0,0,0,0


In [24]:
player_stats_stand_dup[player_stats_stand_dup['Player Name']=='Kylian Mbappé']

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
1606,Kylian Mbappé,Ligue 1,FW,18.0,27,24,23.3,13,9,2,0
1607,Kylian Mbappé,Ligue 1,"FW,MF",18.0,1,1,0.8,0,0,0,0
1573,Kylian Mbappé,Ligue 1,FW,19.0,29,24,26.0,33,7,5,1
1601,Kylian Mbappé,Ligue 1,FW,20.0,20,17,16.8,18,5,0,0
1489,Kylian Mbappé,Ligue 1,FW,22.0,16,12,11.9,12,5,1,0


In [25]:
player_stats_stand = player_stats_stand_dup.groupby('Player Name')[['Matches Played','Starts','90s Played','Goals','Assists','Yellow Cards','Red Cards']].sum().reset_index()
# player_stats_stand[player_stats_stand['Player Name']=='Ignazio Abate']

In [26]:
player_stats_stand_lastr = player_stats_stand_dup.drop_duplicates(subset='Player Name', keep='last').reset_index(drop=True)

In [27]:
player_stats_stand['Age'] = player_stats_stand_lastr['Age']

In [28]:
player_stats_stand['League'] = player_stats_stand_lastr['League']
player_stats_stand['Position'] = player_stats_stand_lastr['Position']

In [29]:
player_stats_stand.columns

Index(['Player Name', 'Matches Played', 'Starts', '90s Played', 'Goals',
       'Assists', 'Yellow Cards', 'Red Cards', 'Age', 'League', 'Position'],
      dtype='object')

In [30]:
player_stats_stand = pd.DataFrame(player_stats_stand, columns=['Player Name','League', 'Position', 'Age', 'Matches Played', 'Starts', '90s Played', 'Goals',
       'Assists', 'Yellow Cards', 'Red Cards'])

In [31]:
player_stats_stand[player_stats_stand['Player Name']=='Kylian Mbappé']

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
2339,Kylian Mbappé,Ligue 1,FW,22.0,93,78,78.8,76,26,8,1


In [32]:
player_stats_stand

Unnamed: 0,Player Name,League,Position,Age,Matches Played,Starts,90s Played,Goals,Assists,Yellow Cards,Red Cards
0,Aaron Connolly,Premier League,FW,20.0,35,21,20.5,5,2,0,0
1,Aaron Cresswell,Premier League,DF,31.0,106,103,101.1,4,8,15,0
2,Aaron Hickey,Serie A,DF,18.0,10,9,8.1,0,0,3,0
3,Aaron Hunt,Bundesliga,"MF,FW",30.0,28,26,23.1,3,2,1,0
4,Aaron Lennon,Premier League,MF,32.0,61,40,40.2,1,3,7,0
...,...,...,...,...,...,...,...,...,...,...,...
4470,Łukasz Teodorczyk,Serie A,FW,28.0,30,3,6.9,1,1,2,0
4471,Šime Vrsaljko,La Liga,DF,29.0,38,33,32.6,0,7,12,0
4472,Žan Celar,Serie A,FW,19.0,1,0,0.1,0,0,0,0
4473,Žan Majer,Serie A,MF,27.0,27,19,18.5,1,1,7,0


### 1.3 Save the standard stats dataframes as .csv files

In [33]:
player_stats_stand_17_18.to_csv('../Data/player_stats_stand_17_18.csv', index=False)
player_stats_stand_18_19.to_csv('../Data/player_stats_stand_18_19.csv', index=False)
player_stats_stand_19_20.to_csv('../Data/player_stats_stand_19_20.csv', index=False)
player_stats_stand_20_21.to_csv('../Data/player_stats_stand_20_21.csv', index=False)
player_stats_stand.to_csv('../Data/player_stats_stand.csv', index=False)

### 2. Shooting Stats - Total Shots, Shots on Target

In [34]:
def shooting_table(URL):
    '''
    input: URL of FBREF website containing soccer player stats
    output: dataframe of the shooting stats for all senior players in the top 5 European leagues 
    '''

    # URL html for market value table read in beautiful soup
    page = requests.get(URL).text
    soup = BeautifulSoup(page)
    
    # Finding standard stats table
    shooting_table = soup.find('table', id='stats_shooting')

    # Put player names in list
    players = [header for header in shooting_table.find_all('td', {'class':'left','data-stat':'player'})]
    player_names = [name.text for th in players for name in th.find_all('a')]

    # Put player stats in list
    player_row = [row for row in shooting_table.find_all('tr', class_= lambda x: x != 'thead')]

    player_stats = {}

    for player in player_row[2:]: 
        player_stats_list = []
        
        player_stats_list.append(player.find('td',{'data-stat':'shots_total'}).text)        
        player_stats_list.append(int(player.find('td',{'data-stat':'shots_on_target'}).text))
        
        player_stats[player] = player_stats_list
        
    player_stats_list_all = [stats for stats in player_stats.values()]

    # Creating dataframe for players and stats
    player_stats_df = pd.DataFrame(player_stats_list_all)
    player_stats_df.columns = ['Total Shots', 'Shots on Target']
    player_stats_df.index = player_names

    return player_stats_df

In [35]:
# def shooting_table(URL):
#     '''
#     input: URL of FBREF website containing soccer player stats
#     output: dataframe of the shooting stats for all senior players in the top 5 European leagues 
#     '''

#     # URL html for market value table read in beautiful soup
#     page = requests.get(URL).text
#     soup = BeautifulSoup(page)
    
#     # Finding shooting stats table
#     shooting_table = soup.find('table', id='stats_shooting')

#     # Put player names in list
#     players = [header for header in shooting_table.find_all('td', {'class':'left','data-stat':'player'})]
#     player_names = [name.text for th in players for name in th.find_all('a')]

#     # Put player stats in list
#     player_row = [row for row in shooting_table.find_all('tr', class_= lambda x: x != 'thead')]

#     # Organize each player's stats in a dictionary form and place them into a master dictionary (nested dictionaries)
#     player_stats = {}

#     for player in player_row[2:]: 
#         player_stats_columns = {}
        
#         player_stats_columns['Total Shots'] = player.find('td',{'data-stat':'shots_total'}).text        
#         player_stats_columns['Shots on Target'] = int(player.find('td',{'data-stat':'shots_on_target'}).text)
        
#         player_name = player.find('td', {'class':'left','data-stat':'player'}).text
#         player_stats[player_name] = player_stats_columns

#     # Creating dataframe for players and stats
#     player_stats_df = pd.DataFrame(player_stats).T

#     return player_stats_df

In [36]:
player_stats_shoot_17_18 = shooting_table('https://fbref.com/en/comps/Big5/2017-2018/shooting/players/2017-2018-Big-5-European-Leagues-Stats')

In [37]:
player_stats_shoot_17_18['Total Shots'] = player_stats_shoot_17_18['Total Shots'].apply(convert_int)
player_stats_shoot_17_18.dropna(inplace=True) 

In [38]:
player_stats_shoot_17_18.reset_index(inplace=True) 
player_stats_shoot_17_18.columns = ['Player Name', 'Total Shots', 'Shots on Target']
player_stats_shoot_17_18

Unnamed: 0,Player Name,Total Shots,Shots on Target
0,Patrick van Aanholt,33.0,12
1,Rolando Aarons,2.0,0
2,Rolando Aarons,3.0,0
3,Ignazio Abate,4.0,2
4,Aymen Abdennour,2.0,1
...,...,...,...
2675,Robert Žulj,5.0,0
2676,Bongani Zungu,9.0,4
2677,David Zurutuza,17.0,4
2678,Filip Đuričić,0.0,0


### 2.1 Create the same shooting stats dataframes for 2018-2019, 2019-2020, and 2020-2021 seasons

In [39]:
player_stats_shoot_18_19 = shooting_table('https://fbref.com/en/comps/Big5/2018-2019/shooting/players/2018-2019-Big-5-European-Leagues-Stats')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
player_stats_shoot_18_19['Total Shots'] = player_stats_shoot_18_19['Total Shots'].apply(convert_int)

In [None]:
player_stats_shoot_18_19.reset_index(inplace=True) 
player_stats_shoot_18_19.columns = ['Player Name', 'Total Shots', 'Shots on Target']
player_stats_shoot_18_19

In [None]:
player_stats_shoot_19_20 = shooting_table('https://fbref.com/en/comps/Big5/2019-2020/shooting/players/2019-2020-Big-5-European-Leagues-Stats')

In [None]:
player_stats_shoot_19_20['Total Shots'] = player_stats_shoot_19_20['Total Shots'].apply(convert_int)

In [None]:
player_stats_shoot_19_20.reset_index(inplace=True) 
player_stats_shoot_19_20.columns = ['Player Name', 'Total Shots', 'Shots on Target']
player_stats_shoot_19_20

In [None]:
player_stats_shoot_20_21 = shooting_table('https://fbref.com/en/comps/Big5/2020-2021/shooting/players/2020-2021-Big-5-European-Leagues-Stats')

In [None]:
player_stats_shoot_20_21['Total Shots'] = player_stats_shoot_20_21['Total Shots'].apply(convert_int)

In [None]:
player_stats_shoot_20_21.reset_index(inplace=True) 
player_stats_shoot_20_21.columns = ['Player Name', 'Total Shots', 'Shots on Target']
player_stats_shoot_20_21

### 2.2 Combine the dataframes and sum the numerical stats 

In [None]:
player_stats_shoot_dup = pd.concat([player_stats_shoot_17_18, player_stats_shoot_18_19, player_stats_shoot_19_20, player_stats_shoot_20_21]).sort_values(by='Player Name')
player_stats_shoot_dup.head(15)

In [None]:
player_stats_shoot1 = player_stats_shoot_dup.groupby('Player Name')['Total Shots'].sum().reset_index()
player_stats_shoot2 = player_stats_shoot_dup.groupby('Player Name')['Shots on Target'].sum().reset_index()
player_stats_shoot = player_stats_shoot1.merge(player_stats_shoot2, on='Player Name')

In [None]:
player_stats_shoot

### 2.3 Save the shooting stats dataframes as .csv files

In [None]:
player_stats_shoot_17_18.to_csv('../Data/player_stats_shoot_17_18.csv', index=False)
player_stats_shoot_18_19.to_csv('../Data/player_stats_shoot_18_19.csv', index=False)
player_stats_shoot_19_20.to_csv('../Data/player_stats_shoot_19_20.csv', index=False)
player_stats_shoot_20_21.to_csv('../Data/player_stats_shoot_20_21.csv', index=False)
player_stats_shoot.to_csv('../Data/player_stats_shoot.csv', index=False)

### 3. Passing Stats - Passes Completed, Passes Attempted, Key Passes (passes that directly lead to a shot), Completed Passes into Penalty Area

In [None]:
def passing_table(URL):
    '''
    input: URL of FBREF website containing soccer player stats
    output: dataframe of the passing stats for all senior players in the top 5 European leagues 
    '''

    # URL html for market value table read in beautiful soup
    page = requests.get(URL).text
    soup = BeautifulSoup(page)
    
    # Finding standard stats table
    passing_table = soup.find('table', id='stats_passing')

    # Put player names in list
    players = [header for header in passing_table.find_all('td', {'class':'left','data-stat':'player'})]
    player_names = [name.text for th in players for name in th.find_all('a')]

    # Put player stats in list
    player_row = [row for row in passing_table.find_all('tr', class_= lambda x: x != 'thead')]

    player_stats = {}

    for player in player_row[2:]: 
        player_stats_list = []
        
        player_stats_list.append(player.find('td',{'data-stat':'passes_completed'}).text)        
        player_stats_list.append(player.find('td',{'data-stat':'passes'}).text)
        player_stats_list.append(player.find('td',{'data-stat':'assisted_shots'}).text)
        player_stats_list.append(player.find('td',{'data-stat':'passes_into_penalty_area'}).text)
        
        player_stats[player] = player_stats_list
        
    player_stats_list_all = [stats for stats in player_stats.values()]

    # Creating dataframe for players and stats
    player_stats_df = pd.DataFrame(player_stats_list_all)
    player_stats_df.columns = ['Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']
    player_stats_df.index = player_names

    return player_stats_df

In [None]:
# def passing_table(URL):
#     '''
#     input: URL of FBREF website containing soccer player stats
#     output: dataframe of the passing stats for all senior players in the top 5 European leagues 
#     '''

#     # URL html for market value table read in beautiful soup
#     page = requests.get(URL).text
#     soup = BeautifulSoup(page)
    
#     # Finding passing stats table
#     passing_table = soup.find('table', id='stats_passing')

#     # Put player names in list
#     players = [header for header in passing_table.find_all('td', {'class':'left','data-stat':'player'})]
#     player_names = [name.text for th in players for name in th.find_all('a')]

#     # Put player stats in list
#     player_row = [row for row in passing_table.find_all('tr', class_= lambda x: x != 'thead')]

#     # Organize each player's stats in a dictionary form and place them into a master dictionary (nested dictionaries)
#     player_stats = {}

#     for player in player_row[2:]: 
#         player_stats_columns = {}
        
#         player_stats_columns['Passes Completed'] = player.find('td',{'data-stat':'passes_completed'}).text        
#         player_stats_columns['Passes Attempted'] = player.find('td',{'data-stat':'passes'}).text
#         player_stats_columns['Key Passes'] = player.find('td',{'data-stat':'assisted_shots'}).text
#         player_stats_columns['Completed Passes into PA'] = player.find('td',{'data-stat':'passes_into_penalty_area'}).text
        
#         player_name = player.find('td', {'class':'left','data-stat':'player'}).text
#         player_stats[player_name] = player_stats_columns

#     # Creating dataframe for players and stats
#     player_stats_df = pd.DataFrame(player_stats).T

#     return player_stats_df

In [None]:
player_stats_pass_17_18 = passing_table('https://fbref.com/en/comps/Big5/2017-2018/passing/players/2017-2018-Big-5-European-Leagues-Stats')

In [None]:
player_stats_pass_17_18['Passes Completed'] = player_stats_pass_17_18['Passes Completed'].apply(convert_int)
player_stats_pass_17_18['Passes Attempted'] = player_stats_pass_17_18['Passes Attempted'].apply(convert_int)
player_stats_pass_17_18['Key Passes'] = player_stats_pass_17_18['Key Passes'].apply(convert_int)
player_stats_pass_17_18['Completed Passes into PA'] = player_stats_pass_17_18['Completed Passes into PA'].apply(convert_int)

player_stats_pass_17_18.dropna(inplace=True) 

In [None]:
player_stats_pass_17_18.reset_index(inplace=True) 
player_stats_pass_17_18.columns = ['Player Name', 'Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']
player_stats_pass_17_18

### 3.1 Create the same passing stats dataframes for 2018-2019 and 2019-2020 seasons

In [None]:
player_stats_pass_18_19 = passing_table('https://fbref.com/en/comps/Big5/2018-2019/passing/players/2018-2019-Big-5-European-Leagues-Stats')

In [None]:
player_stats_pass_18_19['Passes Completed'] = player_stats_pass_18_19['Passes Completed'].apply(convert_int)
player_stats_pass_18_19['Passes Attempted'] = player_stats_pass_18_19['Passes Attempted'].apply(convert_int)
player_stats_pass_18_19['Key Passes'] = player_stats_pass_18_19['Key Passes'].apply(convert_int)
player_stats_pass_18_19['Completed Passes into PA'] = player_stats_pass_18_19['Completed Passes into PA'].apply(convert_int)

player_stats_pass_18_19.dropna(inplace=True) 

In [None]:
player_stats_pass_18_19.reset_index(inplace=True) 
player_stats_pass_18_19.columns = ['Player Name', 'Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']
player_stats_pass_18_19

In [None]:
player_stats_pass_19_20 = passing_table('https://fbref.com/en/comps/Big5/2019-2020/passing/players/2019-2020-Big-5-European-Leagues-Stats')

In [None]:
player_stats_pass_19_20['Passes Completed'] = player_stats_pass_19_20['Passes Completed'].apply(convert_int)
player_stats_pass_19_20['Passes Attempted'] = player_stats_pass_19_20['Passes Attempted'].apply(convert_int)
player_stats_pass_19_20['Key Passes'] = player_stats_pass_19_20['Key Passes'].apply(convert_int)
player_stats_pass_19_20['Completed Passes into PA'] = player_stats_pass_19_20['Completed Passes into PA'].apply(convert_int)

player_stats_pass_19_20.dropna(inplace=True) 

In [None]:
player_stats_pass_19_20.reset_index(inplace=True) 
player_stats_pass_19_20.columns = ['Player Name', 'Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']
player_stats_pass_19_20

In [None]:
player_stats_pass_20_21 = passing_table('https://fbref.com/en/comps/Big5/2020-2021/passing/players/2020-2021-Big-5-European-Leagues-Stats')

In [None]:
player_stats_pass_20_21['Passes Completed'] = player_stats_pass_20_21['Passes Completed'].apply(convert_int)
player_stats_pass_20_21['Passes Attempted'] = player_stats_pass_20_21['Passes Attempted'].apply(convert_int)
player_stats_pass_20_21['Key Passes'] = player_stats_pass_20_21['Key Passes'].apply(convert_int)
player_stats_pass_20_21['Completed Passes into PA'] = player_stats_pass_20_21['Completed Passes into PA'].apply(convert_int)

player_stats_pass_20_21.dropna(inplace=True) 

In [None]:
player_stats_pass_20_21.reset_index(inplace=True) 
player_stats_pass_20_21.columns = ['Player Name', 'Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']
player_stats_pass_20_21

### 3.2 Combine the dataframes and sum the numerical stats 

In [None]:
player_stats_pass_dup = pd.concat([player_stats_pass_17_18, player_stats_pass_18_19, player_stats_pass_19_20, player_stats_pass_20_21]).sort_values(by='Player Name')
player_stats_pass_dup.head(15)

In [None]:
player_stats_pass = player_stats_pass_dup.groupby('Player Name')[['Passes Completed', 'Passes Attempted', 'Key Passes', 'Completed Passes into PA']].sum().reset_index()

In [None]:
player_stats_pass

### 3.3 Save the passing stats dataframes as .csv files

In [None]:
player_stats_pass_17_18.to_csv('../Data/player_stats_pass_17_18.csv', index=False)
player_stats_pass_18_19.to_csv('../Data/player_stats_pass_18_19.csv', index=False)
player_stats_pass_19_20.to_csv('../Data/player_stats_pass_19_20.csv', index=False)
player_stats_pass_20_21.to_csv('../Data/player_stats_pass_20_21.csv', index=False)
player_stats_pass.to_csv('../Data/player_stats_pass.csv', index=False)

### 4. Possession Stats - Number of Players Dribbled Past, Dribbles into Penalty Area


In [None]:
def possession_table(URL):
    '''
    input: URL of FBREF website containing soccer player stats
    output: dataframe of the possession stats for all senior players in the top 5 European leagues 
    '''

    # URL html for market value table read in beautiful soup
    page = requests.get(URL).text
    soup = BeautifulSoup(page)
    
    # Finding standard stats table
    possession_table = soup.find('table', id='stats_possession')

    # Put player names in list
    players = [header for header in possession_table.find_all('td', {'class':'left','data-stat':'player'})]
    player_names = [name.text for th in players for name in th.find_all('a')]

    # Put player stats in list
    player_row = [row for row in possession_table.find_all('tr', class_= lambda x: x != 'thead')]

    player_stats = {}

    for player in player_row[2:]: 
        player_stats_list = []
        
        player_stats_list.append(player.find('td',{'data-stat':'players_dribbled_past'}).text)        
        player_stats_list.append(player.find('td',{'data-stat':'carries_into_penalty_area'}).text)
        
        player_stats[player] = player_stats_list
        
    player_stats_list_all = [stats for stats in player_stats.values()]

    # Creating dataframe for players and stats
    player_stats_df = pd.DataFrame(player_stats_list_all)
    player_stats_df.columns = ['Players Dribbled Past', 'Dribbles into PA']
    player_stats_df.index = player_names

    return player_stats_df

In [None]:
# def possession_table(URL):
#     '''
#     input: URL of FBREF website containing soccer player stats
#     output: dataframe of the possession stats for all senior players in the top 5 European leagues 
#     '''

#     # URL html for market value table read in beautiful soup
#     page = requests.get(URL).text
#     soup = BeautifulSoup(page)
    
#     # Finding shooting stats table
#     possession_table = soup.find('table', id='stats_possession')

#     # Put player names in list
#     players = [header for header in possession_table.find_all('td', {'class':'left','data-stat':'player'})]
#     player_names = [name.text for th in players for name in th.find_all('a')]

#     # Put player stats in list
#     player_row = [row for row in possession_table.find_all('tr', class_= lambda x: x != 'thead')]

#     # Organize each player's stats in a dictionary form and place them into a master dictionary (nested dictionaries)
#     player_stats = {}

#     for player in player_row[2:]: 
#         player_stats_columns = {}
              
#         player_stats_columns['Players Dribbled Past'] = player.find('td',{'data-stat':'players_dribbled_past'}).text
#         player_stats_columns['Dribbles into PA'] = player.find('td',{'data-stat':'carries_into_penalty_area'}).text
        
#         player_name = player.find('td', {'class':'left','data-stat':'player'}).text
#         player_stats[player_name] = player_stats_columns

#     # Creating dataframe for players and stats
#     player_stats_df = pd.DataFrame(player_stats).T

#     return player_stats_df

In [None]:
player_stats_possess_17_18 = possession_table('https://fbref.com/en/comps/Big5/2017-2018/possession/players/2017-2018-Big-5-European-Leagues-Stats')

In [None]:
player_stats_possess_17_18['Players Dribbled Past'] = player_stats_possess_17_18['Players Dribbled Past'].apply(convert_int)
player_stats_possess_17_18['Dribbles into PA'] = player_stats_possess_17_18['Dribbles into PA'].apply(convert_int)

player_stats_possess_17_18.dropna(inplace=True) 

In [None]:
player_stats_possess_17_18.reset_index(inplace=True) 
player_stats_possess_17_18.columns = ['Player Name', 'Players Dribbled Past', 'Dribbles into PA']
player_stats_possess_17_18

### 4.1 Create the same possession stats dataframes for 2018-2019 and 2019-2020 seasons

In [None]:
player_stats_possess_18_19 = possession_table('https://fbref.com/en/comps/Big5/2018-2019/possession/players/2018-2019-Big-5-European-Leagues-Stats')

In [None]:
player_stats_possess_18_19['Players Dribbled Past'] = player_stats_possess_18_19['Players Dribbled Past'].apply(convert_int)
player_stats_possess_18_19['Dribbles into PA'] = player_stats_possess_18_19['Dribbles into PA'].apply(convert_int)

player_stats_possess_18_19.dropna(inplace=True) 

In [None]:
player_stats_possess_18_19.reset_index(inplace=True) 
player_stats_possess_18_19.columns = ['Player Name', 'Players Dribbled Past', 'Dribbles into PA']
player_stats_possess_18_19

In [None]:
player_stats_possess_19_20 = possession_table('https://fbref.com/en/comps/Big5/2019-2020/possession/players/2019-2020-Big-5-European-Leagues-Stats')

In [None]:
player_stats_possess_19_20['Players Dribbled Past'] = player_stats_possess_19_20['Players Dribbled Past'].apply(convert_int)
player_stats_possess_19_20['Dribbles into PA'] = player_stats_possess_19_20['Dribbles into PA'].apply(convert_int)

player_stats_possess_19_20.dropna(inplace=True) 

In [None]:
player_stats_possess_19_20.reset_index(inplace=True) 
player_stats_possess_19_20.columns = ['Player Name', 'Players Dribbled Past', 'Dribbles into PA']
player_stats_possess_19_20

In [None]:
player_stats_possess_20_21 = possession_table('https://fbref.com/en/comps/Big5/2020-2021/possession/players/2020-2021-Big-5-European-Leagues-Stats')

In [None]:
player_stats_possess_20_21['Players Dribbled Past'] = player_stats_possess_20_21['Players Dribbled Past'].apply(convert_int)
player_stats_possess_20_21['Dribbles into PA'] = player_stats_possess_20_21['Dribbles into PA'].apply(convert_int)

player_stats_possess_20_21.dropna(inplace=True) 

In [None]:
player_stats_possess_20_21.reset_index(inplace=True) 
player_stats_possess_20_21.columns = ['Player Name', 'Players Dribbled Past', 'Dribbles into PA']
player_stats_possess_20_21

### 4.2 Combine the dataframes and sum the numerical stats

In [None]:
player_stats_possess_dup = pd.concat([player_stats_possess_17_18, player_stats_possess_18_19, player_stats_possess_19_20, player_stats_possess_20_21]).sort_values(by='Player Name')
player_stats_possess_dup.head(15)

In [None]:
player_stats_possess = player_stats_possess_dup.groupby('Player Name')[['Players Dribbled Past', 'Dribbles into PA']].sum().reset_index()

In [None]:
player_stats_possess

### 4.3 Save the possession stats dataframes as .csv files

In [None]:
player_stats_possess_17_18.to_csv('../Data/player_stats_possess_17_18.csv', index=False)
player_stats_possess_18_19.to_csv('../Data/player_stats_possess_18_19.csv', index=False)
player_stats_possess_19_20.to_csv('../Data/player_stats_possess_19_20.csv', index=False)
player_stats_possess_20_21.to_csv('../Data/player_stats_possess_20_21.csv', index=False)
player_stats_possess.to_csv('../Data/player_stats_possess.csv', index=False)

### 5. Merge all player stats (standard, shooting, passing, and possession) into a single dataframe

In [None]:
player_stats_stand.set_index('Player Name', inplace=True)
player_stats_shoot.set_index('Player Name', inplace=True)
player_stats_pass.set_index('Player Name', inplace=True)
player_stats_possess.set_index('Player Name', inplace=True)

In [None]:
player_stats_all = pd.concat([player_stats_stand, player_stats_shoot, player_stats_pass, player_stats_possess], axis=1).reset_index()

In [None]:
#player_stats_all = player_stats_all.T.reset_index().drop_duplicates(subset='index', keep='first').set_index('index').T

In [None]:
#player_stats_all.dropna(inplace=True)

In [None]:
player_stats_all.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards',
       'Total Shots', 'Shots on Target', 'Passes Completed',
       'Passes Attempted', 'Key Passes', 'Completed Passes into PA',
       'Players Dribbled Past', 'Dribbles into PA']

In [None]:
player_stats_all.dropna(inplace=True)

In [None]:
player_stats_all.reset_index(drop=True, inplace=True)

In [None]:
player_stats_all

In [None]:
player_stats_all.to_csv('../Data/player_stats_all.csv',index=False)

### 6. Merge player stats and market values into a final dataframe

In [None]:
player_market_values = pd.read_csv('../Data/pmv_table.csv')

In [None]:
player_stats_all

In [None]:
player_stats_all.set_index('Player Name', inplace=True)
player_market_values.set_index('Player Name', inplace=True)

In [None]:
player_stats_values_df = pd.concat([player_stats_all,player_market_values], axis=1)

In [None]:
player_stats_values_df

In [None]:
player_stats_values_df.dropna(inplace=True)

In [None]:
player_stats_values_df = player_stats_values_df.sort_values(by='Current Market Value (USD)', ascending=False).reset_index()
player_stats_values_df.columns = ['Player Name', 'League', 'Position', 'Age', 'Matches Played', 'Starts',
       '90s Played', 'Goals', 'Assists', 'Yellow Cards', 'Red Cards',
       'Total Shots', 'Shots on Target', 'Passes Completed',
       'Passes Attempted', 'Key Passes', 'Completed Passes into PA',
       'Players Dribbled Past', 'Dribbles into PA',
       'Current Market Value (USD)']

In [None]:
player_stats_values_df

In [None]:
player_stats_values_df.to_csv('../Data/player_stats_values.csv', index=False)