## Metis Project 2 - Predicting Market Values of Soccer Players in the Top 5 European Leagues Using Linear Regression

### Creating a Dataframe of Player Market Values (Target Variable)

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time, os
import requests

In [2]:
# Magnitude suffixes that can follow the number in a displayed market value, with their multipliers.
# 'm' and 'Th.' are the two suffixes the original parser recognised; 'bn' and 'k' are accepted as well
# so that such values are parsed instead of silently becoming NaN.
_MARKET_VALUE_MULTIPLIERS = {'bn': 1000000000, 'm': 1000000, 'Th.': 1000, 'k': 1000}

# A number (optional decimal part) followed by one of the suffixes above at the end of the string.
# Anything in front of the number (e.g. a currency symbol) is ignored.
_MARKET_VALUE_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*(bn|m|Th\.|k)$')


def _convert_market_value(value):
    '''
    input: market value text as shown in the table, e.g. '$11.00m' or '$660Th.'
    output: the value as a float (e.g. 11000000.0), or None when the text is not a recognised value
    '''
    match = _MARKET_VALUE_PATTERN.search(value.strip())
    if match is None:
        # Unrecognised text (e.g. a '-' placeholder) ends up as NaN in the dataframe column
        return None
    number, suffix = match.groups()
    # round() removes binary floating point noise: float('1.10') * 1000000 == 1100000.0000000002
    return float(round(float(number) * _MARKET_VALUE_MULTIPLIERS[suffix]))


def market_value_table(URL):
    '''
    input: URL of a transfermarkt.us page containing a player market value table
    output: dataframe with the columns 'Player Name' and 'Current Market Value (USD)' holding one row
            per player name listed on that single page (values that cannot be parsed are NaN)
    raises: requests.HTTPError if the server answers with an error status,
            ValueError if the page does not contain the market value table
    '''
    # A browser-like User-Agent is sent with the request so the page can be scraped
    headers = {'User-Agent': 
               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    
    # The timeout stops a stalled connection from hanging the notebook; error statuses fail loudly here
    # instead of surfacing later as a confusing parsing error
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()
    
    # Naming the parser explicitly avoids bs4's "no parser was explicitly specified" warning and keeps
    # the parse independent of which optional parsers happen to be installed
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # The market value table lives inside the div with id 'yw1'; its data rows are classed 'odd'/'even'
    table = soup.find('div', id='yw1')
    if table is None:
        raise ValueError('No market value table (div id "yw1") found at {}'.format(URL))
    player_rows = table.find_all('tr', class_=['odd','even'])
    
    # Keyed by player name, so two rows with the same name on one page collapse into one entry
    # (the later row's value wins), exactly as before
    player_market_value = {}
    for player in player_rows:
        player_name = player.find('td', class_='hauptlink').text.strip()
        value_cell = player.find('td', class_=['rechts hauptlink','rechts hauptlink mwHoechstwertKarriere'])
        player_market_value[player_name] = value_cell.text.strip()
    
    # Create a dataframe for player names and market values
    market_values_df = pd.DataFrame(list(player_market_value.items()),
                                    columns=['Player Name','Current Market Value'])
    
    # Data cleaning: replace the displayed text with a numeric column
    market_values_df['Current Market Value (USD)'] = market_values_df['Current Market Value'].apply(_convert_market_value)
    
    return market_values_df.drop('Current Market Value', axis=1)
    

In [3]:
# Quick check of the scraper on the Premier League market value URL without a page number
market_value_table('https://www.transfermarkt.us/premier-league/marktwertaenderungen/wettbewerb/GB1')

Unnamed: 0,Player Name,Current Market Value (USD)
0,Jakub Moder,11000000.0
1,Michal Karbownik,6600000.0
2,Billy Arce,660000.0
3,Amad Diallo,16500000.0
4,Patrick Cutrone,13200000.0
5,Frederik Alves,440000.0
6,Winston Reid,1650000.0
7,Raheem Sterling,121000000.0
8,Heung-min Son,99000000.0
9,Bruno Fernandes,99000000.0


In [4]:
def concat_table_epl(num_pages=24):
    '''
    Concatenates EPL player market value tables that span multiple pages on the website

    input: num_pages - how many result pages to scrape (default 24, the count previously hard-coded)
    output: one dataframe with the rows of every scraped page in page order; each page keeps its own
            row index, so reset the index afterwards if unique row labels are needed
    raises: ValueError (from pd.concat) if num_pages is 0, because there is nothing to concatenate
    '''
    page_url = 'https://www.transfermarkt.us/premier-league/marktwertaenderungen/wettbewerb/GB1/page/{}'
    # NOTE(review): numbering starts at page/0, as in the original loop. If the site serves the same
    # rows for page/0 and page/1 the first page is scraped twice - confirm against the website.
    page_tables = [market_value_table(page_url.format(page)) for page in range(num_pages)]
        
    return pd.concat(page_tables)

In [5]:
# pmv is short for 'player market value'
# Scrape every EPL page, renumber the rows 0..n-1 and display the result
pmv_table_epl = concat_table_epl().reset_index(drop=True)
pmv_table_epl

Unnamed: 0,Player Name,Current Market Value (USD)
0,Jakub Moder,11000000.0
1,Michal Karbownik,6600000.0
2,Billy Arce,660000.0
3,Amad Diallo,16500000.0
4,Patrick Cutrone,13200000.0
...,...,...
583,Elia Caprile,110000.0
584,Sam Woods,83000.0
585,Lukas Jensen,28000.0
586,Jean-Philippe Mateta,16500000.0


In [6]:
def concat_table_laliga(num_pages=22):
    '''
    Concatenates La Liga player market value tables that span multiple pages on the website

    input: num_pages - how many result pages to scrape (default 22, the count previously hard-coded)
    output: one dataframe with the rows of every scraped page in page order; each page keeps its own
            row index, so reset the index afterwards if unique row labels are needed
    raises: ValueError (from pd.concat) if num_pages is 0, because there is nothing to concatenate
    '''
    page_url = 'https://www.transfermarkt.us/primera-division/marktwertaenderungen/wettbewerb/ES1/page/{}'
    # NOTE(review): numbering starts at page/0, as in the original loop. If the site serves the same
    # rows for page/0 and page/1 the first page is scraped twice - confirm against the website.
    page_tables = [market_value_table(page_url.format(page)) for page in range(num_pages)]
        
    return pd.concat(page_tables)

In [7]:
# Scrape every La Liga page, renumber the rows 0..n-1 and display the result
pmv_table_laliga = concat_table_laliga().reset_index(drop=True)
pmv_table_laliga

Unnamed: 0,Player Name,Current Market Value (USD)
0,Javi Navarro,220000.0
1,Óscar Mingueza,5500000.0
2,Konrad de la Fuente,2750000.0
3,Iñaki Peña,2750000.0
4,Gabriel Veiga,1100000.0
...,...,...
527,Pedro López,110000.0
528,Nino,110000.0
529,Johan Mojica,2750000.0
530,Aleix García,2200000.0


In [8]:
def concat_table_seriea(num_pages=25):
    '''
    Concatenates Serie A player market value tables that span multiple pages on the website

    input: num_pages - how many result pages to scrape (default 25, the count previously hard-coded)
    output: one dataframe with the rows of every scraped page in page order; each page keeps its own
            row index, so reset the index afterwards if unique row labels are needed
    raises: ValueError (from pd.concat) if num_pages is 0, because there is nothing to concatenate
    '''
    page_url = 'https://www.transfermarkt.us/serie-a/marktwertaenderungen/wettbewerb/IT1/page/{}'
    # NOTE(review): numbering starts at page/0, as in the original loop. If the site serves the same
    # rows for page/0 and page/1 the first page is scraped twice - confirm against the website.
    page_tables = [market_value_table(page_url.format(page)) for page in range(num_pages)]
        
    return pd.concat(page_tables)

In [9]:
# Scrape every Serie A page, renumber the rows 0..n-1 and display the result
pmv_table_seriea = concat_table_seriea().reset_index(drop=True)
pmv_table_seriea

Unnamed: 0,Player Name,Current Market Value (USD)
0,Kevin Strootman,5500000.0
1,Joakim Maehle,12100000.0
2,Romelu Lukaku,99000000.0
3,Matthijs de Ligt,82500000.0
4,Sergej Milinkovic-Savic,77000000.0
...,...,...
614,Mario Mandzukic,2750000.0
615,Adama Soumaoro,5500000.0
616,Daniel Bessa,2970000.0
617,Ernesto Torregrossa,4950000.0


In [10]:
def concat_table_bundesliga(num_pages=22):
    '''
    Concatenates Bundesliga player market value tables that span multiple pages on the website

    input: num_pages - how many result pages to scrape (default 22, the count previously hard-coded)
    output: one dataframe with the rows of every scraped page in page order; each page keeps its own
            row index, so reset the index afterwards if unique row labels are needed
    raises: ValueError (from pd.concat) if num_pages is 0, because there is nothing to concatenate
    '''
    page_url = 'https://www.transfermarkt.us/1-bundesliga/marktwertaenderungen/wettbewerb/L1/page/{}'
    # NOTE(review): numbering starts at page/0, as in the original loop. If the site serves the same
    # rows for page/0 and page/1 the first page is scraped twice - confirm against the website.
    page_tables = [market_value_table(page_url.format(page)) for page in range(num_pages)]
        
    return pd.concat(page_tables)

In [11]:
# Scrape every Bundesliga page, renumber the rows 0..n-1 and display the result
pmv_table_bundesliga = concat_table_bundesliga().reset_index(drop=True)
pmv_table_bundesliga

Unnamed: 0,Player Name,Current Market Value (USD)
0,Chris Richards,5500000.0
1,Tiago Dantas,4400000.0
2,Ron-Thorben Hoffmann,550000.0
3,Angelo Stiller,2750000.0
4,Bright Arrey-Mbi,1100000.0
...,...,...
543,Fabian Bredlow,440000.0
544,Luca Unbehaun,385000.0
545,Fabrice Hartmann,385000.0
546,Philipp Tschauner,358000.0


In [12]:
def concat_table_ligue1(num_pages=24):
    '''
    Concatenates Ligue 1 player market value tables that span multiple pages on the website

    input: num_pages - how many result pages to scrape (default 24, the count previously hard-coded)
    output: one dataframe with the rows of every scraped page in page order; each page keeps its own
            row index, so reset the index afterwards if unique row labels are needed
    raises: ValueError (from pd.concat) if num_pages is 0, because there is nothing to concatenate
    '''
    page_url = 'https://www.transfermarkt.us/ligue-1/marktwertaenderungen/wettbewerb/FR1/page/{}'
    # NOTE(review): numbering starts at page/0, as in the original loop. If the site serves the same
    # rows for page/0 and page/1 the first page is scraped twice - confirm against the website.
    page_tables = [market_value_table(page_url.format(page)) for page in range(num_pages)]
        
    return pd.concat(page_tables)

In [13]:
# Scrape every Ligue 1 page, renumber the rows 0..n-1 and display the result
pmv_table_ligue1 = concat_table_ligue1().reset_index(drop=True)
pmv_table_ligue1

Unnamed: 0,Player Name,Current Market Value (USD)
0,Eduardo Camavinga,66000000.0
1,Presnel Kimpembe,44000000.0
2,Moise Kean,33000000.0
3,Renato Sanches,30800000.0
4,Jonathan Ikoné,30800000.0
...,...,...
583,Charly Jan,165000.0
584,Yanis Saidani,165000.0
585,Hilton,110000.0
586,David Kong,110000.0


In [14]:
# Stack the five league tables into one frame; ignore_index=True numbers the rows 0..n-1
pmv_table = pd.concat(
    [pmv_table_epl, pmv_table_laliga, pmv_table_seriea, pmv_table_bundesliga, pmv_table_ligue1],
    ignore_index=True,
)

In [15]:
# Keep only the first row for each player name.
# NOTE(review): names are the only key, so two different players who share a name are merged too.
pmv_table = pmv_table.drop_duplicates(subset='Player Name')

In [16]:
# Renumber the rows 0..n-1 and discard the old index labels
pmv_table = pmv_table.reset_index(drop=True)

In [17]:
# Display the combined player market value table
pmv_table

Unnamed: 0,Player Name,Current Market Value (USD)
0,Jakub Moder,11000000.0
1,Michal Karbownik,6600000.0
2,Billy Arce,660000.0
3,Amad Diallo,16500000.0
4,Patrick Cutrone,13200000.0
...,...,...
2672,Charly Jan,165000.0
2673,Yanis Saidani,165000.0
2674,Hilton,110000.0
2675,David Kong,110000.0


In [20]:
# Write the table to CSV; index=False leaves the row index out of the file.
# NOTE(review): the relative path assumes a ../Data directory exists relative to the working directory,
# and this cell's execution count (20) skips 18-19 - confirm the notebook runs cleanly top to bottom.
pmv_table.to_csv('../Data/pmv_table.csv', index=False)