## Metis Project 2 - Predicting Market Values of Soccer Players in the Top 5 European Leagues Using Linear Regression

### Creating a Dataframe of Player Market Values (Target Variable)

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time, os
import requests

In [2]:
# A number (optionally with a decimal part) followed by a unit suffix at the end
# of the text, e.g. "$16.50m" or "$660Th.".
_MARKET_VALUE_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*(bn|m|Th\.|k)$')

# Multiplier applied to the number for each unit suffix.
# 'm' and 'Th.' are the suffixes the original code handled; 'k' and 'bn' are
# accepted as well so that such values are not silently turned into NaN.
_MARKET_VALUE_MULTIPLIERS = {'bn': 1000000000, 'm': 1000000, 'Th.': 1000, 'k': 1000}


def _convert_market_value(value):
    '''
    Converts one market value string into a number.

    input: market value text such as "$16.50m" or "$660Th."
    output: the value as a float (16500000.0, 660000.0, ...), or NaN when the
            text does not end in a number followed by a known unit suffix
    '''
    match = _MARKET_VALUE_PATTERN.search(value)
    if match is None:
        return np.nan
    amount, unit = match.groups()
    return float(amount) * _MARKET_VALUE_MULTIPLIERS[unit]


def market_value_table(URL):
    '''
    Scrapes the player market value table found on a single transfermarkt.us page.

    input: URL of a transfermarkt.us page containing a player market value table
    output: dataframe with one row per table row and the columns
            'Player Name' and 'Current Market Value (USD)' (float, NaN when the
            value text could not be parsed)
    raises: requests.HTTPError when the server answers with an error status,
            ValueError when the page holds no market value table
    '''
    # Browser-like User-Agent sent with the request for web scraping
    headers = {'User-Agent':
               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

    # A timeout keeps a stalled connection from hanging the whole scrape, and
    # raise_for_status stops an error page from being parsed as if it were data.
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which parser
    # libraries happen to be installed on the machine.
    soup = BeautifulSoup(page.content, 'html.parser')

    table_container = soup.find('div', id='yw1')
    if table_container is None:
        raise ValueError('No market value table (div id="yw1") found at {}'.format(URL))

    # Collect (name, value) pairs in a list rather than a dict keyed by name, so
    # two players sharing a name on the same page do not overwrite each other.
    records = []
    for player in table_container.find_all('tr', class_=['odd', 'even']):
        player_name = player.find('td', class_='hauptlink').text.strip()
        raw_value = player.find(
            'td', class_=['rechts hauptlink', 'rechts hauptlink mwHoechstwertKarriere']
        ).text.strip()
        records.append((player_name, _convert_market_value(raw_value)))

    return pd.DataFrame(records, columns=['Player Name', 'Current Market Value (USD)'])
    

In [3]:
# Try the scraper on a single Premier League page and display the resulting table
market_value_table('https://www.transfermarkt.us/premier-league/marktwertaenderungen/wettbewerb/GB1')

Unnamed: 0,Player Name,Current Market Value (USD)
0,Billy Arce,660000.0
1,Amad Diallo,16500000.0
2,Patrick Cutrone,13200000.0
3,Frederik Alves,440000.0
4,Filip Stevanovic,8800000.0
5,Winston Reid,1650000.0
6,Raheem Sterling,121000000.0
7,Heung-min Son,99000000.0
8,Bruno Fernandes,99000000.0
9,Bernardo Silva,77000000.0


In [4]:
def concat_table_epl(num_pages=24):
    '''
    Concatenates the EPL player market value tables that span multiple pages on the website

    input: num_pages - how many paginated tables to scrape (default 24, the
           count that used to be hard-coded); must be at least 1 because
           pd.concat raises ValueError on an empty list
    output: one dataframe with the rows of every scraped page stacked in page
            order; the index is not renumbered here
    '''
    base_url = 'https://www.transfermarkt.us/premier-league/marktwertaenderungen/wettbewerb/GB1/page/{}'
    # NOTE(review): pages are requested as page/0 .. page/(num_pages - 1), as before.
    # Confirm the site does not serve page/0 as a copy of page/1.
    page_tables = [market_value_table(base_url.format(page)) for page in range(num_pages)]
    return pd.concat(page_tables)

In [5]:
# pmv short for 'player market value'
# Scrape every EPL page and renumber the rows so the index runs 0..n-1
pmv_table_epl = concat_table_epl().reset_index(drop=True)
pmv_table_epl

Unnamed: 0,Player Name,Current Market Value (USD)
0,Billy Arce,660000.0
1,Amad Diallo,16500000.0
2,Patrick Cutrone,13200000.0
3,Frederik Alves,440000.0
4,Filip Stevanovic,8800000.0
...,...,...
572,Elia Caprile,110000.0
573,Sam Woods,83000.0
574,Lukas Jensen,28000.0
575,Jakub Moder,3300000.0


In [6]:
def concat_table_laliga(num_pages=22):
    '''
    Concatenates the La Liga player market value tables that span multiple pages on the website

    input: num_pages - how many paginated tables to scrape (default 22, the
           count that used to be hard-coded); must be at least 1 because
           pd.concat raises ValueError on an empty list
    output: one dataframe with the rows of every scraped page stacked in page
            order; the index is not renumbered here
    '''
    base_url = 'https://www.transfermarkt.us/primera-division/marktwertaenderungen/wettbewerb/ES1/page/{}'
    # NOTE(review): pages are requested as page/0 .. page/(num_pages - 1), as before.
    # Confirm the site does not serve page/0 as a copy of page/1.
    page_tables = [market_value_table(base_url.format(page)) for page in range(num_pages)]
    return pd.concat(page_tables)

In [7]:
# Scrape every La Liga page and renumber the rows so the index runs 0..n-1
pmv_table_laliga = concat_table_laliga().reset_index(drop=True)
pmv_table_laliga

Unnamed: 0,Player Name,Current Market Value (USD)
0,Óscar Mingueza,5500000.0
1,Konrad de la Fuente,2750000.0
2,Iñaki Peña,2750000.0
3,Arnau Tenas,1650000.0
4,Gabriel Veiga,1100000.0
...,...,...
530,Mikel Rico,330000.0
531,Luisinho,220000.0
532,Cristian Rivero,220000.0
533,Pedro López,110000.0


In [8]:
def concat_table_seriea(num_pages=25):
    '''
    Concatenates the Serie A player market value tables that span multiple pages on the website

    input: num_pages - how many paginated tables to scrape (default 25, the
           count that used to be hard-coded); must be at least 1 because
           pd.concat raises ValueError on an empty list
    output: one dataframe with the rows of every scraped page stacked in page
            order; the index is not renumbered here
    '''
    base_url = 'https://www.transfermarkt.us/serie-a/marktwertaenderungen/wettbewerb/IT1/page/{}'
    # NOTE(review): pages are requested as page/0 .. page/(num_pages - 1), as before.
    # Confirm the site does not serve page/0 as a copy of page/1.
    page_tables = [market_value_table(base_url.format(page)) for page in range(num_pages)]
    return pd.concat(page_tables)

In [9]:
# Scrape every Serie A page and renumber the rows so the index runs 0..n-1
pmv_table_seriea = concat_table_seriea().reset_index(drop=True)
pmv_table_seriea

Unnamed: 0,Player Name,Current Market Value (USD)
0,Joakim Maehle,12100000.0
1,Romelu Lukaku,99000000.0
2,Matthijs de Ligt,82500000.0
3,Paulo Dybala,77000000.0
4,Lautaro Martínez,77000000.0
...,...,...
602,Maximiliano Olivera,1650000.0
603,Adama Soumaoro,5500000.0
604,Daniel Bessa,2970000.0
605,Ernesto Torregrossa,4950000.0


In [10]:
def concat_table_bundesliga(num_pages=22):
    '''
    Concatenates the Bundesliga player market value tables that span multiple pages on the website

    input: num_pages - how many paginated tables to scrape (default 22, the
           count that used to be hard-coded); must be at least 1 because
           pd.concat raises ValueError on an empty list
    output: one dataframe with the rows of every scraped page stacked in page
            order; the index is not renumbered here
    '''
    base_url = 'https://www.transfermarkt.us/1-bundesliga/marktwertaenderungen/wettbewerb/L1/page/{}'
    # NOTE(review): pages are requested as page/0 .. page/(num_pages - 1), as before.
    # Confirm the site does not serve page/0 as a copy of page/1.
    page_tables = [market_value_table(base_url.format(page)) for page in range(num_pages)]
    return pd.concat(page_tables)

In [11]:
# Scrape every Bundesliga page and renumber the rows so the index runs 0..n-1
pmv_table_bundesliga = concat_table_bundesliga().reset_index(drop=True)
pmv_table_bundesliga

Unnamed: 0,Player Name,Current Market Value (USD)
0,Tim Civeja,330000.0
1,Dion Berisha,330000.0
2,Seong-hoon Cheon,275000.0
3,Benjamin Leneis,220000.0
4,Ansgar Knauff,550000.0
...,...,...
525,Fabrice Hartmann,385000.0
526,Luca Unbehaun,385000.0
527,Marijan Cavar,385000.0
528,Torben Müsel,358000.0


In [12]:
def concat_table_ligue1(num_pages=24):
    '''
    Concatenates the Ligue 1 player market value tables that span multiple pages on the website

    input: num_pages - how many paginated tables to scrape (default 24, the
           count that used to be hard-coded); must be at least 1 because
           pd.concat raises ValueError on an empty list
    output: one dataframe with the rows of every scraped page stacked in page
            order; the index is not renumbered here
    '''
    base_url = 'https://www.transfermarkt.us/ligue-1/marktwertaenderungen/wettbewerb/FR1/page/{}'
    # NOTE(review): pages are requested as page/0 .. page/(num_pages - 1), as before.
    # Confirm the site does not serve page/0 as a copy of page/1.
    page_tables = [market_value_table(base_url.format(page)) for page in range(num_pages)]
    return pd.concat(page_tables)

In [13]:
# Scrape every Ligue 1 page and renumber the rows so the index runs 0..n-1
pmv_table_ligue1 = concat_table_ligue1().reset_index(drop=True)
pmv_table_ligue1

Unnamed: 0,Player Name,Current Market Value (USD)
0,Eduardo Camavinga,66000000.0
1,Presnel Kimpembe,44000000.0
2,Moussa Dembélé,33000000.0
3,Moise Kean,33000000.0
4,Renato Sanches,30800000.0
...,...,...
585,Eiji Kawashima,165000.0
586,Dialy Ndiaye,165000.0
587,Hilton,110000.0
588,David Kong,110000.0


In [23]:
# Stack the five league tables into one dataframe with a fresh 0..n-1 index
pmv_table = pd.concat(
    [pmv_table_epl, pmv_table_laliga, pmv_table_seriea, pmv_table_bundesliga, pmv_table_ligue1],
    ignore_index=True,
)

In [25]:
# Keep only the first row for each player name (drop_duplicates keeps the first occurrence by default).
# NOTE(review): different players who share a name are collapsed into one row here - confirm that is acceptable.
pmv_table.drop_duplicates(subset='Player Name', inplace=True)

In [27]:
# Renumber the rows 0..n-1 in place, discarding the old index
pmv_table.reset_index(drop=True, inplace=True)

In [28]:
pmv_table

Unnamed: 0,Player Name,Current Market Value (USD)
0,Billy Arce,660000.0
1,Amad Diallo,16500000.0
2,Patrick Cutrone,13200000.0
3,Frederik Alves,440000.0
4,Filip Stevanovic,8800000.0
...,...,...
2701,Eiji Kawashima,165000.0
2702,Dialy Ndiaye,165000.0
2703,Hilton,110000.0
2704,David Kong,110000.0


In [1]:
# Save the combined table as CSV without the row index.
# This cell needs pmv_table from the cells above; the recorded NameError comes from
# running it as In [1], i.e. before pmv_table existed in the kernel session.
# NOTE(review): '/Data/pmv_table.csv' is an absolute path from the filesystem root -
# confirm it is not meant to be the relative 'Data/pmv_table.csv'.
pmv_table.to_csv('/Data/pmv_table.csv', index=False)

NameError: name 'pmv_table' is not defined