### Predicting transfer value of football players using ML (statistics and additional attributes)

In [1]:
# Importing relevant libaries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import YouTubeVideo
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup, NavigableString, Tag
import os




In [2]:
#Specify relevant seasons and leagues
years = ['2017-2018','2018-2019','2019-2020','2019-2020','2021-2022']
leagues = ['spanish-la-liga','italian-serie-a','german-bundesliga','english-premier-league','rest-of-europe']

#Test
years_test = ['2017-2018','2018-2019']
leagues_test = ['english-premier-league']

In [7]:
def scrape(driver,id):
    '''
    Sets up a function for scraping data based on a given driver

    args: driver (defines which driver to scrape from)
    '''

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table_node = soup.find(id=id)

    columns_html = table_node.thead.find_all('th')
        # Extract the text
    columns = []
    for col in columns_html:
        columns.append(col.text)
    columns = columns[7:]

    rows_list = table_node.tbody.find_all('tr')

    data = []
    for row_node in rows_list:
        # if (rows_list.index(row_node) % 25 == 0):
        try:
            row = []
            for child in row_node.children:
                row.append(child.text)
            data.append(row)
        except:
            pass
    
    return columns, data

def clean_statistics(columns,data,period):
    df_out = pd.DataFrame(data,columns=columns)\
        .iloc[:,:-1]\
        .assign(Nation= lambda x: x['Nation'].str.split(' ').str[1],
            Pos = lambda x: x['Pos'].str.split(',').str[0], 
            Comp = lambda x: x['Comp'].str.split(None,1).str[1],
            # Pos = lambda x: pd.Categorical(x['Pos']),
            Squad = lambda x: pd.Categorical(x['Squad']),
            Year = period[0:4])
            # Nation = lambda x: pd.Categorical(x['Nation']))

    return df_out

In [9]:
def get_data(periods):
        
    df_list = []
    id = 'stats_standard'
    for period in periods:
        url = f'https://fbref.com/en/comps/Big5/{period}/stats/players/{period}-Big-5-European-Leagues-Stats'
        driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
        driver.get(url)
        time.sleep(3)
        cookie = driver.find_element(By.CLASS_NAME,'qc-cmp2-summary-buttons').click()

        columns, data = scrape(driver,id)

        df_list.append(clean_statistics(columns, data, period))

        # time.sleep(10)
        
    main_df = pd.concat(df_list)
    print(f'Number of observations in main dataset is: {len(main_df)}')
    return main_df

#Call function
stats_df = get_data(years_test)
stats_df.to_csv('Stats.csv')


  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


Number of observations in main dataset is: 5347


In [31]:
def scrape_transferprice(driver,id):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table_node = soup.find(id=id)

    columns_html = table_node.thead.find_all('th')
        # Extract the text
    transfer_columns = []
    for col in columns_html:
        transfer_columns.append(col.text)

    rows_list = table_node.tbody.find_all('tr')

    transfer_data = []
    for row_node in rows_list:
        row = []
        for child in row_node.children:
            try:
                row.append(child.text)
            except:
                pass
        transfer_data.append(row)
    
    return transfer_columns, transfer_data

def clean_transfer(columns,data,period):    
    transfer_df = pd.DataFrame(data, columns=columns)\
                .assign(When = lambda x: x['When'].str.split(' ').str[0],
                    Price = lambda x: x['Price'].str.split('(?:(.*\d))?(?:([a-zA-Z]+))?').str[1],
                    Player = lambda x: x['Player'].str.rsplit(' ').str[:-1].apply(' '.join),
                    Year = int(period[0:4])-1)\
                .query("When != 'Jan'")\
                .replace(to_replace='None', value=np.nan).dropna()

    transfer_df.drop(transfer_df.columns[[0,1,4]],axis=1, inplace=True)


    return transfer_df                         

In [32]:
def get_transferprice(leagues,years):
    id = 'epl'
    df_transferprice = []
    for league in leagues:

        df_transferprice_ = []
        
        for periods in years:
            url = f'https://www.soccernews.com/soccer-transfers/{league}-transfers-{periods}/'
            driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
            driver.get(url)
            time.sleep(3)
            cookie = driver.find_element(By.ID,'cn-accept-cookie').click()
            
            # Get data
            columns_transfer, transfer_data = scrape_transferprice(driver,id)

            df_transferprice_.append(clean_transfer(columns_transfer,transfer_data,periods))
    
        df_transferprice.append(pd.concat(df_transferprice_))

    return pd.concat(df_transferprice)
#     main_transfer = pd.concat(df_transferprice)
#     print(f'Number of observations in main dataset is: {len(main_transfer)}')

#     return main_transfer

# main_transfer_df = get_transferprice(leagues_test,years)

In [33]:
df_transferprice = get_transferprice(leagues_test,years_test)

  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


In [34]:
print(df_transferprice)

                  Player                 From              To Price  Year
43           Nahki Wells         Huddersfield         Burnley   5.4  2016
44    Ezequiel Schelotto          Sporting CP        Brighton     3  2016
46      Marvin Zeegelaar          Sporting CP         Watford     3  2016
48         Nikola Vlasic         Hajduk Split         Everton   8.6  2016
50          Serge Aurier                  PSG       Tottenham    25  2016
..                   ...                  ...             ...   ...   ...
138              Fabinho            AS Monaco       Liverpool    45  2017
139        Florin Andone  Deportivo La Coruna        Brighton     6  2017
141      Ricardo Pereira             FC Porto  Leicester City    22  2017
142  Florent Hadergjonaj           Ingolstadt    Huddersfield     5  2017
143          Jonas Lossl                Mainz    Huddersfield  2.25  2017

[138 rows x 5 columns]


### Trying to merge transfer data with statistics

### **Hvis du har mod på det må du meget gerne kigge på dette, Carl Philip**

De to datasæt skal merges så vi ender med et samlet datasæt med stats og transferpris

In [35]:
print(stats_df['Player'] == 'Fabinho')

0       False
1       False
2       False
3       False
4       False
        ...  
2651    False
2652    False
2653    False
2654    False
2655    False
Name: Player, Length: 5347, dtype: bool


In [40]:
test_stats_merge = stats_df.loc[stats_df['Player'] == 'Fabinho']

In [41]:
test_stats_merge

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1,Year
778,779,Fabinho,BRA,MF,Monaco,Ligue 1,23,1993,34,34,...,5.4,2.4,3.8,6.2,0.16,0.11,0.27,0.07,0.18,2017
765,766,Fabinho,BRA,MF,Liverpool,Premier League,24,1993,28,21,...,0.8,0.8,1.3,2.1,0.04,0.06,0.09,0.04,0.09,2018


In [42]:
test_transfer_merge = df_transferprice.loc[df_transferprice['Player'] == 'Fabinho']

In [46]:
test_transfer_merge = test_transfer_merge.iloc[0]

In [48]:
new_df = test_stats_merge.merge(test_transfer_merge, how='left', left_on=['Player','Year'], right_on=['Price'])

ValueError: len(right_on) must equal len(left_on)