### Predicting transfer value of football players using ML (statistics and additional attributes)

In [205]:
# Importing relevant libaries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import YouTubeVideo
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup, NavigableString, Tag


In [209]:
years = ['2018-2019','2019-2020']
leagues_test = ['spanish-la-liga','italian-serie-a']
leagues = ['spanish-la-liga','italian-serie-a','german-bundesliga','english-premier-league','rest-of-europe']

In [210]:
def scrape(driver,id):
    '''
    Sets up a function for scraping data based on a given driver

    args: driver (defines which driver to scrape from)
    '''

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table_node = soup.find(id=id)

    columns_html = table_node.thead.find_all('th')
        # Extract the text
    columns = []
    for col in columns_html:
        columns.append(col.text)
    columns = columns[7:]

    rows_list = table_node.tbody.find_all('tr')

    data = []
    fejl = []
    for row_node in rows_list:
        # if (rows_list.index(row_node) % 25 == 0):
        try:
            row = []
            for child in row_node.children:
                row.append(child.text)
            data.append(row)
        except:
            print(rows_list.index(row_node))
            fejl.append(rows_list.index(row_node)) # Column appear every 25th row
    
    return columns, data

def clean_statistics(columns,data,period):
    df_out = pd.DataFrame(data,columns=columns)\
        .iloc[:,:-1]\
        .assign(Nation= lambda x: x['Nation'].str.split(' ').str[1],
            Pos = lambda x: x['Pos'].str.split(',').str[0], 
            Comp = lambda x: x['Comp'].str.split(None,1).str[1],
            # Pos = lambda x: pd.Categorical(x['Pos']),
            Squad = lambda x: pd.Categorical(x['Squad']),
            Year = period[0:4])
            # Nation = lambda x: pd.Categorical(x['Nation']))

    return df_out

In [211]:
def get_data(periods):
    df_list = []
    id = 'stats_standard'
    for period in periods:

        url = f'https://fbref.com/en/comps/Big5/{period}/stats/players/{period}-Big-5-European-Leagues-Stats'
        driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
        driver.get(url)
        time.sleep(3)
        cookie = driver.find_element(By.CLASS_NAME,'qc-cmp2-summary-buttons').click()

        columns, data = scrape(driver,id)

        df_list.append(clean_statistics(columns, data, period))

        # time.sleep(10)
        
    main_df = pd.concat(df_list)
    print(f'Number of observations in main dataset is: {len(main_df)}')
    return main_df

#Call function
main_df = get_data(years)

  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


25
51
77
103
129
155
181
207
233
259
285
311
337
363
389
415
441
467
493
519
545
571
597
623
649
675
701
727
753
779
805
831
857
883
909
935
961
987
1013
1039
1065
1091
1117
1143
1169
1195
1221
1247
1273
1299
1325
1351
1377
1403
1429
1455
1481
1507
1533
1559
1585
1611
1637
1663
1689
1715
1741
1767
1793
1819
1845
1871
1897
1923
1949
1975
2001
2027
2053
2079
2105
2131
2157
2183
2209
2235
2261
2287
2313
2339
2365
2391
2417
2443
2469
2495
2521
2547
2573
2599
2625
2651
2677
2703
2729
2755
25
51
77
103
129
155
181
207
233
259
285
311
337
363
389
415
441
467
493
519
545
571
597
623
649
675
701
727
753
779
805
831
857
883
909
935
961
987
1013
1039
1065
1091
1117
1143
1169
1195
1221
1247
1273
1299
1325
1351
1377
1403
1429
1455
1481
1507
1533
1559
1585
1611
1637
1663
1689
1715
1741
1767
1793
1819
1845
1871
1897
1923
1949
1975
2001
2027
2053
2079
2105
2131
2157
2183
2209
2235
2261
2287
2313
2339
2365
2391
2417
2443
2469
2495
2521
2547
2573
2599
2625
2651
2677
2703
2729
2755
2781
2807
2833
Number 

In [212]:
def scrape_transferprice(driver,id):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table_node = soup.find(id=id)

    columns_html = table_node.thead.find_all('th')
        # Extract the text
    transfer_columns = []
    for col in columns_html:
        transfer_columns.append(col.text)


    rows_list = table_node.tbody.find_all('tr')

    transfer_data = []
    for row_node in rows_list:
        row = []
        for child in row_node.children:
            try:
                row.append(child.text)
            except:
                pass
        transfer_data.append(row)

def clean_transfer(columns,data,period):    
    transfer_df = pd.DataFrame(data, columns=columns)\
                .assign(Price = lambda x: x['Price'].str.split('(?:(.*\d))?(?:([a-zA-Z]+))?').str[1],   
                    When = lambda x: x['When'].str.split(' ').str[0],
                    Player = lambda x: x['Player'].str.rsplit(' ').str[:-1].apply(' '.join),
                    Year = period[0:4])\
                .query("When != 'Jan'")\
                                    

In [215]:
def get_transferprice(league,years):
    id = 'epl'
    df_transferprice = []
    for league in leagues:
        for periods in years:
            url = f'https://www.soccernews.com/soccer-transfers/{league}-transfers-{periods}/'
            driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
            driver.get(url)
            time.sleep(3)
            cookie = driver.find_element(By.ID,'cn-accept-cookie').click()
            
            # Get data
            columns_transfer, transfer_data = scrape_transferprice(driver,id)

            df_transferprice.append(clean_transfer(columns_transfer,transfer_data,years))
    
    return df_transferprice
#     main_transfer = pd.concat(df_transferprice)
#     print(f'Number of observations in main dataset is: {len(main_transfer)}')

#     return main_transfer

# main_transfer_df = get_transferprice(leagues_test,years)

In [216]:
df_transferprice = get_transferprice(leagues_test,years)

  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


TypeError: cannot unpack non-iterable NoneType object

In [217]:
id = 'epl'
df_transferprice = []
url = 'https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers-2018-2019/'
driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
driver.get(url)
time.sleep(3)
cookie = driver.find_element(By.ID,'cn-accept-cookie').click()

# Get data
columns_transfer, transfer_data = scrape_transferprice(driver,id)

df_transferprice.append(clean_transfer(columns_transfer,transfer_data,years))


  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


TypeError: cannot unpack non-iterable NoneType object

In [243]:
df_transferprice = []
id = 'epl'
league = ['english-premier-league']
years = ['2018-2019','2019-2020']
for league in leagues:

    df_transferprice_ = []

    for periods in years:
        url = f'https://www.soccernews.com/soccer-transfers/{league}-transfers-{periods}/'
        driver = webdriver.Chrome(ChromeDriverManager().install()) #driver
        driver.get(url)
        time.sleep(3)
        cookie = driver.find_element(By.ID,'cn-accept-cookie').click()

        soup = BeautifulSoup(driver.page_source, 'lxml')
        table_node = soup.find(id=id)

        columns_html = table_node.thead.find_all('th')
            # Extract the text
        transfer_columns = []
        for col in columns_html:
            transfer_columns.append(col.text)


        rows_list = table_node.tbody.find_all('tr')

        transfer_data = []
        for row_node in rows_list:
            row = []
            for child in row_node.children:
                try:
                    row.append(child.text)
                except:
                    pass
            transfer_data.append(row)

            
        transfer_df = pd.DataFrame(transfer_data, columns=transfer_columns)\
                    .assign(Price = lambda x: x['Price'].str.split('(?:(.*\d))?(?:([a-zA-Z]+))?').str[1],   
                        When = lambda x: x['When'].str.split(' ').str[0],
                        Player = lambda x: x['Player'].str.rsplit(' ').str[:-1].apply(' '.join))\
                    .query("When != 'Jan'")
        
        df_transferprice_.append(transfer_df)
    
    main = pd.concat(df_transferprice_)

    df_transferprice.append(main)

main_main = pd.concat(df_transferprice)

  driver = webdriver.Chrome(ChromeDriverManager().install()) #driver


In [245]:
print(len(main_main))

1232


In [241]:
test_df = pd.concat(df_transferprice)
test_df.to_csv('test123.csv')

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

In [239]:
print(len(test_df))

642
