# Web Scrape the ESPN Soccer teams
## Author: Pedro Sanhueza
## Website: [www.foxsports.com](https://www.foxsports.com/soccer/2022-fifa-world-cup/teams)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

In [2]:
url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]

In [4]:
# running time: 1m 30s

rows=[]
for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            rows.append(row)

In [5]:
data = pd.DataFrame(rows)

date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = f"../FIFA - Output/{url_main.split('/')[-2]} {date} - Webscrape.csv"

data.to_csv(filePath, index=False) # save to file path

pd.set_option('display.max_rows', 100)

data

Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,G,35,"6'2""",194 lbs
1,Argentina,Emiliano Martinez,G,30,"6'4""",196 lbs
2,Argentina,Geronimo Rulli,G,30,"6'2""",185 lbs
3,Argentina,Marcos Acuna,D,30,"5'7""",152 lbs
4,Argentina,Juan Foyth,D,24,"5'10""",152 lbs
...,...,...,...,...,...,...
1378,Wales,Rubin Colwill,F,20,-,-
1379,Wales,Mark Thomas Harris,F,23,-,-
1380,Wales,Kieffer Moore,F,30,"6'5""",182 lbs
1381,Wales,Tyler Roberts,F,23,"5'10""",165 lbs


In [6]:
# Clean table

# TO DO:
# drop duplicated players within a country (ex: Argentina 'Agustin Rossi')

# build dataframe
data = pd.DataFrame(rows)

# drop rows with missing data
data = data[~data.isin(['-']).any(axis=1)]

# replace possition values
# POS_values = {x.text.strip().capitalize() for x in table.find_all('th',{'data-index':'0'})}
# POS_keys = set(data[title[1]].to_list())
# POS_mapped = dict(zip(POS_keys, POS_values))
POS_mapped = {'G': 'Goalkeeper', 'D': 'Defender', 'M': 'Midfielder', 'F': 'Forward'}
data.replace({title[1]: POS_mapped}, inplace=True)

# age column to number
data[title[2]] = pd.to_numeric(data[title[2]])

# hight column to number
data[title[3]] = data[title[3]].apply(lambda x: (int(x.split('\'')[0])*12 + int(x.split('\'')[1].replace('\"',''))) * 2.54) # inches to centimeters

# weight column to number
data[title[4]] = data[title[4]].apply(lambda x: round(int(x.split(' ')[0]) / 2.205,1)) # lbs to kg

data


Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,Goalkeeper,35,187.96,88.0
1,Argentina,Emiliano Martinez,Goalkeeper,30,193.04,88.9
2,Argentina,Geronimo Rulli,Goalkeeper,30,187.96,83.9
3,Argentina,Marcos Acuna,Defender,30,170.18,68.9
4,Argentina,Juan Foyth,Defender,24,177.80,68.9
...,...,...,...,...,...,...
1375,Wales,Jonathan Williams,Midfielder,29,167.64,59.9
1376,Wales,Gareth Bale,Forward,33,182.88,81.6
1377,Wales,Wes Burns,Forward,27,172.72,67.6
1380,Wales,Kieffer Moore,Forward,30,195.58,82.5


In [7]:
# Average per possition
data.groupby('POS').mean().round(1)

  data.groupby('POS').mean().round(1)


Unnamed: 0_level_0,AGE,HT,WT
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Defender,27.6,181.5,76.1
Forward,27.4,179.1,74.7
Goalkeeper,29.9,188.1,83.1
Midfielder,26.8,177.2,71.5


In [8]:
data.groupby('Country').mean().round(1)

  data.groupby('Country').mean().round(1)


Unnamed: 0_level_0,AGE,HT,WT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,27.2,177.8,74.1
Australia,29.2,180.0,75.9
Belgium,29.1,183.0,76.5
Brazil,27.7,179.9,74.7
Cameroon,28.4,181.4,78.8
Canada,26.4,180.6,75.5
Costa rica,30.6,180.0,75.3
Croatia,27.1,183.3,78.3
Denmark,27.2,183.6,77.6
Ecuador,26.1,176.6,71.6


In [9]:
# which country has the maximun average hight in their golee? 

data1 = data.groupby(['Country','POS']).mean().round(1).reset_index()

data1 = data1[data1.POS == 'Goalkeeper']

# data1[data1.HT == data1.HT.max()]

data1

  data1 = data.groupby(['Country','POS']).mean().round(1).reset_index()


Unnamed: 0,Country,POS,AGE,HT,WT
2,Argentina,Goalkeeper,31.7,189.7,86.9
6,Australia,Goalkeeper,32.6,186.9,83.9
10,Belgium,Goalkeeper,31.3,194.7,86.6
14,Brazil,Goalkeeper,31.0,188.8,88.4
18,Cameroon,Goalkeeper,30.2,184.8,86.2
22,Canada,Goalkeeper,28.0,188.0,80.0
26,Costa rica,Goalkeeper,33.3,183.7,76.8
30,Croatia,Goalkeeper,27.2,193.7,87.5
34,Denmark,Goalkeeper,29.3,188.0,83.7
38,Ecuador,Goalkeeper,35.0,190.5,85.7


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]


# ---------------------------------------- running time: 1m 20s ---------------------------------------- #

rows=[]

for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            rows.append(row)

data = pd.DataFrame(rows)

data = data[~data.isin(['-']).any(axis=1)] # drop rows with missing data

POS_mapped = {'G': 'Goalkeeper', 'D': 'Defender', 'M': 'Midfielder', 'F': 'Forward'}

data.replace({title[1]: POS_mapped}, inplace=True)

data[title[2]] = pd.to_numeric(data[title[2]]) # age column to number

data[title[3]] = data[title[3]].apply(lambda x: (int(x.split('\'')[0])*12 + int(x.split('\'')[1].replace('\"',''))) * 2.54) # inches to centimeters

# weight column to number
data[title[4]] = data[title[4]].apply(lambda x: round(int(x.split(' ')[0]) / 2.205,1)) # lbs to kg

# add BMI column
data['BMI'] = data.apply(lambda x: round(x.WT / (x.HT/100)**2,1) , axis=1)

data

Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,Goalkeeper,35,187.96,88.0
1,Argentina,Emiliano Martinez,Goalkeeper,30,193.04,88.9
2,Argentina,Geronimo Rulli,Goalkeeper,30,187.96,83.9
3,Argentina,Marcos Acuna,Defender,30,170.18,68.9
4,Argentina,Juan Foyth,Defender,24,177.80,68.9
...,...,...,...,...,...,...
1375,Wales,Jonathan Williams,Midfielder,29,167.64,59.9
1376,Wales,Gareth Bale,Forward,33,182.88,81.6
1377,Wales,Wes Burns,Forward,27,172.72,67.6
1380,Wales,Kieffer Moore,Forward,30,195.58,82.5


In [30]:
data.columns

Index(['Country', 'Name', 'POS', 'AGE', 'HT', 'WT'], dtype='object')

In [27]:
# weight (lb) / [height (in)]2 x 703


188 / ((5*12)+11) / ((5*12)+11) * 703

26.217813925808372

In [94]:
data['BMI'] = data.apply(lambda x: round(x.WT / (x.HT/100)**2,1) , axis=1)

In [95]:
data

Unnamed: 0,Country,Name,POS,AGE,HT,WT,BMI
0,Argentina,Franco Armani,Goalkeeper,35,187.96,88.0,24.9
1,Argentina,Emiliano Martinez,Goalkeeper,30,193.04,88.9,23.9
2,Argentina,Geronimo Rulli,Goalkeeper,30,187.96,83.9,23.7
3,Argentina,Marcos Acuna,Defender,30,170.18,68.9,23.8
4,Argentina,Juan Foyth,Defender,24,177.80,68.9,21.8
...,...,...,...,...,...,...,...
1375,Wales,Jonathan Williams,Midfielder,29,167.64,59.9,21.3
1376,Wales,Gareth Bale,Forward,33,182.88,81.6,24.4
1377,Wales,Wes Burns,Forward,27,172.72,67.6,22.7
1380,Wales,Kieffer Moore,Forward,30,195.58,82.5,21.6


In [99]:
data.columns

Index(['Country', 'Name', 'POS', 'AGE', 'HT', 'WT', 'BMI'], dtype='object')

In [98]:
data.BMI

0       275.96
1       281.94
2       271.86
3       239.08
4       246.70
         ...  
1375    227.54
1376    264.48
1377    240.32
1380    278.08
1381    252.60
Length: 1089, dtype: float64

In [139]:
round(data.HT.mean() / data[data.Country == 'Argentina'].HT.mean() - 1, 2)

0.01

In [137]:
data.HT.mean() / data[data.Country == 'Argentina'].HT.mean() - 1

0.014115177751541363