# Web Scrape the ESPN Soccer teams
## Author: Pedro Sanhueza
## Website: [www.foxsports.com](https://www.foxsports.com/soccer/2022-fifa-world-cup/teams)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

In [2]:
url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]

In [4]:
# running time: 1m 20s

rows=[]
for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            rows.append(row)

In [5]:
data = pd.DataFrame(rows)

date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = f"../FIFA - Output/{url_main.split('/')[-2]} {date} - Webscrape.csv"

data.to_csv(filePath, index=False) # save to file path

pd.set_option('display.max_rows', 100)

data

Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Esteban Andrada,G,31,"6'4""",182 lbs
1,Argentina,Franco Armani,G,35,"6'2""",194 lbs
2,Argentina,Federico Gomes Gerth,G,18,-,-
3,Argentina,Jeremias Ledesma,G,29,"6'0""",182 lbs
4,Argentina,Agustin Marchesin,G,34,"6'2""",182 lbs
...,...,...,...,...,...,...
1703,Wales,Rabbi Matondo,F,21,"5'8""",145 lbs
1704,Wales,Kieffer Moore,F,29,"6'4""",182 lbs
1705,Wales,Tyler Roberts,F,23,"5'10""",165 lbs
1706,Wales,Sorba Thomas,F,22,"6'0""",-


In [6]:
# Clean table

# TO DO:
# drop duplicated players within a country (ex: Argentina 'Agustin Rossi')

# build dataframe
data = pd.DataFrame(rows)

# drop rows with missing data
data = data[~data.isin(['-']).any(axis=1)]

# replace possition values
# POS_values = {x.text.strip().capitalize() for x in table.find_all('th',{'data-index':'0'})}
# POS_keys = set(data[title[1]].to_list())
# POS_mapped = dict(zip(POS_keys, POS_values))
POS_mapped = {'G': 'Goalkeeper', 'D': 'Defender', 'M': 'Midfielder', 'F': 'Forward'}
data.replace({title[1]: POS_mapped}, inplace=True)

# age column to number
data[title[2]] = pd.to_numeric(data[title[2]])

# hight column to number
data[title[3]] = data[title[3]].apply(lambda x: (int(x.split('\'')[0])*12 + int(x.split('\'')[1].replace('\"',''))) * 2.54) # inches to centimeters

# weight column to number
data[title[4]] = data[title[4]].apply(lambda x: round(int(x.split(' ')[0]) / 2.205,1)) # lbs to kg

data


Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Esteban Andrada,Goalkeeper,31,193.04,82.5
1,Argentina,Franco Armani,Goalkeeper,35,187.96,88.0
3,Argentina,Jeremias Ledesma,Goalkeeper,29,182.88,82.5
4,Argentina,Agustin Marchesin,Goalkeeper,34,187.96,82.5
5,Argentina,Emiliano Martinez,Goalkeeper,29,193.04,88.9
...,...,...,...,...,...,...
1701,Wales,Wes Burns,Forward,27,172.72,67.6
1703,Wales,Rabbi Matondo,Forward,21,172.72,65.8
1704,Wales,Kieffer Moore,Forward,29,193.04,82.5
1705,Wales,Tyler Roberts,Forward,23,177.80,74.8


In [7]:
# Average per possition
data.groupby('POS').mean().round(1)

Unnamed: 0_level_0,AGE,HT,WT
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Defender,27.4,181.7,76.1
Forward,27.1,179.0,74.8
Goalkeeper,29.7,187.9,82.6
Midfielder,26.7,177.2,71.5


In [8]:
data.groupby('Country').mean().round(1)

Unnamed: 0_level_0,AGE,HT,WT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,27.3,179.2,75.6
Australia,29.0,180.0,75.9
Belgium,27.8,182.4,76.6
Brazil,27.9,179.4,75.2
Cameroon,28.2,181.4,78.8
Canada,26.3,180.6,75.5
Costa rica,30.4,180.0,75.3
Croatia,26.9,183.9,78.8
Denmark,26.7,184.2,77.9
Ecuador,28.1,176.6,72.2


In [9]:
# which country has the maximun average hight in their golee? 

data1 = data.groupby(['Country','POS']).mean().round(1).reset_index()

data1 = data1[data1.POS == 'Goalkeeper']

# data1[data1.HT == data1.HT.max()]

data1

Unnamed: 0,Country,POS,AGE,HT,WT
2,Argentina,Goalkeeper,30.2,189.5,87.0
6,Australia,Goalkeeper,32.4,186.9,83.9
10,Belgium,Goalkeeper,30.2,190.9,82.1
14,Brazil,Goalkeeper,30.0,188.8,86.3
18,Cameroon,Goalkeeper,30.2,184.8,86.2
22,Canada,Goalkeeper,28.0,188.0,80.0
26,Costa rica,Goalkeeper,33.3,183.7,76.8
30,Croatia,Goalkeeper,27.2,191.3,86.1
34,Denmark,Goalkeeper,28.2,189.2,83.3
38,Ecuador,Goalkeeper,30.6,186.5,80.5
