# Web Scrape the ESPN Soccer teams
## Author: Pedro Sanhueza
## Website: [www.foxsports.com](https://www.foxsports.com/soccer/2022-fifa-world-cup/teams)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

In [2]:
url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]

In [4]:
country = countries[10]

url = f'https://www.foxsports.com{country}-roster'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

table = soup.find('div',{'view':'team'})
title = [x.text.strip() for x in table.find_all('tr')[0]]

In [5]:
# running time: 33s -to- 1m 30s-to- 3m 05s

rows=[]
for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            rows.append(row)

In [6]:
data = pd.DataFrame(rows)
data


Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,G,36,"6'2""",194 lbs
1,Argentina,Emiliano Martinez,G,30,"6'4""",196 lbs
2,Argentina,Geronimo Rulli,G,30,"6'2""",185 lbs
3,Argentina,Juan Foyth,D,24,"6'2""",152 lbs
4,Argentina,Lisandro Martinez,D,24,"5'8""",158 lbs
...,...,...,...,...,...,...
826,Wales,Gareth Bale,F,33,"6'0""",180 lbs
827,Wales,Mark Harris,F,23,"5'11""",165 lbs
828,Wales,Daniel James,F,25,"5'7""",167 lbs
829,Wales,Brennan Johnson,F,21,"5'10""",160 lbs


In [7]:
date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = f"../FIFA - Output/{url_main.split('/')[-2]} {date} - Webscrape.csv"

data.to_csv(filePath, index=False) # save to file path

pd.set_option('display.max_rows', 100)

data

Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,G,36,"6'2""",194 lbs
1,Argentina,Emiliano Martinez,G,30,"6'4""",196 lbs
2,Argentina,Geronimo Rulli,G,30,"6'2""",185 lbs
3,Argentina,Juan Foyth,D,24,"6'2""",152 lbs
4,Argentina,Lisandro Martinez,D,24,"5'8""",158 lbs
...,...,...,...,...,...,...
826,Wales,Gareth Bale,F,33,"6'0""",180 lbs
827,Wales,Mark Harris,F,23,"5'11""",165 lbs
828,Wales,Daniel James,F,25,"5'7""",167 lbs
829,Wales,Brennan Johnson,F,21,"5'10""",160 lbs


In [8]:
# Clean table

# TO DO:
# drop duplicated players within a country (ex: Argentina 'Agustin Rossi')

# build dataframe
data = pd.DataFrame(rows)

# drop rows with missing data
data = data[~data.isin(['-']).any(axis=1)]

# replace possition values
# POS_values = {x.text.strip().capitalize() for x in table.find_all('th',{'data-index':'0'})}
# POS_keys = set(data[title[1]].to_list())
# POS_mapped = dict(zip(POS_keys, POS_values))
POS_mapped = {'G': 'Goalkeeper', 'D': 'Defender', 'M': 'Midfielder', 'F': 'Forward'}
data.replace({title[1]: POS_mapped}, inplace=True)

# age column to number
data[title[2]] = pd.to_numeric(data[title[2]])

# hight column to number
data[title[3]] = data[title[3]].apply(lambda x: (int(x.split('\'')[0])*12 + int(x.split('\'')[1].replace('\"',''))) * 2.54) # inches to centimeters

# weight column to number
data[title[4]] = data[title[4]].apply(lambda x: round(int(x.split(' ')[0]) / 2.205,1)) # lbs to kg

data


Unnamed: 0,Country,Name,POS,AGE,HT,WT
0,Argentina,Franco Armani,Goalkeeper,36,187.96,88.0
1,Argentina,Emiliano Martinez,Goalkeeper,30,193.04,88.9
2,Argentina,Geronimo Rulli,Goalkeeper,30,187.96,83.9
3,Argentina,Juan Foyth,Defender,24,187.96,68.9
4,Argentina,Lisandro Martinez,Defender,24,172.72,71.7
...,...,...,...,...,...,...
826,Wales,Gareth Bale,Forward,33,182.88,81.6
827,Wales,Mark Harris,Forward,23,180.34,74.8
828,Wales,Daniel James,Forward,25,170.18,75.7
829,Wales,Brennan Johnson,Forward,21,177.80,72.6


In [11]:
data.to_csv(filePath, index=False) # save to file path

In [9]:
# Average per possition
data.groupby('POS').mean().round(1)

  data.groupby('POS').mean().round(1)


Unnamed: 0_level_0,AGE,HT,WT
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Defender,27.0,181.8,76.4
Forward,26.6,179.7,74.7
Goalkeeper,29.6,188.6,82.6
Midfielder,26.3,178.5,72.2


In [10]:
data.groupby('Country').mean().round(1)

  data.groupby('Country').mean().round(1)


Unnamed: 0_level_0,AGE,HT,WT
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,27.8,177.5,73.5
Australia,27.4,180.2,75.8
Belgium,27.8,183.3,76.6
Brazil,27.9,179.6,74.1
Cameroon,26.3,180.5,77.9
Canada,27.0,180.6,76.3
Costa rica,28.0,181.1,74.9
Croatia,27.4,183.3,76.9
Denmark,27.2,184.5,78.4
Ecuador,25.6,179.3,74.7


In [None]:
# which country has the maximun average hight in their golee? 

data1 = data.groupby(['Country','POS']).mean().round(1).reset_index()

data1 = data1[data1.POS == 'Goalkeeper']

# data1[data1.HT == data1.HT.max()]

data1

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]


# ---------------------------------------- running time: 1m 20s ---------------------------------------- #

rows=[]

for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    # image = soup.find('source')['srcset']
    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            row['Country_logo'] = soup.find('source')['srcset']
            rows.append(row)

data = pd.DataFrame(rows)

data = data[~data.isin(['-']).any(axis=1)] # drop rows with missing data

POS_mapped = {'G': 'Goalkeeper', 'D': 'Defender', 'M': 'Midfielder', 'F': 'Forward'}

data.replace({title[1]: POS_mapped}, inplace=True)

data[title[2]] = pd.to_numeric(data[title[2]]) # age column to number

data[title[3]] = data[title[3]].apply(lambda x: (int(x.split('\'')[0])*12 + int(x.split('\'')[1].replace('\"',''))) * 2.54) # inches to centimeters

# weight column to number
data[title[4]] = data[title[4]].apply(lambda x: round(int(x.split(' ')[0]) / 2.205,1)) # lbs to kg

# add BMI column
data['BMI'] = data.apply(lambda x: round(x.WT / (x.HT/100)**2,1) , axis=1)

data

In [None]:
data.Country_logo.iloc[10]

In [None]:
data.Country_logo.iloc[30]

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
from datetime import datetime

url_main = 'https://www.foxsports.com/soccer/2022-fifa-world-cup/teams'
response = requests.get(url_main)
soup = BeautifulSoup(response.text, 'html.parser')
country_name = [x.text for x in soup.find_all('h3')]
countries = [x['href'] for x in soup.find_all('a',{'class':'entity-list-row-container image-logo'})]

In [None]:
rows=[]

for idx,country in enumerate(countries):

    url = f'https://www.foxsports.com{country}-roster'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    image = soup.find('source')['srcset']
    table = soup.find('div',{'view':'team'})
    title = [x.text.strip() for x in table.find_all('tr')[0]]
    
    for group in table.find_all('tbody')[:-1]:
        for player in group:
            row={}
            row['Country'] = country_name[idx].capitalize()
            row['Name'] = player.find('h3').text
            row[title[1]] = player.find('td',{'data-index':'1'}).text.strip()
            row[title[2]] = player.find('td',{'data-index':'2'}).text.strip()
            row[title[3]] = player.find('td',{'data-index':'3'}).text.strip()
            row[title[4]] = player.find('td',{'data-index':'4'}).text.strip()
            row['Country_logo'] = soup.find('source')['srcset']
            rows.append(row)

data = pd.DataFrame(rows)

In [None]:
data