In [1]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Player listing endpoint; the page offset is appended to this prefix when requesting a page

base_url = 'https://sofifa.com/players?offset='

In [3]:
# Basic player information: thirteen independent, initially empty lists,
# one per scraped attribute

(pid, name, age, photo, nationality, flag, overall,
 potential, club, club_logo, value, wage, special) = ([] for _ in range(13))

In [4]:
# offset - query-string value selecting which page of players to load.
# It is an int while pages remain and becomes the string 'Done' after the last page.
#
# NOTE: this cell only appends to the result lists (pid, name, ...); it never
# resets them, so running it twice in a row stores every player twice.

offset = 0

# Amount added to the offset to reach the next listing page
players_per_page = 60

while offset != 'Done':

    # The timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status() keeps an HTTP error page from being parsed as data.

    source_code = requests.get(base_url + str(offset), timeout = 30)

    source_code.raise_for_status()

    soup = BeautifulSoup(source_code.content, 'html.parser')

    # Body of the players table - each <tr> inside it is read as one player

    table_body = soup.find('tbody')

    if table_body is None:

        raise RuntimeError("No player table found at " + base_url + str(offset))

    # Extracting basic information for each player

    for row in table_body.find_all('tr'):

        cells = row.find_all('td')

        # Look up each <img> once; several attributes are read from the same tag

        player_img = cells[0].find('img')

        nation_img = cells[1].find('img')

        pid.append(player_img.get('id'))

        name.append(cells[1].find("a").get("data-tooltip"))

        age.append(cells[2].text)

        photo.append(player_img.get('data-src'))

        nationality.append(nation_img.get('title'))

        flag.append(nation_img.get('data-src'))

        overall.append(cells[3].text.strip())

        potential.append(cells[4].text.strip())

        club.append(cells[5].find('a').text)

        club_logo.append(cells[5].find('img').get('data-src'))

        value.append(cells[6].text.strip().replace('€', ''))

        wage.append(cells[7].text.strip().replace('€', ''))

        special.append(cells[8].text.strip())

    # Another page exists only when the last pagination button is 'Next'.
    # A page with no pagination buttons at all is treated as the final page
    # instead of failing with an IndexError.

    nav_buttons = soup.find_all("span", {"class": ["bp3-button-text"]})

    if nav_buttons and nav_buttons[-1].text == 'Next':

        offset += players_per_page

    else:

        offset = 'Done'

    print(offset)

60
120
180
240
300
Done


In [5]:
# Assemble the scraped lists into a single DataFrame - one column per list,
# indexed 0 .. len(pid) - 1

data = pd.DataFrame(
    data = {
        'ID': pid,
        'Name': name,
        'Age': age,
        'Photo': photo,
        'Nationality': nationality,
        'Flag': flag,
        'Overall': overall,
        'Potential': potential,
        'Club': club,
        'Club Logo': club_logo,
        'Value (€)': value,
        'Wage (€)': wage,
        'Special': special,
    },
    index = range(len(pid)),
)

In [6]:
# Prefix of an individual player's page; the player's unique ID is appended to it

player_data_url = "https://sofifa.com/player/"

In [7]:
# Fetch every player's detail page and copy the parsed attributes into `data`,
# one row per player. Columns are created on first use.
#
# The loop variables deliberately avoid the name `value`: that name is the list
# of market values used to build `data`, and rebinding it here would make a
# later re-run of the DataFrame-construction cell produce a wrong
# 'Value (€)' column.

# Section headings that mark an attribute block on the player page.
# They never change, so the list is built once instead of on every iteration.

feature_lists = ['\n\nAttacking\n\n', '\n\nSkill\n\n', '\n\nMovement\n\n', '\n\nPower\n\n', '\n\nMentality\n\n',
                 '\n\nDefending\n\n', '\n\nGoalkeeping\n\n']

for i in range(len(data)):

    skill_map = {}

    # The timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status() keeps an HTTP error page from being parsed as data.

    source_code = requests.get(player_data_url + str(data.loc[i, 'ID']), timeout = 30)

    source_code.raise_for_status()

    soup = BeautifulSoup(source_code.content, 'html.parser')

    # Extracting profile information: every <li> that carries a <label>
    # becomes one "label -> remaining text" entry

    columns12 = soup.find("div", {"class":"columns"}).find_all("div",{"class":"column col-12"})

    for column in columns12:

        for skill in column.find_all('li'):

            label_tag = skill.find('label')

            if label_tag is not None:

                label = label_tag.text

                skill_map[label] = skill.text.replace(label, '').replace(' ★', '').strip()

    # The last two space-separated tokens of the meta text are taken as height and weight

    meta_parts = soup.find('div', {'class': 'meta'}).text.split(' ')

    skill_map["Height (ft)"] = meta_parts[-2].replace('"', '')

    skill_map['Weight (lbs)'] = meta_parts[-1].replace('lbs', '')

    # Number of Likes - Number of Dislikes - Number of Follows:
    # the first three lines of the fourth col-12 block, keeping digits only

    reaction_lines = columns12[3].text.replace('\n\nLayout 1\n2\n3\n\n\n', '').split('\n')

    for position, counter in enumerate(("Likes", "Dislikes", "Following")):

        skill_map[counter] = ''.join(re.findall('[0-9]', reaction_lines[position]))

    # Player Features: only col-3 blocks containing one of the known headings are read

    for block in soup.find_all("div", {"class":"column col-3"}):

        block_text = block.text

        if any(heading in block_text for heading in feature_lists):

            for li in block.find_all("li"):

                text = li.text

                # First two characters are the rating, the rest is the attribute name

                skill_map[text[2:].strip(" ").rstrip()] = text[:2].strip(" ").rstrip()

    for key, skill_value in skill_map.items():

        data.loc[i, key] = skill_value

    print("Loading: " + str(i + 1) + "/" + str(data.shape[0]), end = "\r")

Loading: 360/360

In [8]:
# Discard sparse columns: a column is kept only if it holds at least
# `thresh` non-null values, i.e. 5% of the number of rows

thresh = 0.05 * len(data)

data.dropna(axis = 'columns', thresh = thresh, inplace = True)

data.head()

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Composure,Defensive Awareness,Standing Tackle,Sliding Tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Loaned From
0,227813,Oleksandr Zinchenko,23,https://cdn.sofifa.com/players/227/813/21_60.png,Ukraine,https://cdn.sofifa.com/flags/ua.png,80,85,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,76.0,78.0,79,76,15,11,13,11,11,
1,239818,Rúben Santos Gato Alves Dias,23,https://cdn.sofifa.com/players/239/818/21_60.png,Portugal,https://cdn.sofifa.com/flags/pt.png,84,89,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,81.0,87.0,87,83,7,8,13,7,12,
2,237692,Phil Foden,20,https://cdn.sofifa.com/players/237/692/21_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,82,92,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,81.0,58.0,59,47,8,15,6,10,13,
3,204485,Riyad Mahrez,29,https://cdn.sofifa.com/players/204/485/21_60.png,Algeria,https://cdn.sofifa.com/flags/dz.png,85,85,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,84.0,45.0,31,22,15,9,13,11,6,
4,12265,Owen Hargreaves,30,https://cdn.sofifa.com/players/012/265/12_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,78,78,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,,,78,75,14,9,12,15,12,


In [9]:
# Write the final dataset to an Excel workbook, without the index column.
# to_excel() does not create missing folders, so make sure the target
# directory exists before writing.

output_path = Path('./Dataset/Dataset-FIFA-21.xlsx')

output_path.parent.mkdir(parents = True, exist_ok = True)

data.to_excel(output_path, index = False)