In [1]:
import os, requests, time
from bs4 import BeautifulSoup
import csv

In [2]:
# The site rejects GET requests that do not carry a User-Agent header,
# so every request is sent with a browser-like one.
url_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

def get_html(url, retry_wait=5):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    The request is re-sent for as long as the server answers with HTTP 429,
    sleeping ``retry_wait`` seconds between attempts. Any other status code
    ends the loop and that response's text is parsed as-is.

    Parameters
    ----------
    url : str
        Address to fetch.
    retry_wait : int or float, optional
        Seconds to sleep after a 429 response before retrying (default 5).

    Returns
    -------
    BeautifulSoup
        The response text parsed with the 'html.parser' backend.
    """
    while True:
        # The request must be issued inside the loop: checking the status of
        # one stale response over and over can never leave the 429 state.
        response = requests.get(url, headers=url_headers)
        if response.status_code != 429:
            break
        print('waiting...')
        time.sleep(retry_wait)
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def process_header(header):
    """Rename selected column titles of the scraped table header.

    ``header`` is modified in place and the same list object is returned.
    For each rename, only the first matching entry is replaced. A title
    that is absent from ``header`` raises ``ValueError`` (from ``list.index``).
    """
    renames = (
        ('foot', 'Preferred foot'),
        ('Best position', 'Position'),
        ('Team & Contract', 'Team'),
        ('Total attacking', 'Attacking'),
        ('Total movement', 'Movement'),
        ('Total defending', 'Defending'),
        ('Total goalkeeping', 'Goalkeeping'),
    )
    for old_title, new_title in renames:
        position = header.index(old_title)
        header[position] = new_title
    return header

In [4]:
def process_player_row(row, header):
    """Turn one table row into a player dict keyed by the column titles.

    The first ``td`` of the row is dropped; the remaining cells are matched
    positionally against ``header``. Cells under an empty title are skipped.
    Purely numeric cell text becomes ``int``. For 'Overall rating', anything
    from the first '+' or '-' onwards is cut off before the int conversion.

    Name, Nationality and Team are then re-read from the nested ``a``/``img``
    tags, Height keeps only the part before the first '/', an empty team
    becomes "No Team", and the four aggregate scores are divided by
    5, 5, 3 (rounded to one decimal) and 5 respectively.
    """
    cells = row.find_all('td')[1:]
    record = {}

    for position, cell in enumerate(cells):
        title = header[position]
        if title == '':
            continue
        raw_text = cell.text
        record[title] = raw_text
        if raw_text.isdigit():
            record[title] = int(raw_text)
        elif title == 'Overall rating':
            # drop a trailing modifier such as "+2" or "-1"
            trimmed = raw_text
            for sign in ('+', '-'):
                if sign in trimmed:
                    trimmed = trimmed[:trimmed.index(sign)]
            if trimmed.isdigit():
                record[title] = int(trimmed)

    # the name cell carries both the player link and the flag image
    name_cell = cells[header.index('Name')]
    record['Name'] = name_cell.find("a").text.split("\xa0")[-1]
    record['Nationality'] = cells[header.index('Name')].find("img").get('title')

    record['Team'] = cells[header.index('Team')].find("a").text
    record['Height'] = record['Height'].split("/")[0].strip()
    if record['Team'] == "":
        record['Team'] = "No Team"

    # scale the aggregate totals down
    record['Attacking'] = record['Attacking'] / 5
    record['Movement'] = record['Movement'] / 5
    record['Defending'] = round(record['Defending'] / 3, 1)
    record['Goalkeeping'] = record['Goalkeeping'] / 5
    return record

In [5]:
def get_team_info(team_url):
    """Fetch a team page and describe its league as "<league> (<country>)".

    Returns "Unknown" when ``get_html`` yields ``None`` and "Free Agent" when
    the page has no ``div`` with class ``profile``. Otherwise the league is
    the text of the last ``a`` inside that div and the country is the
    ``title`` attribute of the first ``a``.
    """
    html_page = get_html(team_url)
    # identity check: `== None` would dispatch to the object's own __eq__
    if html_page is None:
        return "Unknown"

    # locate the profile block of the team page
    head = html_page.find('div', {'class': 'profile'})
    if head is None:
        return "Free Agent"
    league = head.find_all('a')[-1].text
    country = head.find('a').get('title')
    return "%s (%s)" % (league, country)

In [6]:
# URL of the player listing with all the required columns enabled. It can be rebuilt
# by going to 'https://sofifa.com', adding the columns, and copy/pasting the page URL.
players_url = "https://sofifa.com/players?showCol%5B%5D=pi&showCol%5B%5D=ae&showCol%5B%5D=hi&showCol%5B%5D=pf&showCol%5B%5D=oa&showCol%5B%5D=bp&showCol%5B%5D=vl&showCol%5B%5D=wg&showCol%5B%5D=ta&showCol%5B%5D=to&showCol%5B%5D=td&showCol%5B%5D=tg"

# every parsed player, in the order the pages returned them
players = []

# parsed pages, kept as a safety net
raw_pages = []

# the first player parsed; reading an identical record again is the stop signal
# (presumably the listing wraps around past its last page -- not verified here)
first_player = None
first_player_found = False

offset = 0
while True:
    html_page = get_html(players_url+'&offset=%d' % (offset))

    # parse the page
    table = html_page.find('table')
    if table is None:
        # fail with a readable message instead of an AttributeError on None
        raise RuntimeError("no <table> found in the page fetched at offset %d" % (offset))

    rows = table.find_all('tr')
    if len(rows) < 2:
        # A page without data rows adds no players, so `offset` would not move
        # and the same URL would be requested forever; stop instead.
        print("no player rows at offset %d, stopping" % (offset))
        break

    # extract the header (first row) and preprocess it
    header = process_header([tag.text for tag in rows[0].find_all('th')[1:]])

    for row in rows[1:]:
        player = process_player_row(row, header)
        team_url = 'https://sofifa.com' + row.find_all('td')[4].find('a').get('href')
        # 'League' temporarily holds the team page URL
        player['League'] = team_url

        # remember the very first player
        if first_player is None:
            first_player = player
        # stop iterating further if the first player of the dataset is encountered again
        elif first_player == player:
            first_player_found = True
            break
        players.append(player)

    # if the first player has been encountered again, stop iterating further
    if first_player_found:
        break

    # store the page just in case
    raw_pages.append(html_page)

    # the next offset is the number of players parsed so far
    offset = len(players)

    if len(raw_pages) % 50 == 0:
        print("%d pages read and %d players parsed" % (len(raw_pages), len(players)))


len(players)

50 pages read and 3000 players parsed
100 pages read and 6000 players parsed
150 pages read and 9000 players parsed
200 pages read and 11960 players parsed
250 pages read and 14911 players parsed
300 pages read and 17910 players parsed


20010

In [7]:
# display the first parsed record to eyeball its fields
players[0]

{'Name': 'V. Barco',
 'Age': 18,
 'Overall rating': 73,
 'Team': 'Brighton & Hove Albion',
 'ID': 263370,
 'Height': '172cm',
 'Preferred foot': 'Left',
 'Position': 'CAM',
 'Value': '€7M',
 'Wage': '€19K',
 'Attacking': 54.0,
 'Movement': 77.0,
 'Defending': 68.0,
 'Goalkeeping': 9.8,
 'Nationality': 'Argentina',
 'League': 'https://sofifa.com/team/1808/brighton-hove-albion/'}

In [8]:
# collapse duplicates: one record per player 'ID', the last occurrence wins
players_without_duplicates = {player['ID']: player for player in players}

In [9]:
# cache of team information keyed by team URL, so each team is looked up only once
team_urls = {}

# count of team lookups completed so far, used for progress messages
team_pages_read = 0

In [10]:
# If the run gets stuck on 'waiting...', MANUALLY interrupt and rerun this cell
# (and only this cell): teams already stored in team_urls are skipped on the rerun.
bad_players = []

for player in players_without_duplicates.values():
    team_url = player['League']
    if team_url in team_urls:
        continue
    try:
        team_urls[team_url] = get_team_info(team_url)
    except Exception:
        # Best effort: remember the player for a later retry. Only Exception is
        # caught, so a manual interrupt (KeyboardInterrupt) really stops the
        # cell instead of being swallowed and recorded as a bad player.
        bad_players.append(player)
        continue
    team_pages_read += 1
    if team_pages_read % 10 == 0:
        print("%d pages read" % (team_pages_read))

len(bad_players)

10 pages read
20 pages read
30 pages read
40 pages read
50 pages read
60 pages read
70 pages read
80 pages read
90 pages read
100 pages read
110 pages read
120 pages read
130 pages read
140 pages read
150 pages read
waiting...
waiting...
160 pages read
170 pages read
180 pages read
190 pages read
200 pages read
210 pages read
220 pages read
230 pages read
240 pages read
waiting...
waiting...
250 pages read
260 pages read
270 pages read
280 pages read
290 pages read
300 pages read
310 pages read
320 pages read
330 pages read
340 pages read
waiting...
350 pages read
360 pages read
370 pages read
waiting...
waiting...
380 pages read
390 pages read
400 pages read
410 pages read
420 pages read
430 pages read
440 pages read
waiting...
waiting...
450 pages read
460 pages read
470 pages read
480 pages read
490 pages read
500 pages read
510 pages read
520 pages read
530 pages read
waiting...
540 pages read
550 pages read
560 pages read
waiting...
570 pages read
waiting...
waiting...
580 pages r

23

In [11]:
# do a second pass on teams whose lookup failed in the previous cell
really_bad_players = []

for player in bad_players:
    team_url = player['League']
    if team_url in team_urls:
        continue
    try:
        team_urls[team_url] = get_team_info(team_url)
    except Exception:
        # Only Exception is caught so that a manual interrupt
        # (KeyboardInterrupt) still stops the cell.
        really_bad_players.append(player)
        continue
    team_pages_read += 1
    if team_pages_read % 10 == 0:
        print("%d pages read" % (team_pages_read))

len(really_bad_players)

0

In [12]:
# Replace the team URL stored under 'League' with the looked-up league description.
# Only values that are still team URLs are touched, so re-running this cell does
# not overwrite leagues that were already resolved with 'Unknown'.
for player in players_without_duplicates.values():
    league = player['League']
    if league in team_urls:
        player['League'] = team_urls[league]
    elif league.startswith('https://sofifa.com'):
        # a team URL for which no information was fetched
        print(league)
        player['League'] = 'Unknown'

In [13]:
# sanity check: print every record whose 'Overall rating' is still a plain string
for candidate in players_without_duplicates.values():
    if type(candidate['Overall rating']) is str:
        print(candidate)

In [14]:
# order the players from highest to lowest 'Overall rating', ties broken by 'ID' (also descending)
sorted_players = list(players_without_duplicates.values())
sorted_players.sort(key=lambda entry: (entry['Overall rating'], entry['ID']), reverse=True)
sorted_players[0]

{'Name': 'E. Haaland',
 'Age': 22,
 'Overall rating': 91,
 'Team': 'Manchester City',
 'ID': 239085,
 'Height': '195cm',
 'Preferred foot': 'Left',
 'Position': 'ST',
 'Value': '€185M',
 'Wage': '€340K',
 'Attacking': 78.6,
 'Movement': 83.6,
 'Defending': 38.0,
 'Goalkeeping': 10.4,
 'Nationality': 'Norway',
 'League': 'Premier League (England)'}

In [15]:
# rebuild every record so that its keys follow new_column_order
new_column_order = ['ID', 'Name', 'Age', 'Nationality', 'Team', 'League', 'Value', 'Wage', 'Attacking', 'Movement', 'Defending', 'Goalkeeping', 'Overall rating', 'Position', 'Height', 'Preferred foot']
for position, record in enumerate(sorted_players):
    ordered_items = sorted(record.items(), key=lambda pair: new_column_order.index(pair[0]))
    sorted_players[position] = dict(ordered_items)

sorted_players[0]

{'ID': 239085,
 'Name': 'E. Haaland',
 'Age': 22,
 'Nationality': 'Norway',
 'Team': 'Manchester City',
 'League': 'Premier League (England)',
 'Value': '€185M',
 'Wage': '€340K',
 'Attacking': 78.6,
 'Movement': 83.6,
 'Defending': 38.0,
 'Goalkeeping': 10.4,
 'Overall rating': 91,
 'Position': 'ST',
 'Height': '195cm',
 'Preferred foot': 'Left'}

In [19]:
# keep only the part of each name after the last non-breaking space
for record in sorted_players:
    record['Name'] = record['Name'].split("\xa0")[-1]

In [20]:
# write all the players into the data file; the column names come from the first
# record. The `with` block closes the file even if writing raises.
with open('soccer_stars.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, sorted_players[0].keys())
    writer.writeheader()
    writer.writerows(sorted_players)

In [21]:
# write the first 50 players into the small data file; the column names come from
# the first record. The `with` block closes the file even if writing raises.
with open('small_soccer_stars.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, sorted_players[0].keys())
    writer.writeheader()
    writer.writerows(sorted_players[:50])