In [None]:
# Superseded first version of the Understat scraper, kept for reference only.
# The whole cell is one triple-quoted string literal, so nothing in it is
# executed: no imports happen, no functions are defined and no request is made.
# It pulled the player records out of the page by splitting the playersData
# script text by hand; the next cell reads the same variable with json.loads.
'''from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import json
from time import time
from pprint import pprint


def decode_names(df):
    decoded_names = []
    player_names = df['player_name']
    for n in range(len(player_names)):
        name = player_names[n]
        #replace apostrophe from HTML (N'Golo)
        if '&#039;' in name:
            name = name.replace("&#039;", "\'")
        #replace unicode accents and that
        if 'u00' in name:
            name = name.encode('latin-1').decode('unicode_escape')
        decoded_names.append(name)
    df['full_name'] = decoded_names
    
    return df


def get_headings(subject):
    template = subject[0]
    headings = []
    for element in template:
        heading = ''
        for char in str(element):
            if char == ':':
                break
            elif char != '\"':
                heading += char
        headings.append(heading)
        
    return headings


def get_values(subject):
    values = []
    for element in subject:
        value = ''
        initial = True
        for char in str(element):
            if char == ':':
                initial = False
            elif initial == False and char != '\"':
                value += char
        values.append(value)
        
    return values


def get_understat_players(url):
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    scripts = str(soup).split("script>")
    players = []
    for script in scripts:
        if 'var playersData' in script:
            elements = script[32:-5].encode('latin-1').decode('unicode_escape')
            elements = elements.split("{")[1:]
            for element in elements:
                player = element.split(",")
                player[-2] = player[-2][:-2] + '\"'
                if player[-1] == '':
                    del player[-1]
                else:
                    player[-1] = player[-1][:-3]
                #get rid of multiple teams played for
                for p in range(len(player)):
                    if len(player) > 18 and player[p][0] != '\"':
                        player[p-1] = '\"team_title\":\"' + player[p]
                        del player[p]
                        break
                players.append(player)
                
    return players



### start of code, general initialisation ###
start_time = time()
league = 'EPL'
year = 2018
understat = "https://understat.com/league/%s/%s" % (league, year)
players = get_understat_players(understat)
player_headings = get_headings(players)

player_values = []
for i in range(len(players)):
    player_values.append(get_values(players[i]))

#make player data into dataframe
df_players_full = pd.DataFrame(player_values, columns=player_headings)
#add column for decoded names
df_players_full = decode_names(df_players_full)
#convert numbers to appropriate numeric values
df_players_full = df_players_full.apply(pd.to_numeric, errors='ignore')
df_players_full['xG90'] = (90 * df_players_full['xG'] / df_players_full['time'])
df_players_full['xA90'] = (90 * df_players_full['xA'] / df_players_full['time'])
#drop unneeded columns
keep_player_headings = ['full_name', 'goals', 'xG', 'xG90', 'assists', 'xA', 'xA90', 'time', 'shots', 'key_passes',
                       'position', 'team_title']
df_players = df_players_full[keep_player_headings]


print("Time: %.3f s" % (time()-start_time))'''

In [78]:
import json
import os
from pprint import pprint
from time import time
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup


def get_teams_json(soup):
    """Return the ``teamsData`` object embedded in an Understat page.

    The page text is cut on ``script>`` and the first piece that declares
    ``var teamsData`` and wraps a value in ``JSON.parse('...')`` is used.
    The quoted argument is backslash-escaped text: it is unescaped with the
    ``unicode_escape`` codec and then parsed with ``json.loads``.

    Parameters:
        soup: the page source as a ``str`` (not a BeautifulSoup object).

    Returns:
        Whatever ``json.loads`` produces for the embedded payload.

    Raises:
        ValueError: if no script piece declares ``teamsData`` with a
            ``JSON.parse('...')`` call, or if the payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
    """
    opener = "JSON.parse('"
    for script in soup.split("script>"):
        declared_at = script.find('var teamsData')
        if declared_at == -1:
            continue
        # Locate the quoted argument by its delimiters instead of by fixed
        # character offsets, so extra whitespace around '=' or after ';'
        # does not corrupt the payload.
        start = script.find(opener, declared_at)
        end = script.rfind("')")
        if start == -1 or end < start + len(opener):
            continue
        payload = script[start + len(opener):end]
        # 'backslashreplace' turns any character outside latin-1 into a \u
        # escape that unicode_escape restores, instead of raising
        # UnicodeEncodeError; latin-1 text goes through exactly as before.
        unescaped = payload.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        return json.loads(unescaped)

    raise ValueError("no \"var teamsData = JSON.parse('...')\" script found in page source")

        
def _to_numeric_where_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def get_players_json(soup, team_abbrevs):
    """Build a player DataFrame from the ``playersData`` script of an Understat page.

    The page text is cut on ``script>`` and the first piece that declares
    ``var playersData`` and wraps a value in ``JSON.parse('...')`` is used.
    The quoted argument is unescaped with ``unicode_escape`` and parsed with
    ``json.loads`` into a list of player records.

    Parameters:
        soup: the page source as a ``str`` (not a BeautifulSoup object).
        team_abbrevs: dict mapping a full team name to its abbreviation.

    Returns:
        DataFrame with every field of the records (no columns are dropped),
        plus ``xG90`` and ``xA90`` (``90 * value / time``). Columns that can
        be parsed as numbers are numeric. ``team_title``, ``time``,
        ``player_name`` and ``id`` are renamed to ``team``, ``minutes``,
        ``name`` and ``understat_id``.

    Raises:
        ValueError: if no script piece declares ``playersData`` with a
            ``JSON.parse('...')`` call, or if the payload is not valid JSON.
    """
    opener = "JSON.parse('"
    data = None
    for script in soup.split("script>"):
        declared_at = script.find('var playersData')
        if declared_at == -1:
            continue
        # Find the quoted argument by its delimiters rather than by fixed
        # character offsets, so spacing changes do not corrupt the payload.
        start = script.find(opener, declared_at)
        end = script.rfind("')")
        if start == -1 or end < start + len(opener):
            continue
        payload = script[start + len(opener):end]
        # 'backslashreplace' keeps characters outside latin-1 intact instead
        # of raising UnicodeEncodeError; latin-1 text is handled as before.
        unescaped = payload.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        data = json.loads(unescaped)
        break

    if data is None:
        raise ValueError("no \"var playersData = JSON.parse('...')\" script found in page source")

    #replace team names with abbreviations
    for player in data:
        # A title may list several comma-separated teams; abbreviate each
        # part. Names missing from team_abbrevs are kept as they are rather
        # than aborting the whole load with a KeyError.
        parts = [part.strip() for part in player['team_title'].split(',')]
        player['team_title'] = ','.join(team_abbrevs.get(part, part) for part in parts)

    #make player data into dataframe, converting numeric-looking columns
    df_players_full = pd.DataFrame(data).apply(_to_numeric_where_possible)
    #add couple of extra columns
    df_players_full['xG90'] = (90 * df_players_full['xG'] / df_players_full['time'])
    df_players_full['xA90'] = (90 * df_players_full['xA'] / df_players_full['time'])
    #rename columns
    new_columns = {'team_title': 'team', 'time': 'minutes', 'player_name': 'name', 'id': 'understat_id'}

    return df_players_full.rename(columns=new_columns)


#function to sort and filter players
def sort_by(df, field, position, team, minutes):
    """Sort players by ``field`` (descending) and apply the requested filters.

    Parameters:
        df: DataFrame with ``minutes``, ``position`` and ``team`` columns.
        field: name of the column to sort by, highest value first.
        position: pattern passed to ``str.contains`` on the ``position``
            column, or ``False``/``None`` for no position filter.
        team: exact value to match in the ``team`` column, or
            ``False``/``None`` for no team filter.
        minutes: minimum value of the ``minutes`` column (inclusive).

    Returns:
        A new DataFrame holding the rows that pass every requested filter,
        in sorted order.
    """
    df_sorted = df.sort_values(by=[field], ascending=False)

    # Build the mask one condition at a time so each filter is applied
    # independently: a team filter works with or without a position filter.
    keep = df_sorted['minutes'] >= minutes
    if position not in (None, False):
        keep = keep & df_sorted['position'].str.contains(position)
    if team not in (None, False):
        keep = keep & (df_sorted['team'] == team)

    # Copy so callers can add or overwrite columns on the result without
    # pandas' chained-assignment warning.
    return df_sorted.loc[keep].copy()


### start of code, general initialisation ###
start_time = time()
league = 'EPL'
year = 2018
# Output root: taken from the FPL_DATA_DIR environment variable when it is set,
# otherwise the original machine-specific folder is used so existing runs
# keep writing to the same place.
data_root = os.environ.get("FPL_DATA_DIR", r"C:\Users\Luke Sefton\Documents\Web scraping\FPL\data")
data_path = os.path.join(data_root, "%i-%i" % (year, year+1))
understat_url = "https://understat.com/league/%s/%s" % (league, year)
epl_team_abbrevs = {'Arsenal': 'ARS', 'Bournemouth': 'BOU', 'Brighton': 'BHA', 'Burnley': 'BUR', 'Cardiff': 'CAR', 'Chelsea': 'CHE', 'Crystal Palace': 'CRY', 'Everton': 'EVE', 'Fulham': 'FUL', 'Huddersfield': 'HUD', 'Leicester': 'LEI', 'Liverpool': 'LIV', 'Manchester City': 'MCI', 'Manchester United': 'MUN', 'Newcastle United': 'NEW', 'Southampton': 'SOU', 'Tottenham': 'TOT', 'Watford': 'WAT', 'West Ham': 'WHU', 'Wolverhampton Wanderers': 'WOL'}
# Close the HTTP response as soon as the page has been parsed.
with urlopen(understat_url) as response:
    soup = BeautifulSoup(response, "html.parser")
# From here on `soup` is the page source as text, with HTML-escaped
# apostrophes restored before the embedded JSON is read.
soup = str(soup).replace("&#039;", "'")
#teams_json = get_teams_json(soup)
df_players = get_players_json(soup, epl_team_abbrevs)

#filter dataframe
field = 'goals'
position = False
team = False
min_minutes = 0
df_players_filtered = sort_by(df_players, field, position, team, min_minutes)

#format columns and round decimals
formatters = {'xG': "{:.2f}", 'xA': '{:.2f}', 'npxG': '{:.2f}', 'xGChain': '{:.2f}', 'xGBuildup': '{:.2f}',
                        'xG90': "{:.2f}", 'xA90': "{:.2f}"}
# Each listed column is rendered with its format string and parsed straight
# back to a number, i.e. rounded to two decimals. assign() builds a new frame
# instead of writing into the filtered slice, and only the formatted columns
# are re-parsed, so no frame-wide to_numeric(errors='ignore') pass is needed.
df_players_filtered = df_players_filtered.assign(**{
    header: pd.to_numeric(df_players_filtered[header].map(formatters[header].format))
    for header in formatters
})

#write dataframe to text file
os.makedirs(data_path, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(data_path, 'understat_%s_%i.txt' % (league, year))
df_players_filtered.to_csv(output_file, encoding='utf-8-sig', header=True, index = False)

print("Time: %.2f s" % (time()-start_time))

df_players_filtered

Time: 1.18 s


Unnamed: 0,assists,games,goals,understat_id,key_passes,npg,npxG,name,position,red_cards,shots,team,minutes,xA,xG,xGBuildup,xGChain,yellow_cards,xG90,xA90
0,0,1,2,838,2,2,0.96,Sadio Mané,F,0,4,LIV,84,0.12,0.96,0.80,1.82,0,1.03,0.13
2,0,1,2,6026,0,2,0.60,Richarlison,M,0,2,EVE,89,0.00,0.60,0.00,0.06,1,0.61,0.00
1,0,1,2,1723,1,2,0.27,Roberto Pereyra,M,0,4,WAT,89,0.32,0.27,0.16,0.71,0,0.27,0.33
12,0,1,1,757,0,1,0.10,Jeffrey Schlupp,M,0,1,CRY,90,0.00,0.10,0.07,0.18,1,0.10,0.00
21,1,1,1,6853,1,1,0.14,Rúben Neves,M,0,3,WOL,90,0.27,0.14,0.36,0.65,0,0.14,0.27
20,0,1,1,4105,1,1,0.63,Raúl Jiménez,F,0,3,WOL,90,0.02,0.63,0.00,0.65,0,0.63,0.02
19,0,1,1,3635,0,1,0.11,Bernardo Silva,M,0,1,MCI,90,0.00,0.11,0.69,0.79,0,0.11,0.00
17,0,1,1,1683,3,1,0.55,Ryan Fraser,M,0,2,BOU,90,0.47,0.55,0.00,0.76,0,0.55,0.47
16,0,1,1,1389,0,0,0.00,Jorginho,M,0,1,CHE,90,0.00,0.76,0.05,0.05,0,0.76,0.00
15,0,1,1,1250,2,1,1.55,Mohamed Salah,F,0,4,LIV,89,0.13,1.55,0.00,1.61,0,1.57,0.13
