In [117]:
import urllib.request
import json
import pandas as pd
from time import time
from pprint import pprint


def get_data(website):
    """Fetch ``website`` and return its body parsed as JSON.

    Args:
        website: URL passed straight to ``urllib.request.urlopen``.

    Returns:
        The object produced by ``json.loads`` for the response body.
    """
    with urllib.request.urlopen(website) as response:
        #read the raw bytes, decode them with the default codec, then parse
        raw_body = response.read()
        text_body = raw_body.decode()
        parsed = json.loads(text_body)

    return parsed


def get_teams(data):
    """Build a DataFrame of team details from the downloaded dataset.

    Args:
        data: Mapping with a ``'teams'`` entry that ``pd.DataFrame`` can
            consume (e.g. a list of per-team dicts).

    Returns:
        A new DataFrame holding only the columns listed in ``headings``,
        with every column that parses cleanly converted to a numeric dtype.

    Raises:
        KeyError: If ``data`` has no ``'teams'`` entry or one of the
            ``headings`` columns is missing.
    """
    #headings to keep
    headings = ['code', 'name', 'short_name', 'strength', 'strength_attack_away', 'strength_attack_home', 'strength_defence_away', 'strength_defence_home', 'strength_overall_away', 'strength_overall_home']

    def to_numeric_or_keep(column):
        #pd.to_numeric(errors='ignore') is deprecated, so reproduce it
        #explicitly: columns that cannot be parsed are returned unchanged
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    #extract team data from full dataset and convert to dataframe
    df_teams_full = pd.DataFrame(data['teams'])
    #convert strings to numeric values
    df_teams_full = df_teams_full.apply(to_numeric_or_keep)
    #remove unwanted columns; copy so callers can modify the result freely
    df_teams = df_teams_full[headings].copy()

    return df_teams


def get_team_codes(df):
    """Map each value of ``df['code']`` to the ``df['short_name']`` on the same row.

    Args:
        df: DataFrame with ``'code'`` and ``'short_name'`` columns.

    Returns:
        dict keyed by code with the short name as value; when a code
        repeats, the last row wins.
    """
    return {
        code: short_name
        for code, short_name in zip(df['code'], df['short_name'])
    }
    

def get_players(data):
    """Build a DataFrame of player statistics from the downloaded dataset.

    Two derived columns are added: ``PP90`` (total points scaled to 90
    minutes played) and ``PP90PM`` (``PP90`` divided by the price).
    ``now_cost`` is divided by 10 before being renamed to ``price``.

    Args:
        data: Mapping with an ``'elements'`` entry that ``pd.DataFrame`` can
            consume (e.g. a list of per-player dicts).

    Returns:
        A new DataFrame with the columns in ``headings``, renamed as
        follows: ``id`` -> ``fpl_id``, ``web_name`` -> ``name``,
        ``element_type`` -> ``position``, ``now_cost`` -> ``price``,
        ``goals_scored`` -> ``goals``. ``PP90`` and ``PP90PM`` are NaN for
        players with zero minutes.

    Raises:
        KeyError: If ``data`` has no ``'elements'`` entry or a required
            column is missing.
    """
    #headings to keep
    headings = ['id', 'web_name', 'team', 'element_type', 'now_cost', 'total_points', 'minutes', 'PP90', 'PP90PM', 'goals_scored', 'assists', 'clean_sheets', 'bonus', 'bps', 'form', 'goals_conceded', 'yellow_cards', 'red_cards', 'saves', 'penalties_missed', 'penalties_saved', 'team_code']

    def to_numeric_or_keep(column):
        #pd.to_numeric(errors='ignore') is deprecated, so reproduce it
        #explicitly: columns that cannot be parsed are returned unchanged
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    #extract player data from full dataset and convert to dataframe
    df_players_full = pd.DataFrame(data['elements'])
    #convert strings to numeric values
    df_players_full = df_players_full.apply(to_numeric_or_keep)

    #add some extra columns
    df_players_full['now_cost'] = df_players_full['now_cost'] / 10
    #blank out zero minutes so the division yields NaN instead of +/-inf
    minutes_played = df_players_full['minutes'].where(df_players_full['minutes'] > 0)
    df_players_full['PP90'] = 90 * df_players_full['total_points'] / minutes_played
    df_players_full['PP90PM'] = df_players_full['PP90'] / df_players_full['now_cost']

    #remove unwanted columns, then rename the remaining ones
    new_columns = {'element_type': 'position', 'goals_scored': 'goals', 'now_cost': 'price', 'web_name': 'name', 'id': 'fpl_id'}
    df_players = df_players_full[headings].rename(columns=new_columns)

    return df_players
    
    
#function to sort and filter players
def sort_by(df, field, position, team, cost, minutes, team_codes, position_codes):
    """Sort players by ``field`` (descending) and filter the result.

    Rows are always restricted to ``price <= cost`` and
    ``minutes >= minutes``; the team and position filters are optional.

    Args:
        df: DataFrame with ``'price'`` and ``'minutes'`` columns, plus
            ``'team'`` / ``'position'`` when those filters are used.
        field: Name of the column to sort on.
        position: Key into ``position_codes`` selecting the position to
            keep, or ``False``/``None`` for no position filter.
        team: Value of the ``'team'`` column to keep, or ``False``/``None``
            for no team filter.
        cost: Maximum price (inclusive).
        minutes: Minimum minutes played (inclusive).
        team_codes: Unused; kept so existing callers keep working.
        position_codes: Mapping from ``position`` keys to the values stored
            in the ``'position'`` column.

    Returns:
        A new, independent DataFrame (safe to modify) with the matching
        rows in sorted order.

    Raises:
        KeyError: If ``position`` is given but is not in ``position_codes``.
    """
    df_sorted = df.sort_values(by=[field], ascending=False)

    #price and minutes limits always apply
    mask = (df_sorted['price'] <= cost) & (df_sorted['minutes'] >= minutes)
    #optional filters; 'in' compares by equality, so 0 also counts as "off"
    #exactly like the previous "!= False" checks did
    if team not in (None, False):
        mask = mask & (df_sorted['team'] == team)
    if position not in (None, False):
        mask = mask & (df_sorted['position'] == position_codes[position])

    #copy so callers can assign columns without SettingWithCopyWarning
    return df_sorted.loc[mask].copy()

         

### start of code, general initialisation ###
start_time = time()
year = 2018
#output folder for the season, named "<year>-<year+1>"
data_path = r"C:\Users\Luke Sefton\Documents\Web scraping\FPL\data\%i-%i" % (year, year+1)
bootstrap_static = "https://fantasy.premierleague.com/drf/bootstrap-static"
all_data = get_data(bootstrap_static)
df_teams = get_teams(all_data)
df_players = get_players(all_data)
#replace team and position codes with strings
team_id_codes = get_team_codes(df_teams)
df_players['team'] = df_players['team_code'].replace(team_id_codes)
position_codes = dict(zip([1, 2, 3, 4], ['GK', 'D', 'M', 'F']))
df_players['position'] = df_players['position'].replace(position_codes)

#sort/filter settings; False means "do not filter on this"
field = 'price'
position = False
team = False
max_cost = 150
min_minutes = 1
#team_id_codes is the mapping built above (the old name team_codes was never
#defined here); copy so the column assignments below act on an independent frame
df_players_filtered = sort_by(df_players, field, position, team, max_cost, min_minutes, team_id_codes, position_codes).copy()

#format columns and round decimals: render with a fixed number of decimals,
#then parse back to numbers so the CSV holds rounded numeric values
formatters = {'price': "{:.1f}", 'PP90': '{:.2f}', 'PP90PM': '{:.2f}'}
for header, fmt in formatters.items():
    df_players_filtered[header] = pd.to_numeric(df_players_filtered[header].map(fmt.format))

#write dataframe to text file
df_players_filtered.to_csv(r'%s\fpl_players_%i.txt' % (data_path, year), encoding='utf-8-sig', header=True, index = False)

print('Time: %.2fs' % (time()-start_time))

#last expression of the cell: shows the table with prices rendered as "£x.xm"
df_players_filtered.style.format({'price': "£{:.1f}m"})

Time: 0.51s


Unnamed: 0,fpl_id,name,team,position,price,total_points,minutes,PP90,PP90PM,goals,assists,clean_sheets,bonus,bps,form,goals_conceded,yellow_cards,red_cards,saves,penalties_missed,penalties_saved,team_code
307,253,Salah,LIV,M,£13.0m,8,86,8.37,0.64,1,0,1,0,20,8,0,0,0,0,0,0,14
440,372,Kane,TOT,F,£12.5m,1,90,1.0,0.08,0,0,0,0,5,1,1,1,0,0,0,0,6
21,23,Aubameyang,ARS,F,£11.0m,2,90,2.0,0.18,0,0,0,0,5,2,2,0,0,0,0,0,3
327,270,Sterling,MCI,M,£11.0m,7,86,7.33,0.67,1,0,1,0,22,7,0,1,0,0,0,0,43
363,306,Lukaku,MUN,F,£11.0m,1,23,3.91,0.36,0,0,0,0,-2,1,1,0,0,0,0,0,1
337,280,Agüero,MCI,F,£11.0m,2,78,2.31,0.21,0,0,1,0,4,2,0,0,0,0,0,0,43
352,295,Sánchez,MUN,M,£10.5m,5,90,5.0,0.48,0,1,0,0,22,5,1,0,0,0,0,0,1
338,281,Jesus,MCI,F,£10.5m,1,11,8.18,0.78,0,0,0,0,1,1,0,0,0,0,0,0,43
136,122,Hazard,CHE,M,£10.5m,4,14,25.71,2.45,0,1,0,0,21,4,0,0,0,0,0,0,8
330,273,De Bruyne,MCI,M,£9.9m,0,30,0.0,0.0,0,0,0,0,0,0,0,1,0,0,0,0,43
