In [1]:
import re
import requests
import pandas as pd
import codecs
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Understat data 

Here we iterate through the `<script>` tags on the understat page (which store all the data), until we get to the `playersData` variable, from here we isolate the data (using a regex) before decoding the JSON data and converting it to a dataframe.

Credit to [this reddit post](https://www.reddit.com/r/FantasyPL/comments/b3e3lg/a_python_package_for_understat/) for laying out how to scrape this data.

In [2]:
player_data = pd.DataFrame()

base_url = url = "https://understat.com/league/"

# different extensions for different leagues 
# (this will help us capture players that moved to PL from another big european League)
# note the new league had started for Ligue 1 when running this
leagues = ['EPL', 'La_liga', 'Bundesliga', 'Serie_A', 'Ligue_1']

for league in leagues:
    print(f"Getting data for {league}")
    response = requests.get(f"{url}{league}/2019")

    soup = BeautifulSoup(response.text, "html.parser")
    script_tags = soup.findAll('script')

    # get the playerData
    for tag in script_tags:
        jason_data_regex = r"(JSON.parse\(')(.*)('\);)"
        if  tag.contents != []:
            if "var playersData" in tag.contents[0]:
                match = re.findall(jason_data_regex, tag.contents[0])
                byte_data = codecs.escape_decode(match[0][1])
                json_data = json.loads(byte_data[0].decode("utf-8"))

                league_data = pd.DataFrame(json_data)
                league_data['league'] = league
                player_data = pd.concat([player_data, league_data])

player_data.to_pickle('data/understat_player_data.pkl')
print('Done!')

Getting data for EPL
Getting data for La_liga
Getting data for Bundesliga
Getting data for Serie_A
Getting data for Ligue_1
Done!


In [3]:
player_data.head()

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,league
0,755,Jamie Vardy,35,3034,23,18.903537318110462,5,6.3682975601404905,89,32,3,0,F S,Leicester,19,15.097693115472794,21.02660731226206,1.7243406660854816,EPL
1,318,Pierre-Emerick Aubameyang,36,3143,22,16.352623080834746,3,4.492486916482449,93,26,3,1,F M S,Arsenal,20,14.830358987674115,19.96428203582764,5.339657470583916,EPL
2,986,Danny Ings,38,2836,22,15.659717170521615,2,2.8490850934758782,93,35,3,0,F M S,Southampton,21,14.137379484251142,18.48803149908781,5.015938125550747,EPL
3,618,Raheem Sterling,33,2678,20,19.799906481057405,1,7.208586284890771,100,48,5,0,F M S,Manchester City,20,18.27756874635816,31.4420103430748,10.185997404158115,EPL
4,1250,Mohamed Salah,34,2904,19,20.66331870108843,10,8.72604252398014,132,60,1,0,F S,Liverpool,16,18.37981212884188,31.37419793009758,8.42502685263753,EPL


# Fantasy football data 

nothing too mad here, we simply pull the `players_raw.csv` data from [vastav's Fantasy-Premier-League repo](https://github.com/vaastav/Fantasy-Premier-League).

In [4]:
url = "https://github.com/vaastav/Fantasy-Premier-League/blob/master/data/2020-21/players_raw.csv"
url = f"{url}?raw=true" # notice that we need to add this to download as CSV

fantasy_football_df = pd.read_csv(url, index_col=0)
fantasy_football_df.to_pickle('data/fantasy_football_df.pkl')
fantasy_football_df.head()

Unnamed: 0_level_0,bonus,bps,chance_of_playing_next_round,chance_of_playing_this_round,clean_sheets,code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,...,threat_rank_type,total_points,transfers_in,transfers_in_event,transfers_out,transfers_out_event,value_form,value_season,web_name,yellow_cards
assists,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1,256,,,5,37605,0,0,0,0,...,95,53,0,0,0,0,0.0,7.6,Özil,1
0,5,305,,,4,39476,0,0,0,0,...,64,57,0,0,0,0,0.0,11.4,Sokratis,6
1,10,494,,,8,41270,0,0,0,0,...,32,94,0,0,0,0,0.0,17.1,David Luiz,5
5,37,807,,,10,54694,0,0,0,0,...,3,205,0,0,0,0,0.0,17.1,Aubameyang,3
1,3,286,,,4,58822,0,0,0,0,...,63,61,0,0,0,0,0.0,12.2,Cédric,1
