In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import sys, getopt
import csv
import os
import time

## Getting the URLs of All Season Fixtures

In [2]:
# Schedule page requested below; its first table body is parsed for fixtures.
ssn_url = 'https://fbref.com/en/comps/9/3232/schedule/2019-2020-Premier-League-Scores-and-Fixtures'

# Fail fast on HTTP errors instead of parsing an error page and hitting an
# opaque IndexError further down; the timeout keeps a stalled connection from
# hanging the notebook indefinitely.
res = requests.get(ssn_url, timeout=30)
res.raise_for_status()

# Strip the HTML comment delimiters before parsing, so markup wrapped in
# comments is parsed like the rest of the page.
comm = re.compile("<!--|-->")
soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

# The first <tbody> on the page is used as the fixtures table.
all_tables = soup.find_all("tbody")
if not all_tables:
    raise ValueError(
        "No <tbody> found at {}; the page layout may have changed.".format(ssn_url))
target = all_tables[0]
rows = target.find_all('tr')

In [3]:
features_wanted = ['squad_a', 'squad_b']

# Column-oriented containers consumed by the following cells. Keys are created
# up front so an empty schedule still yields frames with the expected columns.
matches_dict = {"gameweek": []}
for f in features_wanted:
    matches_dict[f] = []
urls_dict = {"report_url": []}


def _cell_text(row, tag, stat):
    """Return the stripped text of the `tag` cell whose data-stat is `stat`.

    Returns "" when the row has no such cell, so callers can treat a missing
    cell and an empty cell the same way.
    """
    cell = row.find(tag, {"data-stat": stat})
    return cell.text.strip() if cell is not None else ""


for row in rows:
    values = {"gameweek": _cell_text(row, "th", "gameweek")}
    for f in features_wanted:
        values[f] = _cell_text(row, "td", f)

    # Rows with any empty field are not fixtures (the next cell drops them
    # anyway). Skipping them here keeps matches_dict and urls_dict in
    # lock-step, which the later index-based join relies on.
    if not all(values.values()):
        continue

    for key, text in values.items():
        matches_dict[key].append(text)

    # Record exactly one URL per fixture: the first link in the match-report
    # cell, or None when there is no link. Appending only when a link exists
    # (or once per link) would shift every later URL onto the wrong fixture.
    report_cell = row.find("td", {"data-stat": "match_report"})
    link = report_cell.find('a', href=True) if report_cell is not None else None
    if link is not None:
        urls_dict["report_url"].append('https://fbref.com' + link['href'])
    else:
        urls_dict["report_url"].append(None)

In [4]:
# Build the fixtures frame: empty strings are treated as missing values so
# that incomplete rows are dropped, then the index is renumbered from 0.
nan_value = float("NaN")
matches_df_init = (
    pd.DataFrame.from_dict(matches_dict)
    .replace("", nan_value)
    .dropna()
    .reset_index(drop=True)
)
len(matches_df_init)

380

In [5]:
# Build the report-URL frame and attach it to the fixtures frame.
urls_df = pd.DataFrame.from_dict(urls_dict)

# DataFrame.join aligns on the index (row position here). A row-count mismatch
# means at least one fixture would receive the wrong URL or none at all, so
# stop with a clear error rather than writing a silently corrupted file.
if len(urls_df) != len(matches_df_init):
    raise ValueError(
        "Expected one report URL per fixture, got {} URLs for {} fixtures".format(
            len(urls_df), len(matches_df_init)))

matches_df = matches_df_init.join(urls_df)
len(matches_df)

380

In [8]:
# Create the output folder first: to_csv does not create missing directories,
# so this cell would otherwise fail on a fresh checkout.
os.makedirs("data", exist_ok=True)
matches_df.to_csv("data/match_urls.csv", index=False)

## Get Game-by-Game Data for Players

In [2]:
# Load the fixture list (gameweek, squad_a, squad_b, report_url) from the CSV
# written above, so this section can run without re-scraping the schedule page.
urls = pd.read_csv("data/match_urls.csv")

In [4]:
# Per-player stats to read from each row; each name is the `data-stat`
# attribute of the corresponding <td>.
features_wanted = ['shirtnumber', 'position', 'minutes', 'goals', 'assists',
                   'shots_total', 'shots_on_target', 'cards_yellow', 'cards_red', 'xg', 'xa']

# Match-level columns added to every player row.
extra_info = ['team', 'opponent', 'gw']

# Positions, within the page's list of <tbody> elements, of the tables used
# for the home and away sides (indices taken from the page's HTML).
home_table_index = 0
away_table_index = 7

# Strips HTML comment delimiters; gets around comments breaking the parsing.
# Compiled once here rather than on every loop iteration.
comm = re.compile("<!--|-->")


def _parse_player_table(table, features, team, opponent, gw):
    """Turn one team's stats <tbody> into a DataFrame with one row per player.

    Parameters
    ----------
    table : bs4 Tag for the <tbody> whose <tr> rows hold the players.
    features : list of `data-stat` names to read from each row's <td> cells.
    team, opponent, gw : values written to the 'team', 'opponent' and 'gw'
        columns of every returned row.

    Returns
    -------
    DataFrame with columns ['player'] + features + ['team', 'opponent', 'gw'].
    Rows without a player cell are skipped; a missing stat cell becomes "".
    An empty table yields an empty frame with the same columns.
    """
    records = []
    for player_row in table.find_all('tr'):
        name_cell = player_row.find("th", {"data-stat": "player"})
        if name_cell is None:
            continue
        record = {"player": name_cell.text.strip()}
        for f in features:
            stat_cell = player_row.find("td", {"data-stat": f})
            record[f] = stat_cell.text.strip() if stat_cell is not None else ""
        records.append(record)

    # Build the frame once per table instead of once per row.
    team_df = pd.DataFrame(records, columns=['player'] + list(features))
    team_df['team'] = team
    team_df['opponent'] = opponent
    team_df['gw'] = gw
    return team_df


# Collect one frame per match and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
match_frames = []

for _, match in urls.iterrows():
    # get the url and keep track of home, away, gw
    home = match['squad_a']
    away = match['squad_b']
    gw = match['gameweek']
    url = match['report_url']

    # Fail fast on HTTP errors instead of parsing an error page; the timeout
    # keeps a stalled connection from hanging the whole scrape.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    # Pause between requests (presumably to avoid hammering the site).
    time.sleep(5)
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    all_tables = soup.find_all("tbody")
    if len(all_tables) <= max(home_table_index, away_table_index):
        raise ValueError(
            "Expected at least {} <tbody> tables at {}, found {}".format(
                max(home_table_index, away_table_index) + 1, url, len(all_tables)))

    home_df = _parse_player_table(all_tables[home_table_index], features_wanted, home, away, gw)
    away_df = _parse_player_table(all_tables[away_table_index], features_wanted, away, home, gw)

    # combine both sides of this match, dropping exact duplicate rows
    temp = pd.concat([home_df, away_df]).drop_duplicates().reset_index(drop=True)
    match_frames.append(temp)

if match_frames:
    players_raw = pd.concat(match_frames, ignore_index=True)
else:
    # No fixtures to scrape: keep an empty frame with the expected columns.
    players_raw = pd.DataFrame(columns=['player'] + features_wanted + extra_info)

In [13]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs("data/players", exist_ok=True)
players_raw.to_csv("data/players/players_raw.csv", encoding="utf-8-sig", index=False)

In [14]:
# Preview the scraped player rows (a bare last expression is rendered as the cell output).
players_raw

Unnamed: 0,player,shirtnumber,position,minutes,goals,assists,shots_total,shots_on_target,cards_yellow,cards_red,xg,xa,team,opponent,gw
0,Roberto Firmino,9,FW,85,0,1,4,1,0,0,0.8,0.4,Liverpool,Norwich City,1
1,James Milner,7,"CM,FW",5,0,0,0,0,0,0,0.0,0.0,Liverpool,Norwich City,1
2,Divock Origi,27,LW,73,1,0,1,1,0,0,0.2,0.0,Liverpool,Norwich City,1
3,Sadio Mané,10,LW,17,0,0,0,0,0,0,0.0,0.0,Liverpool,Norwich City,1
4,Mohamed Salah,11,RW,90,1,1,2,1,0,0,0.3,0.2,Liverpool,Norwich City,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10610,Virgil van Dijk,4,CB,90,1,0,2,2,0,0,0.1,0.0,Liverpool,Newcastle Utd,38
10611,Joe Gomez,12,CB,90,0,0,0,0,0,0,0.0,0.0,Liverpool,Newcastle Utd,38
10612,Neco Williams,76,RB,84,0,0,1,0,0,0,0.0,0.0,Liverpool,Newcastle Utd,38
10613,Trent Alexander-Arnold,66,RB,6,0,0,0,0,0,0,0.0,0.0,Liverpool,Newcastle Utd,38
