## Getting the URLs

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import os
import time

In [2]:
# fbref.com competition page for the 2019-2020 Premier League season (per the URL path);
# this is the page fetched below to discover the per-team pages.
pl_url = 'https://fbref.com/en/comps/9/3232/2019-2020-Premier-League-Stats'

In [3]:
# A timeout keeps a stalled connection from blocking the kernel indefinitely.
res = requests.get(pl_url, timeout=30)
# Surface HTTP errors (e.g. rate limiting) here instead of as a confusing
# IndexError/AttributeError when the expected tables are missing further down.
res.raise_for_status()
## Strip the HTML comment delimiters so that markup wrapped in <!-- ... -->
## is parsed as regular markup instead of being hidden inside comment nodes.
comm = re.compile("<!--|-->")
soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

In [4]:
# Collect every <tbody> on the page, in document order.
# `find_all` is the supported spelling; `findAll` is a deprecated alias.
all_tables = soup.find_all("tbody")

In [5]:
# Use the first <tbody> found on the page as the table to scan.
# NOTE(review): presumably this is the league standings table — confirm against the live page.
target = all_tables[0]
# `data-stat` attribute values of the <td> cells to read from each row.
features_wanted = ['squad']

In [6]:
rows = target.find_all('tr')

# Parallel lists: teams_urls[f][k] is the text of feature `f` for the team whose
# page is teams_urls['url'][k]. Pre-creating the lists guarantees the keys exist
# even if no row matches.
teams_urls = {f: [] for f in features_wanted}
teams_urls['url'] = []

for row in rows:
    for f in features_wanted:
        cell = row.find("td", {"data-stat": f})
        if cell is None:
            # Rows without this cell (e.g. header or spacer rows) carry no team.
            continue

        # Take exactly one link per cell so names and URLs stay index-aligned;
        # a cell with no link is skipped entirely for the same reason.
        # NOTE(review): this pairing assumes every feature in `features_wanted`
        # is a linked cell, which holds for the single 'squad' feature used here.
        link = cell.find('a', href=True)
        if link is None:
            continue

        teams_urls[f].append(cell.text.strip())
        # hrefs on the page are site-relative, so prefix the host.
        teams_urls['url'].append('https://fbref.com' + link['href'])

In [7]:
# Display the collected squad names and their team-page URLs.
teams_urls

{'squad': ['Liverpool',
  'Manchester City',
  'Manchester Utd',
  'Chelsea',
  'Leicester City',
  'Tottenham',
  'Wolves',
  'Arsenal',
  'Sheffield Utd',
  'Burnley',
  'Southampton',
  'Everton',
  'Newcastle Utd',
  'Crystal Palace',
  'Brighton',
  'West Ham',
  'Aston Villa',
  'Bournemouth',
  'Watford',
  'Norwich City'],
 'url': ['https://fbref.com/en/squads/822bd0ba/2019-2020/Liverpool-Stats',
  'https://fbref.com/en/squads/b8fd03ef/2019-2020/Manchester-City-Stats',
  'https://fbref.com/en/squads/19538871/2019-2020/Manchester-United-Stats',
  'https://fbref.com/en/squads/cff3d9bb/2019-2020/Chelsea-Stats',
  'https://fbref.com/en/squads/a2d435b3/2019-2020/Leicester-City-Stats',
  'https://fbref.com/en/squads/361ca564/2019-2020/Tottenham-Hotspur-Stats',
  'https://fbref.com/en/squads/8cec06e1/2019-2020/Wolverhampton-Wanderers-Stats',
  'https://fbref.com/en/squads/18bb7c10/2019-2020/Arsenal-Stats',
  'https://fbref.com/en/squads/1df6b87e/2019-2020/Sheffield-United-Stats',
  'h

## Get The Data for Each Team

In [8]:
# Maps squad name -> DataFrame of that team's match rows (one row per match).
teams_df = dict()

# Loop-invariant setup, hoisted out of the per-team loop.
comm = re.compile("<!--|-->")
features_wanted = ['comp', 'round', 'venue', 'result', 'goals_for', 'goals_against', 'opponent', 'xg_for', 'xg_against']

# Pair each squad name with its URL directly instead of tracking a manual index.
for squad, url in zip(teams_urls['squad'], teams_urls['url']):
    # A timeout keeps a stalled connection from blocking the kernel indefinitely.
    res = requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    # Pause between requests (presumably to stay under the site's rate limit).
    time.sleep(10)
    # Strip comment delimiters so markup wrapped in <!-- ... --> is parsed too.
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')

    all_tables = soup.find_all("tbody")
    # NOTE(review): the second <tbody> is assumed to be the match-results table;
    # confirm against the live page, since this depends on page layout.
    results_table = all_tables[1]

    pre_df_results = {f: [] for f in features_wanted}

    for row in results_table.find_all('tr'):
        cells = [row.find("td", {"data-stat": f}) for f in features_wanted]
        if any(cell is None for cell in cells):
            # Rows missing any wanted cell (e.g. spacer/header rows) are skipped
            # whole, so every column keeps the same length.
            continue
        for f, cell in zip(features_wanted, cells):
            # `.text` is already a str; no encode/decode round trip is needed.
            pre_df_results[f].append(cell.text.strip())

    teams_df[squad] = pd.DataFrame.from_dict(pre_df_results)

In [10]:
# Keep only Premier League matches, derive an integer gameweek column `gw`
# from the trailing number of `round`, drop `round`, and order rows by gameweek.
for team in teams_df:
    if 'round' not in teams_df[team].columns:
        # Already processed on a previous run of this cell; keeps the cell re-runnable.
        continue

    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the original (avoids SettingWithCopyWarning).
    league = teams_df[team][teams_df[team]['comp'] == 'Premier League'].copy()

    # Parse the trailing digits (e.g. "Matchweek 7" -> 7, "Matchweek 12" -> 12)
    # instead of relying on the exact string length.
    league['gw'] = league['round'].str.extract(r'(\d+)\s*$', expand=False).astype(int)

    teams_df[team] = (
        league
        .drop(columns=['round'])
        .sort_values('gw')
        .reset_index(drop=True)
    )

In [12]:
# Display the stored table for one team as a spot check.
teams_df['Manchester City']

Unnamed: 0,comp,venue,result,goals_for,goals_against,opponent,xg_for,xg_against,gw
0,Premier League,Away,W,5,0,West Ham,3.0,0.8,1
1,Premier League,Home,D,2,2,Tottenham,2.9,0.2,2
2,Premier League,Away,W,3,1,Bournemouth,2.2,1.4,3
3,Premier League,Home,W,4,0,Brighton,1.9,0.5,4
4,Premier League,Away,L,2,3,Norwich City,2.2,1.6,5
5,Premier League,Home,W,8,0,Watford,5.8,0.3,6
6,Premier League,Away,W,3,1,Everton,3.3,2.4,7
7,Premier League,Home,L,0,2,Wolves,1.4,1.7,8
8,Premier League,Away,W,2,0,Crystal Palace,2.2,0.5,9
9,Premier League,Home,W,3,0,Aston Villa,2.6,0.9,10


In [13]:
# Write the data to csv files: one file per team, named after the squad.
out_dir = os.path.join("data", "teams")
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)
for team in teams_df:
    teams_df[team].to_csv(os.path.join(out_dir, team + ".csv"), index=False)