In [1]:
import sqlite3
from pprint import pprint
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import logging
from tqdm.notebook import tqdm
from lxml import etree
from io import StringIO
from urllib.parse import urljoin


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import TimeoutException

In [13]:
class FBRefDriver():
    """Rate-limited Selenium Chrome driver with scraping helpers for FBRef.com.

    Intended to be used as a context manager so the browser session is shut
    down when the crawl finishes or fails::

        with FBRefDriver() as d:
            links = d.get_team_season_links("2021-2022")

    Args:
        request_delay: Minimum number of seconds between the start of two
            consecutive page loads issued through :meth:`get`.
    """

    # Seconds to wait for an element to appear before a lookup attempt fails.
    WAIT_TIMEOUT = 30
    # Number of lookup attempts made by get_tree_by_id before giving up.
    MAX_ATTEMPTS = 3

    # Team match-log columns that are discarded before saving.
    _TEAM_DROP_COLUMNS = [
        'Attendance', 'Captain', 'Formation', 'Referee', 'Match Report', 'Notes',
    ]
    # Team match-log columns converted to numbers when present.
    _TEAM_NUMERIC_COLUMNS = frozenset(['GF', 'GA', 'xG', 'xGA', 'Poss'])

    # Player match-log columns that are discarded before saving.
    _PLAYER_DROP_COLUMNS = frozenset([
        "CrdY", "CrdR", "Press", "Tkl", "Int", "Blocks", "Cmp", "Att", "Cmp%",
        "Prog", "Carries", "Succ", "Match Report", "PKA", "PKsv", "PKm",
        "Thr", "Launch%", "AvgLen", "Opp", "Stp", "Stp%", "#OPA", "AvgDist",
        "Fls", "Fld", "Off", "Crs", "TklW", "OG", "PKwon", "PKcon",
    ])
    # Player match-log columns converted to numbers when present.
    _PLAYER_NUMERIC_COLUMNS = frozenset([
        'Min', 'Gls', 'Ast', 'Pk', "PKatt", "Sh", "SoT", "Touches", "xG",
        "npxG", "xA", "SCA", "GCA", "SoTA", "GA", "Saves", "Save%", "CS",
        "PSxG",
    ])

    def __init__(self, request_delay=3):
        chrome_options = webdriver.ChromeOptions()
        # Turn image loading off in Blink; only the HTML is parsed.
        chrome_options.add_argument('--blink-settings=imagesEnabled=false')

        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = "https://fbref.com"
        self.request_delay = request_delay
        # Timestamp (time.time()) of the most recent page load; 0 = never.
        self.last_visit = 0

    def absolute_url(self, relative_url):
        """Resolve ``relative_url`` against ``self.base_url``."""
        return urljoin(self.base_url, relative_url)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the session open.
        self.driver.quit()

    def get(self, url):
        """Load ``url`` in the browser, sleeping first if the previous load
        started less than ``request_delay`` seconds ago.

        Returns:
            Whatever ``self.driver.get`` returns.
        """
        remaining = self.request_delay - (time.time() - self.last_visit)
        if remaining >= 0:
            logging.info(f"sleep for {remaining}")
            time.sleep(remaining)
        self.last_visit = time.time()

        return self.driver.get(url)

    def get_tree_by_id(self, id):
        """Wait for the element with the given DOM id and parse its inner HTML.

        Each attempt waits up to ``WAIT_TIMEOUT`` seconds; after a timeout the
        page is refreshed and the lookup is retried, up to ``MAX_ATTEMPTS``
        attempts in total.

        Args:
            id: Value of the element's ``id`` attribute.

        Returns:
            An lxml tree built from the element's ``innerHTML``.

        Raises:
            RuntimeError: If every attempt timed out. The last
                ``TimeoutException`` is attached as the cause.
        """
        last_error = None
        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                element = WebDriverWait(self.driver, timeout=self.WAIT_TIMEOUT).until(
                    ec.presence_of_element_located((By.ID, id))
                )
                inner_html = element.get_attribute('innerHTML')
                return etree.parse(StringIO(inner_html), etree.HTMLParser())
            except TimeoutException as e:
                last_error = e
                if attempt < self.MAX_ATTEMPTS:
                    logging.warning("Webdriver Timeout. Retrying...")
                    # Refreshing is pointless once no attempt is left.
                    self.driver.refresh()
        raise RuntimeError(f"Failed {self.MAX_ATTEMPTS} retries!") from last_error

    def get_table_text(self, cell):
        """Return the text to store for one table cell element.

        A cell without child elements yields its own text. Otherwise the text
        of the first direct ``<a>`` child is used, then the first
        ``<span class="venuetime">`` child, and finally the cell's own text.
        """
        if len(cell) == 0:
            return cell.text
        for query in ("./a", './span[@class="venuetime"]'):
            matches = cell.xpath(query)
            if matches:
                return matches[0].text
        return cell.text

    def get_table_df_by_id(self, id):
        """Read the table inside the element with DOM id ``id`` into a DataFrame.

        Header and body rows that carry a ``class`` attribute are skipped.
        All values are the raw cell texts (strings or ``None``).
        """
        tree = self.get_tree_by_id(id)

        columns = [th.text for th in tree.xpath('*/thead/tr[not (@class)]/th')]
        content = [
            [self.get_table_text(cell) for cell in tr.xpath(".//*[self::th or self::td]")]
            for tr in tree.xpath('*/tbody/tr[not (@class)]')
        ]
        return pd.DataFrame(content, columns=columns)

    def get_team_season_links(self, season):
        """Return ``(team_name, absolute_url)`` pairs found in the first data
        column of the season's overall results table.

        Args:
            season: Season label as used in FBRef URLs, e.g. ``"2021-2022"``.
        """
        relative_url = f"/en/comps/9/{season}/{season}-Premier-League-Stats"
        url = self.absolute_url(relative_url)
        logging.info(f"Crawling Season Result:\t{season}\t{url}")
        self.get(url)
        tree = self.get_tree_by_id(f'results{season}91_overall')
        links = tree.xpath('*/tbody/tr/td[1]/a')
        return [(l.text, self.absolute_url(l.get("href"))) for l in links]

    def get_player_season_links(self, season):
        """Return ``(player, position, absolute_url)`` triples from the
        season's ``stats_standard`` table.

        The URL is taken from the link in each row's last cell. For rows whose
        position text is exactly ``'GK'`` the ``"summary"`` part of the URL is
        swapped for ``"keeper"``.

        Args:
            season: Season label as used in FBRef URLs, e.g. ``"2021-2022"``.
        """
        relative_url = f"/en/comps/9/{season}/stats/{season}-Premier-League-Stats"
        url = self.absolute_url(relative_url)
        logging.info(f"Crawling Season Result:\t{season}\t{url}")
        self.get(url)
        tree = self.get_tree_by_id('stats_standard')
        rows = tree.xpath('*/tbody/tr[not(@class)]')
        player_season_links = []
        for r in rows:
            player = r.xpath('./td[1]/a')[0].text
            pos = r.xpath('./td[3]')[0].text
            relative_url = r.xpath('./td[last()]/a')[0].get("href")

            if pos == 'GK':
                relative_url = relative_url.replace("summary", "keeper")
            player_season_links.append((player, pos, self.absolute_url(relative_url)))
        return player_season_links

    def get_team_match_log(self, season, team, link):
        """Crawl one team's match log page and return it as a cleaned DataFrame.

        Args:
            season: Season label stored in the ``Season`` column.
            team: Team name stored in the ``Team`` column.
            link: Absolute URL of the match log page; stored in ``Link``.

        Returns:
            The ``matchlogs_for`` table without the descriptive columns in
            ``_TEAM_DROP_COLUMNS``, with score/xG/possession columns numeric
            and ``Season``, ``Team`` and ``Link`` columns appended.
        """
        logging.info(f"Crawling Match Log:\t{season} {team}\t{link}")
        self.get(link)
        match_log_df = self.get_table_df_by_id('matchlogs_for')

        # errors='ignore': tolerate tables that lack some of these columns,
        # matching how the player match log treats its optional columns.
        match_log_df = match_log_df.drop(columns=self._TEAM_DROP_COLUMNS, errors='ignore')
        # Strip a trailing parenthesised number such as "1 (4)" -> "1"
        # (presumably a shoot-out score -- confirm). regex=True is required:
        # pandas >= 2.0 treats the pattern as a literal string by default.
        for col in ('GF', 'GA'):
            match_log_df[col] = match_log_df[col].str.replace(r" \(\d+\)", "", regex=True)
        numeric_columns = [c for c in match_log_df.columns if c in self._TEAM_NUMERIC_COLUMNS]
        match_log_df[numeric_columns] = match_log_df[numeric_columns].apply(pd.to_numeric)
        match_log_df["Season"] = season
        match_log_df["Team"] = team
        match_log_df["Link"] = link

        return match_log_df

    def get_player_match_log(self, season, player, pos, link):
        """Crawl one player's match log page and return it as a cleaned DataFrame.

        Args:
            season: Season label stored in the ``Season`` column.
            player: Player name stored in the ``Player`` column.
            pos: Fallback position used where the table's ``Pos`` is missing.
            link: Absolute URL of the match log page; stored in ``Link``.

        Returns:
            The ``matchlogs_all`` table without the columns in
            ``_PLAYER_DROP_COLUMNS``, with the stat columns numeric, ``Start``
            encoded as 1/0, ``Save%`` renamed to ``SavePCT`` and ``Season``,
            ``Player`` and ``Link`` columns appended.
        """
        logging.info(f"Crawling Match Log:\t{season} {player}\t{link}")
        self.get(link)
        match_log_df = self.get_table_df_by_id('matchlogs_all')

        # Only columns actually present are dropped/converted, because the
        # set of columns differs between pages (e.g. keeper vs. summary).
        useless_cols = [c for c in match_log_df.columns if c in self._PLAYER_DROP_COLUMNS]
        match_log_df = match_log_df.drop(useless_cols, axis=1)
        numeric_columns = [c for c in match_log_df.columns if c in self._PLAYER_NUMERIC_COLUMNS]
        match_log_df[numeric_columns] = match_log_df[numeric_columns].apply(pd.to_numeric)
        # Any value containing 'Y' counts as a start (1); otherwise 0.
        match_log_df["Start"] = match_log_df["Start"].str.contains('Y').mul(1)
        match_log_df["Pos"] = match_log_df["Pos"].fillna(pos)
        match_log_df = match_log_df.rename(columns={"Save%":"SavePCT"})

        match_log_df["Season"] = season
        match_log_df["Player"] = player
        match_log_df["Link"] = link

        return match_log_df


# Route log records of level INFO and above to scraper.log (opened in append
# mode, UTF-8 encoded), each line prefixed with timestamp and level.
_scraper_log_handler = logging.FileHandler('scraper.log', mode='a', encoding='utf-8')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[_scraper_log_handler],
)


## Crawl Team Match Logs

In [15]:
# Seasons to crawl, newest first: '2021-2022' down to '2012-2013'.
seasons = [f'{start}-{start + 1}' for start in range(2021, 2011, -1)]

logging.info("Initializing SQLite connection...")
conn = sqlite3.connect('fpl.db')
try:
    # On a fresh database the table does not exist yet, and reading from it
    # would raise before anything has been crawled. Check first, and do it
    # before launching Chrome so a database problem fails fast.
    table_exists = conn.execute(
        "select 1 from sqlite_master where type = 'table' and name = ? collate nocase",
        ('TEAM_MATCH_LOG',),
    ).fetchone() is not None
    if table_exists:
        crawled_df = pd.read_sql('select distinct TEAM, SEASON from TEAM_MATCH_LOG', conn)
    else:
        crawled_df = pd.DataFrame(columns=['TEAM', 'SEASON'])
    # (team, season) pairs already stored. Tuples follow the SELECT column
    # order, so the lookup does not depend on how column names are cased.
    crawled = set(crawled_df.itertuples(index=False, name=None))

    logging.info("Initializing Chrome Webdriver...")
    with FBRefDriver() as d:
        for s in tqdm(seasons, desc="Seasons crawled"):
            team_season_links = d.get_team_season_links(s)

            for team, link in tqdm(team_season_links, leave=False, desc="Teams crawled"):
                if (team, s) in crawled:
                    logging.warning(f"{s} {team}\tMatch Log already crawled.\t{link}")
                    continue

                match_log_df = d.get_team_match_log(s, team, link)

                logging.info(f"Saving Match Log:\t{s} {team}")
                match_log_df.to_sql('TEAM_MATCH_LOG', conn, if_exists='append', index=False)
                crawled.add((team, s))
finally:
    # Release the database file even when the crawl is interrupted.
    conn.close()
    

Seasons crawled:   0%|          | 0/10 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

Teams crawled:   0%|          | 0/20 [00:00<?, ?it/s]

## Crawl Player Match Logs

In [21]:
# Seasons to crawl, newest first: '2021-2022' down to '2012-2013'.
seasons = [f'{start}-{start + 1}' for start in range(2021, 2011, -1)]

logging.info("Initializing SQLite connection...")
conn = sqlite3.connect('fpl.db')
try:
    # On a fresh database the table does not exist yet, and reading from it
    # would raise before anything has been crawled. Check first, and do it
    # before launching Chrome so a database problem fails fast.
    table_exists = conn.execute(
        "select 1 from sqlite_master where type = 'table' and name = ? collate nocase",
        ('PLAYER_MATCH_LOG',),
    ).fetchone() is not None
    if table_exists:
        crawled_df = pd.read_sql('select distinct PLAYER, SEASON from PLAYER_MATCH_LOG', conn)
    else:
        crawled_df = pd.DataFrame(columns=['PLAYER', 'SEASON'])
    # (player, season) pairs already stored. A set gives constant-time
    # lookups and avoids growing a DataFrame one row at a time.
    crawled = set(crawled_df.itertuples(index=False, name=None))

    logging.info("Initializing Chrome Webdriver...")
    with FBRefDriver() as d:
        for s in tqdm(seasons, desc="Seasons crawled"):
            player_season_links = d.get_player_season_links(s)

            for player, pos, link in tqdm(player_season_links, leave=False, desc="Players crawled"):
                if (player, s) in crawled:
                    logging.warning(f"{s} {player}\tMatch Log already crawled.\t{link}")
                    continue

                match_log_df = d.get_player_match_log(s, player, pos, link)

                logging.info(f"Saving Match Log:\t{s} {player}")
                match_log_df.to_sql('PLAYER_MATCH_LOG', conn, if_exists='append', index=False)
                # Record the pair so a player listed again in the same
                # season's table is not crawled and stored twice.
                crawled.add((player, s))
finally:
    # Release the database file even when the crawl is interrupted.
    conn.close()

Seasons crawled:   0%|          | 0/10 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/546 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/532 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/522 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/508 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/529 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/543 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/561 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/548 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/561 [00:00<?, ?it/s]

Players crawled:   0%|          | 0/537 [00:00<?, ?it/s]