In [1]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup
from lxml import html
import requests
import cssselect

import pandas as pd
import numpy as np
import re
from itertools import cycle

from time import sleep, time
from random import randint

from IPython.core.display import clear_output


In [7]:
# https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/
def get_proxies(max_rows=50, timeout=10):
    """Scrape a list of proxies from https://free-proxy-list.net/.

    Only table rows whose seventh cell contains the text "yes" are kept
    (presumably the site's HTTPS-support column -- confirm against the
    live table).

    Args:
        max_rows: Number of leading table rows to inspect (default 50).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A de-duplicated list of "ip:port" strings, in no particular order.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-2xx HTTP status.
    """
    url = 'https://free-proxy-list.net/'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()
    parser = html.fromstring(response.text)

    proxies = set()
    for row in parser.xpath('//tbody/tr')[:max_rows]:
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grabbing IP and corresponding PORT; skip rows where either cell is empty.
        ip = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        if ip and port:
            proxies.add(":".join([ip[0], port[0]]))
    return list(proxies)


In [8]:
def create_browser(proxies):
    """Start a Firefox WebDriver that routes traffic through a random proxy.

    Args:
        proxies: Non-empty sequence of "ip:port" strings.

    Returns:
        A ``webdriver.Firefox`` instance whose profile uses the chosen proxy
        for both HTTP and SSL traffic. The caller is responsible for
        shutting it down.

    Raises:
        ValueError: If ``proxies`` is empty or the chosen entry has a
            non-numeric port.
    """
    if len(proxies) == 0:
        raise ValueError("create_browser() needs at least one 'ip:port' proxy")

    proxy_port = proxies[randint(0, len(proxies) - 1)]
    parts = proxy_port.split(':')
    proxy = parts[0]
    # Firefox's network.proxy.*_port preferences are integer prefs; writing the
    # port as a string stores a value of the wrong type.
    port = int(parts[1])

    profile = webdriver.FirefoxProfile()
    # 1 == manual proxy configuration.
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", proxy)
    profile.set_preference("network.proxy.http_port", port)
    profile.set_preference("network.proxy.ssl", proxy)
    profile.set_preference("network.proxy.ssl_port", port)
    profile.update_preferences()

    return webdriver.Firefox(firefox_profile=profile)


In [2]:
def team_links(url):
    """Collect the squad-page and stats-page URLs for every club listed at ``url``.

    The page is rendered in a (non-proxied) Firefox instance and every
    ``<td class="team">`` cell is inspected. The link inside each cell has
    "overview" swapped for "squad" / "stats" and is prefixed with "https:".

    Args:
        url: Address of the clubs listing page,
            e.g. 'https://www.premierleague.com/clubs'.

    Returns:
        A tuple ``(TeamSquadLinks, TeamStatsLinks)`` of two parallel lists
        of URL strings.
    """
    TeamSquadLinks = []
    TeamStatsLinks = []

    browser = webdriver.Firefox()
    try:
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, "lxml")
    finally:
        # Always end the WebDriver session, even if loading or parsing fails,
        # so no Firefox/geckodriver process is left behind.
        browser.quit()

    for team in soup.find_all('td', class_='team'):
        anchor = team.a
        # Skip cells that carry no link instead of crashing on them.
        if anchor is None or not anchor.get('href'):
            continue
        overview = anchor['href']
        TeamSquadLinks.append("https:" + overview.replace("overview", "squad"))
        TeamStatsLinks.append("https:" + overview.replace("overview", "stats"))

    return TeamSquadLinks, TeamStatsLinks

In [3]:
def player_Links(TeamSquadLinks, proxies=None):
    """Collect overview and stats URLs for every player on the given squad pages.

    Each squad page is opened in a proxied Firefox instance and every
    ``<a class="playerOverviewCard active">`` link is recorded.

    Args:
        TeamSquadLinks: Iterable of squad-page URLs.
        proxies: Optional list of "ip:port" strings handed to
            ``create_browser``. When omitted, the module-level ``proxies``
            variable is used, which is what this function relied on before
            the parameter existed.

    Returns:
        A tuple ``(playerLinkOverview, playerLinkStats)`` of two parallel
        lists of URL strings; the stats URL is the overview URL with
        "overview" replaced by "stats".

    Raises:
        NameError: If ``proxies`` is omitted and no module-level ``proxies``
            exists.
    """
    if proxies is None:
        # Backward compatibility with callers that pass only the squad links.
        try:
            proxies = globals()["proxies"]
        except KeyError:
            raise NameError("name 'proxies' is not defined") from None

    playerLinkOverview = []
    playerLinkStats = []

    # For each team link page...
    for teamUrl in TeamSquadLinks:
        browser = create_browser(proxies)
        try:
            browser.get(teamUrl)
            soup = BeautifulSoup(browser.page_source, "lxml")
        finally:
            # Release the browser even when the page fails to load.
            browser.quit()

        for player in soup.find_all('a', class_='playerOverviewCard active'):
            href = player.get('href')
            if not href:
                continue
            overview_url = "https://www.premierleague.com/" + href
            playerLinkStats.append(overview_url.replace("overview", "stats"))
            playerLinkOverview.append(overview_url)

    return playerLinkOverview, playerLinkStats

In [4]:
def _clean_stat_value(raw_value):
    """Normalise a scraped value token, keeping it numeric where possible."""
    # Drop separators such as ',' and '%' but keep '.' and '-' for now.
    candidate = re.sub(r"[^\w.\-]", "", raw_value)
    # Only keep the decimal point / minus sign when the result really is a
    # number; anything else falls back to stripping every non-word character.
    if re.fullmatch(r"-?\d+(?:\.\d+)?", candidate):
        return candidate
    return re.sub(r"\W", "", raw_value)


def process_stat(tempstat):
    """Split scraped text such as "Goals per match 0.45" into (name, value).

    The text is trimmed and split at its last run of whitespace. The left
    part becomes the stat name with every non-word character (including
    spaces) removed; the right part becomes the value.

    Args:
        tempstat: Raw text of a stat element.

    Returns:
        A tuple ``(stat, value)`` of strings. Thousands separators and
        percent signs are removed from the value, while a decimal point or
        leading minus sign of a numeric value is preserved. If the text
        holds a single token, the value is ``""``; if the text is blank,
        both elements are ``""``.
    """
    # rsplit(None, 1) splits on any whitespace, so a name and value separated
    # only by a newline or tab are still told apart.
    parts = tempstat.strip().rsplit(None, 1)
    if not parts:
        return "", ""

    stat = re.sub(r"\W", "", parts[0])
    if len(parts) == 1:
        return stat, ""

    return stat, _clean_stat_value(parts[1])


def process_overview(tempstat):
    """Split overview text into (name, value).

    Behaves exactly like ``process_stat``; kept as a separate name so
    existing callers keep working.
    """
    return process_stat(tempstat)

In [5]:
def team_statistics(TeamStatsLinks, seasons, proxies):
    """Scrape the stats page of every team for every season.

    For each (season, team) pair the page ``<teamUrl>?se=<season_id>`` is
    opened in a proxied Firefox instance, and every ``div.topStat`` and
    ``div.normalStat`` element is turned into a column via ``process_stat``.

    Args:
        TeamStatsLinks: Iterable of team stats-page URLs; each must contain
            a ``/<digits>/`` path segment, which is used as the team id.
        seasons: Mapping of season label to season id.
        proxies: List of "ip:port" strings handed to ``create_browser``.

    Returns:
        A DataFrame with one row per (season, team).

    Side effects:
        After each season, writes every row collected so far (not only that
        season's rows) to ``data/teamStats_<season>.csv``. Sleeps 1-5
        seconds between page loads.
    """
    import os

    # Make sure the output directory exists before spending time scraping.
    os.makedirs('data', exist_ok=True)

    teamStatsDF = pd.DataFrame()
    t = 0
    for season, season_id in seasons.items():
        for teamUrl in TeamStatsLinks:
            url = teamUrl + "?se=" + str(season_id)
            browser = create_browser(proxies)
            try:
                browser.get(url)
                soup = BeautifulSoup(browser.page_source, "lxml")
            finally:
                # Release the browser even when the page fails to load.
                browser.quit()

            teamName = soup.find_all('title')[0].text
            teamName = teamName.replace(' Statistics | Premier League', "")
            TeamID = re.findall(r'\/(\d+)\/', teamUrl)[0]

            teamStatsDF.at[t, 'Season'] = season
            teamStatsDF.at[t, 'Season_Id'] = season_id
            teamStatsDF.at[t, 'TeamID'] = TeamID.strip()
            teamStatsDF.at[t, 'teamName'] = teamName.strip()

            # Both stat groups are processed identically. The second pass used
            # to re-read the topStat list, so normalStat values were never
            # recorded.
            for stat_class in ('topStat', 'normalStat'):
                for stats in soup.find_all('div', class_=stat_class):
                    if stats.span is None:
                        continue
                    tempStat, tempValue = process_stat(str(stats.span.text))
                    teamStatsDF.at[t, tempStat] = tempValue.strip()

            t = t + 1
            # Random pause between page loads.
            sleep(randint(1, 5))

        teamStatsDF.to_csv('data/teamStats_' + season + '.csv', index=False)

    return teamStatsDF

In [6]:
def player_statistics(playerLinkStats, seasons, proxies):
    """Scrape the stats page of every player for every season.

    For each (season, player) pair the page ``<playerUrl>?se=<season_id>``
    is opened in a proxied Firefox instance. Columns are built from the
    page's ``div.label`` / ``div.info`` pairs and from every ``div.topStat``
    and ``div.normalStat`` element (via ``process_stat``).

    Args:
        playerLinkStats: Iterable of player stats-page URLs; each must
            contain a ``/<digits>/`` path segment, which is used as the
            player id.
        seasons: Mapping of season label to season id.
        proxies: List of "ip:port" strings handed to ``create_browser``.

    Returns:
        A DataFrame with one row per (season, player).

    Side effects:
        After each season, writes every row collected so far (not only that
        season's rows) to ``data/playerStats_<season>.csv``. Sleeps 1-5
        seconds between page loads.
    """
    import os

    # Make sure the output directory exists before spending time scraping.
    os.makedirs('data', exist_ok=True)

    playerStatsDF = pd.DataFrame()
    t = 0
    for season, season_id in seasons.items():
        for playerUrl in playerLinkStats:
            url = playerUrl + "?se=" + str(season_id)
            browser = create_browser(proxies)
            try:
                browser.get(url)
                soup = BeautifulSoup(browser.page_source, "lxml")
            finally:
                # Release the browser even when the page fails to load.
                browser.quit()

            playerName = soup.find_all('title')[0].text
            playerName = playerName.replace(' Statistics | Premier League', "")
            playerID = re.findall(r'\/(\d+)\/', playerUrl)[0]

            playerStatsDF.at[t, 'Season'] = season
            playerStatsDF.at[t, 'Season_Id'] = season_id
            playerStatsDF.at[t, 'PlayerName'] = playerName.strip()
            playerStatsDF.at[t, 'PlayerID'] = playerID.strip()

            # Pair the i-th label with the i-th info block. zip() stops at the
            # shorter list, so a page with more labels than infos no longer
            # raises IndexError.
            infos = soup.find_all('div', class_='info')
            labels = soup.find_all('div', class_='label')
            for label, info in zip(labels, infos):
                field = label.text.strip()
                playerStatsDF.at[t, field] = info.text.strip()

            for stat_class in ('topStat', 'normalStat'):
                for stats in soup.find_all('div', class_=stat_class):
                    if stats.span is None:
                        continue
                    tempStat, tempValue = process_stat(str(stats.span.text))
                    playerStatsDF.at[t, tempStat] = tempValue.strip()

            t = t + 1
            # Random pause between page loads.
            sleep(randint(1, 5))

        playerStatsDF.to_csv('data/playerStats_' + season + '.csv', index=False)

    return playerStatsDF

In [None]:
if __name__ == "__main__":
    import os

    # Season label -> season id appended to stats URLs as "?se=<id>".
    # NOTE(review): -1 presumably selects all seasons combined -- confirm.
    seasons = {
        "epl_all":-1,
        "epl_2018_19":210,
        "epl_2017_18":79,
        "epl_2016_17":54,
        "epl_2015_16":42,
        "epl_2014_15":27,
        "epl_2013_14":22,
        "epl_2012_13":21,
        "epl_2011_12":20,
        "epl_2010_11":19,
        "epl_2009_10":18,
        "epl_2008_09":17,
        "epl_2007_08":16,
        "epl_2006_07":15
    }

    # Every CSV below is written under data/; create it before scraping so a
    # missing directory cannot discard the scraped results.
    os.makedirs('data', exist_ok=True)

    proxies = get_proxies()
    if not proxies:
        # Fail fast with a clear message rather than a later randint() error
        # inside create_browser.
        raise RuntimeError("get_proxies() returned no proxies; cannot start proxied browsers")

    url = 'https://www.premierleague.com/clubs'
    TeamSquadLinks, TeamStatsLinks = team_links(url)
    playerLinkOverview, playerLinkStats = player_Links(TeamSquadLinks)

    playerStatsDF = player_statistics(playerLinkStats, seasons, proxies)
    playerStatsDF.to_csv('data/playerStatsDF.csv', index=False)

    teamStatsDF = team_statistics(TeamStatsLinks, seasons, proxies)
    teamStatsDF.to_csv('data/teamStatsDF.csv', index=False)