## Web scraping methodology

The Sky Sports website has a page that lists all completed games of a selected season. Each game is shown with its final score and a link to that game's own page, which has details about who played in the game and other stats. 

A function <b>- 'LookForMatches' -</b> is defined which loops through all the game page links on the main fixture list page. For every link, a scraping function <b>- 'GetMatchData' -</b> is called, which goes to the individual game page and scrapes the team names, the final score (from which the result is derived) and the lineup data. 

In [99]:
import numpy as np
import pandas as pd 
import datetime as dt
from bs4 import BeautifulSoup
import requests
import csv
import time

## Define functions to scrape game data

In [100]:
# Season to scrape, e.g. '2019-20'. The string is appended to the Sky Sports
# results URL in DataforOneSeason and is also passed to GetFIFAdata.

season='2019-20'

In [101]:
def AddTeamsToURL(URL):

    """
    Return the lineups variant of a Sky Sports game page link.

    The link is split on '/', the segment 'teams' is placed at position 5
    and the pieces are joined again, e.g.
    '.../football/watford-vs-man-utd/408159' becomes
    '.../football/watford-vs-man-utd/teams/408159'.
    """

    pieces = URL.split('/')
    return '/'.join(pieces[:5] + ['teams'] + pieces[5:])


'https://www.skysports.com/football/watford-vs-man-utd/teams/408159'

In [102]:
def CalculateResult(scorelist):

    """
    Classify a game from the home side's point of view.

    scorelist = sequence whose first item is the home goals and whose
                second item is the away goals

    Returns 'HW' (home win), 'D' (draw) or 'HL' (home loss).
    """

    home_goals, away_goals = scorelist[0], scorelist[1]

    if home_goals > away_goals:
        return 'HW'
    if home_goals == away_goals:
        return 'D'
    return 'HL'

In [103]:
def GetMatchData(URL,match_no,matchlist,timeout=30):

    """
    Scrape teams, score and lineups from one game page and add them to matchlist.

    URL       = link to the game page (the 'teams' segment is added here)
    match_no  = running game number stored in every row of this game
    matchlist = list that receives one row per listed jersey number:
                [match_no, home team, away team, home goals, away goals,
                 result, 'Home'/'Away', jersey number, 'Starter'/'Sub']
    timeout   = seconds to wait for the server before the request is abandoned

    Nothing is returned; matchlist is extended in place.
    Raises requests.HTTPError if the page answers with an error status, and
    IndexError if fewer than two scores (or, when players are listed, fewer
    than two team names) are found on the page.
    """

    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(AddTeamsToURL(URL), timeout=timeout)
    response.raise_for_status()  # stop here rather than parse an error page
    soup2 = BeautifulSoup(response.text, 'lxml')

    #Scrape home and away teams
    teamlist=[team['title'] for team in soup2.find_all('abbr',{"class": "swap-text--bp10"})]

    #Scrape home and away goals
    scorelist=[int(score.text.strip()) for score in soup2.find_all('span',{"class": "match-head__score"})]

    #Calculate match results
    result=CalculateResult(scorelist)

    #Scrape player jersey numbers
    # Numbers are classified purely by their position on the page:
    # 1-11 home starters, 12-18 home subs, 19-29 away starters, the rest away subs.
    # NOTE(review): this assumes 11 starters and 7 subs for the home side -
    # confirm for seasons or competitions with a different bench size.
    jersey_tags = soup2.find_all('span',{"class": "team-lineups__list-player-number"})
    for position, player in enumerate(jersey_tags, start=1):
        plyr=player.text.strip()
        side = 'Home' if position<=18 else 'Away'
        role = 'Starter' if (position<=11 or 18<position<=29) else 'Sub'
        matchlist.append([match_no,teamlist[0],teamlist[1],scorelist[0],scorelist[1],result,side,plyr,role])
            
    
    

In [104]:
def LookForMatches(FixturesURL,matchlist,NumGames=999,timeout=30):

    """
    Get game page URLs from Skysports main game listing page and scrape each one

    FixturesURL = URL of the results page that lists the games
    matchlist   = list that GetMatchData extends with one row per player
    NumGames    = Number of games to be pulled (0 or less pulls nothing)
    timeout     = seconds to wait for the listing page before giving up

    Games are numbered 1, 2, 3, ... in the order their links appear on the page.
    Nothing is returned; matchlist is filled in place.
    Raises requests.HTTPError if the listing page answers with an error status.
    """

    response = requests.get(FixturesURL, timeout=timeout)
    response.raise_for_status()  # an error page would otherwise look like "no games"
    soup = BeautifulSoup(response.text, 'lxml')

    # Apply the limit before scraping so that NumGames=0 really pulls no game.
    links = soup.find_all('a',{"class": "matches__item matches__link"})[:max(NumGames,0)]

    for match_no, match in enumerate(links, start=1):
        if match_no>1:
            # Random 4-11 second pause between game pages; waiting before each
            # request after the first avoids a pointless sleep after the last game.
            time.sleep(np.random.randint(4,high=12))
        GetMatchData(match['href'],match_no,matchlist)

    

## Scrape game data for one season

In [105]:
def PlayerID(Home_Team,Away_Team,Player_Team,Player):

    """
    Build the player lookup ID '<team>#<jersey number>'.

    The home team name is used when Player_Team is 'Home'; any other value
    selects the away team name.
    """

    club = Home_Team if Player_Team=='Home' else Away_Team
    return club+"#"+str(Player)

def DataforOneSeason(season,NumGames):

    """
    Scrape Data from specified season and load it into a DataFrame

    season   = season string appended to the Sky Sports results URL, e.g. '2019-20'
    NumGames = number of games to pull, passed on to LookForMatches

    Returns a DataFrame with one row per listed player and the columns
    Match_no, Home_Team, Away_Team, Home_Goals, Away_goals, Result,
    Player_team, Player, Type and Player_ID.
    """

    matchlist=[] #matrix used to store all game data for one season

    SkyURL='https://www.skysports.com/premier-league-results/'+season

    LookForMatches(SkyURL,matchlist,NumGames) #Run function to scrape all data for specified season

    # Required columns for final game dataset

    cols=['Match_no','Home_Team','Away_Team','Home_Goals','Away_goals','Result','Player_team','Player','Type']

    AllMatchData=pd.DataFrame(matchlist,columns=cols)

    # Add player ID column to game dataset.
    # Built with zip rather than DataFrame.apply(axis=1): on a frame with no
    # rows apply can hand back a DataFrame instead of a Series, which cannot be
    # stored in a single column, and the row-wise apply is slower as well.

    AllMatchData['Player_ID']=[
        PlayerID(home,away,side,number)
        for home,away,side,number in zip(
            AllMatchData['Home_Team'],
            AllMatchData['Away_Team'],
            AllMatchData['Player_team'],
            AllMatchData['Player'],
        )
    ]

    return AllMatchData

In [106]:
#AllMatchData.to_excel('AllEPLMatchData.xlsx')

## Get FIFA data

In [107]:
def GetFIFAdata(season):

    """
    Load the FIFA player file matching the specified season, restricted to EPL clubs

    season = season string such as '2019-20'; its last two characters select
             the file 'FIFA data/players_<NN>.csv' (relative to the working directory)

    Returns a DataFrame with the columns Player_ID, short_name, club, overall,
    team_jersey_number, pace, shooting, passing, dribbling, defending, physic.
    Missing values are replaced by 0, and Player_ID is '<club>#<jersey number>'.
    """

    # The FIFA edition carries the closing year of the season ('2018-19' -> 19).
    # Taking the first two characters would give '20' for every 20xx season.

    FIFAversion=season[-2:]

    # Forward slash works on every platform; the former backslash formed the
    # invalid escape '\p' and only resolved as a path separator on Windows.

    FIFA_path='FIFA data/players_{}.csv'.format(FIFAversion)

    FifaData=pd.read_csv(FIFA_path)


    #Specify clubs in English Premier League
    # NOTE(review): the list is fixed across seasons; confirm it covers every
    # club of the requested season and matches the spelling used in the FIFA
    # file (e.g. Aston Villa and Norwich City are not listed).

    EPLlist=['Arsenal','Bournemouth','Brighton and Hove Albion','Burnley','Cardiff City','Chelsea','Crystal Palace','Everton','Fulham','Huddersfield Town','Hull City','Leicester City','Liverpool','Middlesbrough','Manchester City','Manchester United','Newcastle United','Southampton','Tottenham Hotspur','Watford','West Ham United','Wolverhampton Wanderers','Stoke City','Swansea City','Sheffield United','West Bromwich Albion']


    #Filter FIFA Data for EPL clubs and restrict it to required fields
    # .copy() makes the result an independent frame so the assignments below
    # do not write into a slice of the frame that was read from disk.

    required=['short_name','overall','club','team_jersey_number','pace','shooting','passing','dribbling','defending','physic']

    FifaData=FifaData.loc[FifaData['club'].isin(EPLlist),required].copy()

    #Fill any missing values with 0
    # Done before the ID is built: a missing jersey number would otherwise
    # make the int conversion below fail.

    FifaData=FifaData.fillna(value=0)

    #Create player ID field for future merge with game dataset

    FifaData['Player_ID']=FifaData['club']+'#'+FifaData['team_jersey_number'].astype(int).astype(str)

    #Re-order columns with Player ID as the first field

    FifaData=FifaData[['Player_ID','short_name','club','overall','team_jersey_number','pace','shooting','passing','dribbling','defending','physic']]

    return FifaData

## Combine game and FIFA data

In [108]:
def CombineData(AllMatchData,FifaData):

    """
    Attach the FIFA player attributes to the game lineup rows.

    A left join on 'Player_ID' is used, so every lineup row is kept and rows
    without a matching FIFA player get missing values in the FIFA columns.
    """

    return pd.merge(AllMatchData,FifaData,how='left',on='Player_ID')

## Convert to format needed for machine learning input

In [109]:
def ConvertToMLFormat(CombinedData,PlayerType='Starter'):

    """
    Turn the combined lineup/FIFA rows into one row per game for machine learning

    CombinedData = DataFrame with at least the columns Match_no, Player_team,
                   Result, Type and the seven attribute columns listed below
    PlayerType   = value of 'Type' to keep ('Starter' or 'Sub')

    Returns a DataFrame with a fresh 0..n-1 index, the seven attribute columns
    holding home-side total minus away-side total, and the game's 'Result'.
    Games that do not have rows for both the home and the away side are left out.
    """

    attribute_cols=['overall','pace','shooting','passing','dribbling','defending','physic']

    #Filter for starter or sub

    selected=CombinedData[CombinedData['Type']==PlayerType]

    # Sum up attributes for every team
    # Only the attribute columns are summed: since pandas 2.0 a plain
    # groupby().sum() no longer drops text columns such as team or player names.

    team_totals=selected.groupby(by=['Match_no','Player_team','Result'],as_index=False)[attribute_cols].sum()

    # Divide Dataframe into Home and Away, keyed by game number

    home=team_totals[team_totals['Player_team']=='Home'].set_index('Match_no')

    away=team_totals[team_totals['Player_team']=='Away'].set_index('Match_no')

    # Pair the two sides by game number instead of by row position, so a game
    # that lacks one side cannot shift every later game onto the wrong opponent.

    both_sides=home.index.intersection(away.index)

    home=home.loc[both_sides]

    away=away.loc[both_sides]

    # Create Dataframe depicting difference in player attributes between Home and Away

    Match_Qual_Diff=home[attribute_cols].subtract(away[attribute_cols])

    #Add Result to the attribute difference Dataframe

    Match_Qual_Diff['Result']=home['Result'].values

    return Match_Qual_Diff.reset_index(drop=True)

## Run all functions for specified season

In [110]:
# Scrape the lineups of up to 20 games (second argument is NumGames) for the season set above.
AllMatchData=DataforOneSeason(season,20) #Get web scraped lineups for specified number of games and season

In [111]:
# Load the FIFA player file chosen from the season string, limited to the listed EPL clubs.
FifaData=GetFIFAdata(season) #Get specified FIFA version player data

In [112]:
# Left-join the FIFA attributes onto the lineup rows via 'Player_ID'.
CombinedData=CombineData(AllMatchData,FifaData) #Combine game lineup and FIFA data

In [113]:
# Keep starters only and reduce each game to home-minus-away attribute totals plus the result.
Match_Qual_Diff=ConvertToMLFormat(CombinedData,'Starter') #Create the difference in player attributes dataframe

In [114]:
# Write the table to '<season> EPL ML data.xlsx'; the path is relative, so the file lands in the working directory.
Match_Qual_Diff.to_excel('{} EPL ML data.xlsx'.format(season))