### **This notebook is where the making of the *scrapingfunction.ipynb* began. I am no deleting this notebook because when I face an error in the future (in the scraping process), I will come back here to see where that error comes from easily.**

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
import re
from datetime import datetime, time

In [2]:
url = "http://fbref.com/en/matches/72987477/Getafe-Sevilla-August-27-2017-La-Liga"

In [3]:
res = requests.get(url)
soup = BeautifulSoup(res.content, "html.parser")

### **Main Data of Match**
This dataframe will include match information such as matchday, date, time, teams, who played home/away and goals scored. This dataframe will be called *data_of_match*.

In [4]:
# Main Data of Match

teamshtml = soup.find_all("div", {"class":"scorebox"})
matchhtml = soup.find_all("div", {"class":"scores"})

team = []
location = ["Home", "Away"]
score = []
xg = []

for i in range(2):
    team.append(teamshtml[0].findAll("a")[(0+(i*4))].text)

for i in range(2):
    score.append(matchhtml[i].findAll("div")[0].text)

for i in range(2):
    xg.append(matchhtml[i].findAll("div")[1].text)

date_string = "Sunday September 2, 2018"

# Define the format of the date string
date_format = "%A %B %d, %Y"

# Parse the string into a datetime object
date_object = datetime.strptime(soup.find_all("div", {"class":"scorebox_meta"})[0].find_all("a")[0].text, date_format)

matchdata = {}
matchdata["team"] = team
matchdata["matchday"] = ((re.search(r'\((.*?)\)', soup.find_all("div", {"class":"scorebox_meta"})[0].find_all("div")[1].text)).group(1).split(" ")[1])
matchdata["date"] = datetime.strptime(soup.find_all("div", {"class":"scorebox_meta"})[0].find_all("a")[0].text, date_format)
matchdata["time"] = soup.find_all("div", {"class":"scorebox_meta"})[0].find_all("div")[0].find_all("span")[0].text.replace(" (venue time)", "")
matchdata["location"] = location
matchdata["opponent"] = reversed(team)
matchdata["score"] = score
matchdata["score_against"] = reversed(score)
matchdata["xg_against"] = reversed(xg)

data_of_match = pd.DataFrame(matchdata)

data_of_match4 = data_of_match[data_of_match.columns[:6]].replace("", "0")
data_of_match5 = data_of_match[data_of_match.columns[6:]].replace("", "0").astype("float")
data_of_match = pd.concat([data_of_match4, data_of_match5], axis=1)

data_of_match['time'] = data_of_match['time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
data_of_match['matchday'] = data_of_match['matchday'].astype("int")
data_of_match['score'] = data_of_match['score'].astype("int")
data_of_match['score_against'] = data_of_match['score_against'].astype("int")

data_of_match

Unnamed: 0,team,matchday,date,time,location,opponent,score,score_against,xg_against
0,Getafe,2,2017-08-27,20:15:00,Home,Sevilla,0,1,1.2
1,Sevilla,2,2017-08-27,20:15:00,Away,Getafe,1,0,1.8


### **Match's Main Statitisc**
This dataframe will contain the match's general statistics. This dataframe will be called *match_main_stats*.

In [5]:
# Match's Main Statistics

created_list = [[] for _ in range(3)]
general = []

for i in range(48):
    if (i != 0) and (i != 16) and (i != 32):
        general.append(soup.find_all("div", {"id":"team_stats_extra"})[0].find_all("div")[i].text)

for index, value in enumerate(general):
    list_index = index % 3
    created_list[list_index].append(value.replace("\xa0", ""))

match_main_stats = pd.DataFrame(created_list)

match_main_stats.columns = match_main_stats.iloc[1]

match_main_stats = match_main_stats.drop(1)

match_main_stats.drop(match_main_stats.columns[0], axis=1, inplace=True)

match_main_stats.reset_index().drop(["index"], axis=1)

match_main_stats.insert(loc=0, column='team', value=team)

match_main_stats.columns = [x.lower() for x in match_main_stats.columns]

match_main_stats1 = match_main_stats[match_main_stats.columns[:1]].replace("", "0")
match_main_stats2 = match_main_stats[match_main_stats.columns[1:]].replace("", "0").astype("int")
match_main_stats = pd.concat([match_main_stats1, match_main_stats2], axis=1)

match_main_stats

Unnamed: 0,team,fouls,corners,crosses,touches,tackles,interceptions,aerials won,clearances,offsides,goal kicks,throw ins,long balls
0,Getafe,20,7,23,536,19,22,14,19,3,6,35,89
2,Sevilla,10,5,11,671,25,8,11,46,3,5,26,82


### **All Player Stats**
This dataframe will contain the more in-depth player statistics of the match. This will be called *player_statistics*.

In [6]:
# Player's Statistics
to_concat = range(3, 9)
home_player_statistics = []

for k in to_concat:
    lengthoftable = []
    allhtml = soup.find_all("table")[k]

    for i in allhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            if re.search(r'scope="col"', str(i)):
                lengthoftable.append(i)

    length = int(len(lengthoftable))

    table = []
    for i in allhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            table.append(i.text)
        
    lists_of_five = [[] for _ in range(length)]

    for index, value in enumerate(table[:-length]):
        list_index = index % length
        lists_of_five[list_index].append(value.replace("\xa0\xa0\xa0", ""))

    df = pd.DataFrame(lists_of_five).T
    df.columns = df.iloc[0]
    df = df.drop(0)
    df.insert(loc=0, column='team', value=str(soup.find_all("table")[k].find_all("caption")[0].text.replace(" Player Stats Table", "")))
    
    if k > 3:
        new = df.drop(df.columns[[0, 1, 2, 3, 4, 5, 6]],axis = 1)

    if len(home_player_statistics) == 0:
        home_player_statistics = df
    else:
        home_player_statistics = pd.concat([home_player_statistics, new], axis=1)
    
def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    
rename_duplicate_columns(home_player_statistics)

to_concat = range(10, 16)
away_player_statistics = []

for k in to_concat:
    lengthoftable = []
    allhtml = soup.find_all("table")[k]

    for i in allhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            if re.search(r'scope="col"', str(i)):
                lengthoftable.append(i)

    length = int(len(lengthoftable))

    table = []
    for i in allhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            table.append(i.text)
        
    lists_of_five = [[] for _ in range(length)]

    for index, value in enumerate(table[:-length]):
        list_index = index % length
        lists_of_five[list_index].append(value.replace("\xa0\xa0\xa0", ""))

    df = pd.DataFrame(lists_of_five).T
    df.columns = df.iloc[0]
    df = df.drop(0)
    df.insert(loc=0, column='team', value=str(soup.find_all("table")[k].find_all("caption")[0].text.replace(" Player Stats Table", "")))
    
    if k > 3:
        new = df.drop(df.columns[[0, 1, 2, 3, 4, 5, 6]],axis = 1)

    if len(away_player_statistics) == 0:
        away_player_statistics = df
    else:
        away_player_statistics = pd.concat([away_player_statistics, new], axis=1)
    
def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    
rename_duplicate_columns(away_player_statistics)

player_statistics = pd.concat([home_player_statistics, away_player_statistics])
player_statistics = player_statistics.reset_index().drop(["index"], axis=1)

player_statistics5 = player_statistics[player_statistics.columns[:6]].replace("", "0")
player_statistics6 = player_statistics[player_statistics.columns[6:]].replace("", "0").astype("float")
player_statistics = pd.concat([player_statistics5, player_statistics6], axis=1)

player_statistics.drop(["PK", "PKatt", "Min", "Gls"], axis=1, inplace=True)
player_statistics.drop(["Player", "#", "Nation", "Pos", "Age", "Att", "Att_1", "Succ", "Blocks_1"], axis=1, inplace=True)
player_statistics.drop(["xG"], axis=1, inplace=True)
player_statistics.drop(["Cmp_1", "Att_2", "Cmp%_1", "TotDist"], axis=1, inplace=True)
player_statistics.drop(["Cmp_2", "Att_3", "Cmp%_2", "Cmp_3", "Att_4", "Cmp%_3", "Cmp_4", "Att_5", "Cmp%_4", "Ast_1", "xAG_1"], axis=1, inplace=True)
player_statistics.drop(["PrgP_1", "Att_6", "Live", "Dead", "FK"], axis=1, inplace=True)
player_statistics.drop(["Sw", "TI", "In", "Out", "Str", "Cmp_5", "Tkl_1"], axis=1, inplace=True)
player_statistics.drop(["Def 3rd", "Mid 3rd", "Att 3rd"], axis=1, inplace=True)
player_statistics.drop(["Tkl%", "Blocks_2", "Sh_1", "Pass", "Int_1", "Tkl+Int"], axis=1, inplace=True)
player_statistics.drop(["Touches_1", "Def Pen", "Def 3rd_1", "Mid 3rd_1", "Att 3rd_1", "Live_1", "Att_8", "Succ_1", "Succ%"], axis=1, inplace=True)
player_statistics.drop(["Carries_1", "TotDist_1", "PrgDist_1", "PrgC_1", "1/3_1"], axis=1, inplace=True)
player_statistics.drop(["Rec", "PrgR", "CrdY_1", "CrdR_1"], axis=1, inplace=True)
player_statistics.drop(["Crs_1", "Int_2", "TklW_1", "Fld", "Won%", "Tkld%", "Cmp%"], axis=1, inplace=True)

player_statistics.rename(columns={"Gls": "goals"}, inplace=True)
player_statistics.rename(columns={"Ast": "assists"}, inplace=True)
player_statistics.rename(columns={"Sh": "shots"}, inplace=True)
player_statistics.rename(columns={"SoT": "shots_on_target"}, inplace=True)
player_statistics.rename(columns={"CrdY": "yellows"}, inplace=True)
player_statistics.rename(columns={"CrdR": "reds"}, inplace=True)
player_statistics.rename(columns={"Touches": "touches"}, inplace=True)
player_statistics.rename(columns={"Tkl": "tackles"}, inplace=True)
player_statistics.rename(columns={"Int": "interceptions"}, inplace=True)
player_statistics.rename(columns={"Blocks": "blocks_by_player"}, inplace=True)
player_statistics.rename(columns={"npxG": "non_penalty_expected_goals"}, inplace=True)
player_statistics.rename(columns={"xAG": "expected_assisted_goals"}, inplace=True)
player_statistics.rename(columns={"SCA": "shot_creating_actions"}, inplace=True)
player_statistics.rename(columns={"GCA": "goal_creating_actions"}, inplace=True)
player_statistics.rename(columns={"Cmp": "passes_completed"}, inplace=True)
player_statistics.rename(columns={"PrgP": "progressive_passes"}, inplace=True)
player_statistics.rename(columns={"Carries": "carries"}, inplace=True)
player_statistics.rename(columns={"PrgC": "progressive_carries"}, inplace=True)
player_statistics.rename(columns={"PrgP": "progressive_passes"}, inplace=True)
player_statistics.rename(columns={"PrgDist": "progressive_passing_distance"}, inplace=True)
player_statistics.rename(columns={"xA": "expected_assists"}, inplace=True)
player_statistics.rename(columns={"KP": "key_passes"}, inplace=True)
player_statistics.rename(columns={"1/3": "passes_into_final_third"}, inplace=True)
player_statistics.rename(columns={"PPA": "passes_into_penalty_area"}, inplace=True)
player_statistics.rename(columns={"CrsPA": "crosses_into_penalty_area"}, inplace=True)
player_statistics.rename(columns={"TB": "through_balls"}, inplace=True)
player_statistics.rename(columns={"Crs": "crosses"}, inplace=True)
player_statistics.rename(columns={"CK": "corner_kicks"}, inplace=True)
player_statistics.rename(columns={"Off": "passes_offside"}, inplace=True)
player_statistics.rename(columns={"Blocks": "blocks_by_opponent"}, inplace=True)
player_statistics.rename(columns={"TklW": "tackles_won"}, inplace=True)
player_statistics.rename(columns={"Tkl_2": "dribblers_tackled"}, inplace=True)
player_statistics.rename(columns={"Att_7": "dribbler_tackles_attempts"}, inplace=True)
player_statistics.rename(columns={"Lost": "unsuccessful_dribbler_challenge"}, inplace=True)
player_statistics.rename(columns={"Clr": "clearances"}, inplace=True)
player_statistics.rename(columns={"Err": "error_leading_to_opponent_shot"}, inplace=True)
player_statistics.rename(columns={"Att Pen": "touches_in_attacking_penalty_area"}, inplace=True)
player_statistics.rename(columns={"Tkld": "player_take_ons_tackled"}, inplace=True)
player_statistics.rename(columns={"CPA": "carries_into_penalty_box"}, inplace=True)
player_statistics.rename(columns={"Mis": "failed_controls"}, inplace=True)
player_statistics.rename(columns={"Dis": "dispossession_after_tackle"}, inplace=True)
player_statistics.rename(columns={"2CrdY": "double_yellows"}, inplace=True)
player_statistics.rename(columns={"Fls": "fouls"}, inplace=True)
player_statistics.rename(columns={"2CrdY": "double_yellows"}, inplace=True)
player_statistics.rename(columns={"Off_1": "offsides"}, inplace=True)
player_statistics.rename(columns={"PKwon": "penalty_kicks_won"}, inplace=True)
player_statistics.rename(columns={"PKcon": "penalty_kicks_conceded"}, inplace=True)
player_statistics.rename(columns={"OG": "own_goals"}, inplace=True)
player_statistics.rename(columns={"Recov": "recoveries"}, inplace=True)
player_statistics.rename(columns={"Won": "aerials_won"}, inplace=True)
player_statistics.rename(columns={"Lost_1": "aerials_lost"}, inplace=True)

player_statistics = player_statistics.groupby("team").sum().reset_index()
player_statistics.drop(["team"], axis=1, inplace=True)
player_statistics

Unnamed: 0,assists,shots,shots_on_target,yellows,reds,touches,tackles,interceptions,blocks_by_player,non_penalty_expected_goals,...,dispossession_after_tackle,double_yellows,fouls,offsides,penalty_kicks_won,penalty_kicks_conceded,own_goals,recoveries,aerials_won,aerials_lost
0,0.0,15.0,3.0,0.0,0.0,536.0,19.0,22.0,7.0,1.8,...,16.0,0.0,20.0,3.0,0.0,0.0,0.0,68.0,14.0,11.0
1,1.0,6.0,3.0,1.0,0.0,671.0,25.0,8.0,14.0,1.2,...,13.0,0.0,10.0,3.0,0.0,0.0,0.0,60.0,11.0,14.0


### **Goalkeeper Statistics**
This dataframe will store statistics for the goalkeeper type player only. This dataframe will be called *match_goalkeeper_stats*.

In [7]:
# Goalkeepers Table
match_goalkeeper_stats = []

for p in [9, 16]:
    keeperhtml = soup.find_all("table")[p]

    lengthofkeeper = []

    for i in keeperhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            if re.search(r'scope="col"', str(i)):
                lengthofkeeper.append(i)

    keeperlength = int(len(lengthofkeeper))

    keeperlength

    keeperstable = []
    for i in keeperhtml.findAll(attrs={"data-stat": True}):
        if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
            keeperstable.append(i.text)
        
    # Initialize five lists to store elements
    lists_of_five = [[] for _ in range(keeperlength)]

    # Iterate over the elements and append them to the respective lists
    for index, value in enumerate(keeperstable):
        list_index = index % keeperlength  # Determine the index of the list to append the value
        lists_of_five[list_index].append(value.replace("\xa0\xa0\xa0", ""))

    df = pd.DataFrame(lists_of_five).T
    df.columns = df.iloc[0]
    df = df.drop(0)
    df.insert(loc=1, column='team', value=str(soup.find_all("table")[p].find_all("caption")[0].text.replace(" Goalkeeper Stats Table", "")))

    if len(match_goalkeeper_stats) == 0:
        match_goalkeeper_stats = df
    else:
        match_goalkeeper_stats = pd.concat([match_goalkeeper_stats, df])

match_goalkeeper_stats3 = match_goalkeeper_stats[match_goalkeeper_stats.columns[:4]].replace("", "0")
match_goalkeeper_stats4 = match_goalkeeper_stats[match_goalkeeper_stats.columns[4:]].replace("", "0").astype("float")
match_goalkeeper_stats = pd.concat([match_goalkeeper_stats3, match_goalkeeper_stats4], axis=1)

match_goalkeeper_stats

Unnamed: 0,Player,team,Nation,Age,Min,SoTA,GA,Saves,Save%,PSxG,...,Att,Launch%,Launch%.1,AvgLen,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist
1,Vicente Guaita,Getafe,es ESP,30-229,90.0,3.0,1.0,2.0,66.7,2.0,...,6.0,76.9,83.3,51.2,61.8,8.0,1.0,12.5,3.0,20.4
1,Sergio Rico,Sevilla,es ESP,23-360,90.0,3.0,0.0,3.0,100.0,1.3,...,5.0,47.4,80.0,43.4,57.6,19.0,1.0,5.3,1.0,12.7


### **Match Events**
This Dataframe will contain data on the main plays that happened during the match, named under *match_events*.

In [8]:
# All Tables (except goalkeeper)

lengthoftable = []
allhtml = soup.find_all("table")[17]

for i in allhtml.findAll(attrs={"data-stat": True}):
    if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
        if re.search(r'scope="col"', str(i)):
            lengthoftable.append(i)

length = int(len(lengthoftable))

table = []
for i in allhtml.findAll(attrs={"data-stat": True}):
    if ("header" not in i['data-stat']) and len(i['data-stat']) > 0:
        table.append(i.text)
    
lists_of_five = [[] for _ in range(length)]

for index, value in enumerate(table[:-length]):
    list_index = index % length
    lists_of_five[list_index].append(value.replace("\xa0\xa0\xa0", ""))

match_events = pd.DataFrame(lists_of_five).T
match_events.columns = match_events.iloc[0]
match_events.drop(0, inplace=True)
match_events.drop(["Player", "Minute", "Outcome", "Body Part", "Notes", "Event"], axis=1, inplace=True)
match_events.replace("", "0", inplace=True)
match_events = match_events[match_events["Squad"] != "0"]
match_events["Distance"] = match_events["Distance"].astype("int")
match_events["xG"] = match_events["xG"].astype("float")
match_events["PSxG"] = match_events["PSxG"].astype("float")
match_events = match_events.iloc[:, [0, 1, 2]]
match_events.rename(columns={"Squad": "team"}, inplace=True)

match_events = match_events.groupby("team").agg({"xG":"sum", "PSxG":"mean"}).reset_index()
match_events.drop(["team"], axis=1, inplace=True)
match_events


Unnamed: 0,xG,PSxG
0,1.82,0.086667
1,0.61,0.224


In [9]:
pd.concat([data_of_match, match_events, player_statistics], axis=1)

Unnamed: 0,team,matchday,date,time,location,score,xG,PSxG,assists,shots,...,dispossession_after_tackle,double_yellows,fouls,offsides,penalty_kicks_won,penalty_kicks_conceded,own_goals,recoveries,aerials_won,aerials_lost
0,Getafe,2,2017-08-27,20:15:00,Home,0,1.82,0.086667,0.0,15.0,...,16.0,0.0,20.0,3.0,0.0,0.0,0.0,68.0,14.0,11.0
1,Sevilla,2,2017-08-27,20:15:00,Away,1,0.61,0.224,1.0,6.0,...,13.0,0.0,10.0,3.0,0.0,0.0,0.0,60.0,11.0,14.0
