# FBRef Data Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Absolute location of the FBRef CSV exports; the relative path is resolved
# against the current working directory.
FILE_PATH = os.path.realpath(os.path.join("..", "data", "fbref_data"))

In [3]:
# Path of the scores CSV inside FILE_PATH (the table loaded into `scores_df`).
SCORES_DATA = os.path.join(FILE_PATH, "scores.csv")

In [4]:
# Load the raw scores table; filtering and clean-up happen in the following cell.
scores_df = pd.read_csv(SCORES_DATA)

In [5]:
# Keep only La Liga rows in chronological order and drop the export artefacts.
# Written as one non-mutating chain (no `inplace=True`) so the cell never
# modifies a slice of a previously displayed frame.
scores_df = (
    scores_df.loc[scores_df['Comp'] == 'La Liga']
    .sort_values(["Date", "Time"])
    .drop(columns=["Unnamed: 0", "Match Report", "Notes"])
)

# Normalise the club spellings used in `Team`. The self-merge in a later cell
# joins `Team` against `Opponent`, so both columns must spell clubs the same way.
# NOTE(review): the replacement spellings are assumed to be the ones used in
# `Opponent` -- confirm against the raw data.
scores_df = scores_df.assign(
    Team=scores_df['Team'].replace({
        'Alaves': 'Alavés',
        'Almeria': 'Almería',
        'Atletico Madrid': 'Atlético Madrid',
        'Real Betis': 'Betis',
        'Cadiz': 'Cádiz',
        'Cordoba': 'Córdoba',
        'Gimnastic': 'Gimnàstic',
        'Deportivo La Coruna': 'La Coruña',
        'Hercules': 'Hércules',
        'Leganes': 'Leganés',
        'Malaga': 'Málaga',
        'Racing Santander': 'Racing Sant',
        'Sporting Gijon': 'Sporting Gijón',
    })
)

# Date window: start is exclusive, end is inclusive. `Date` is still text at
# this point (it is parsed with "%Y-%m-%d" in the next cell), and zero-padded
# ISO dates compare chronologically as strings.
start_date = '2000-08-29'
end_date = '2022-05-30'
mask = (scores_df['Date'] > start_date) & (scores_df['Date'] <= end_date)

# `.copy()` gives later cells an independent frame to add/overwrite columns on,
# instead of a slice that triggers SettingWithCopyWarning.
scores_df = scores_df.loc[mask].copy()
scores_df.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Team
19184,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Real Madrid,,,,,,,Daudén Ibáñez,Valencia
19315,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,0,Athletic Club,,,,,,,Andradas Asurmendi,La Coruña
19415,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Barcelona,,,,,,,Mejuto González,Málaga
19504,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,0,2,La Coruña,,,,,,,Andradas Asurmendi,Athletic Club
19542,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,1,Málaga,,,,,,,Mejuto González,Barcelona


In [6]:
# Parse the text columns: `Date` becomes a datetime column, `Time` is reduced
# to plain time-of-day objects (rows without a time stay null).
scores_df = scores_df.assign(
    Date=pd.to_datetime(scores_df['Date'], format="%Y-%m-%d"),
    Time=pd.to_datetime(scores_df['Time']).dt.time,
)


In [7]:

# Self-join: for every row, look up the row of the same fixture written from the
# other club's perspective (left `Team` equals right `Opponent`, with all the
# fixture-level fields equal). Non-key columns that exist on both sides get the
# `_team` (left) / `_opponent` (right) suffixes.
scores_df_merge = pd.merge(
    scores_df,
    scores_df,
    how="left",
    left_on=["Date", "Time", "Comp", "Day", "Round", "Team", "Referee", "Attendance"],
    right_on=["Date", "Time", "Comp", "Day", "Round", "Opponent", "Referee", "Attendance"],
    suffixes=("_team", "_opponent"),
)
scores_df_merge

Unnamed: 0,Date,Time,Comp,Round,Day,Venue_team,Result_team,GF_team,GA_team,Opponent_team,...,Result_opponent,GF_opponent,GA_opponent,Opponent_opponent,xG_opponent,xGA_opponent,Poss_opponent,Captain_opponent,Formation_opponent,Team_opponent
0,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Real Madrid,...,W,2,1,Valencia,,,,,,Real Madrid
1,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,0,Athletic Club,...,L,0,2,La Coruña,,,,,,Athletic Club
2,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Barcelona,...,W,2,1,Málaga,,,,,,Barcelona
3,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,0,2,La Coruña,...,W,2,0,Athletic Club,,,,,,La Coruña
4,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,1,Málaga,...,L,1,2,Barcelona,,,,,,Málaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,L,0,1,Sevilla,...,W,1,0,Athletic Club,1.4,0.7,65.0,Jesús Navas,4-3-3,Sevilla
16716,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,W,1,0,Athletic Club,...,L,0,1,Sevilla,0.7,1.4,35.0,Iker Muniain,4-4-2,Athletic Club
16717,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,1,2,Atlético Madrid,...,W,2,1,Real Sociedad,0.8,2.3,39.0,Koke,3-5-2,Atlético Madrid
16718,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Barcelona,...,L,0,2,Villarreal,0.7,0.7,70.0,Sergio Busquets,4-3-3,Barcelona


In [8]:
# Every merged row is still written from one club's perspective (`*_team`
# columns) with the other club's record attached (`*_opponent` columns).
# Re-express each row as Home vs Away.
scores_df_merge['Home'] = np.where(
    scores_df_merge['Venue_team'] == 'Home',
    scores_df_merge['Team_team'],
    scores_df_merge['Opponent_team'],
)
_row_is_home = scores_df_merge['Team_team'] == scores_df_merge['Home']

scores_df_merge['Away'] = np.where(
    _row_is_home,
    scores_df_merge['Opponent_team'],
    scores_df_merge['Team_team'],
)
_row_is_away = scores_df_merge['Team_team'] == scores_df_merge['Away']

# (new column, row mask, source column where the mask holds, source column otherwise),
# listed in the order the columns are appended to the frame.
_side_columns = [
    ('HomeGoal', _row_is_home, 'GF_team', 'GA_team'),
    ('xG_Home', _row_is_home, 'xG_team', 'xGA_team'),
    ('AwayGoal', _row_is_away, 'GF_team', 'GA_team'),
    ('xG_Away', _row_is_away, 'xG_team', 'xGA_team'),
    ('HomeFormation', _row_is_home, 'Formation_team', 'Formation_opponent'),
    ('AwayFormation', _row_is_away, 'Formation_team', 'Formation_opponent'),
    ('HomePossession', _row_is_home, 'Poss_team', 'Poss_opponent'),
    ('AwayPossession', _row_is_away, 'Poss_team', 'Poss_opponent'),
    ('HomeCaptain', _row_is_home, 'Captain_team', 'Captain_opponent'),
    ('AwayCaptain', _row_is_away, 'Captain_team', 'Captain_opponent'),
]

for _target, _mask, _if_true, _if_false in _side_columns:
    scores_df_merge[_target] = np.where(
        _mask, scores_df_merge[_if_true], scores_df_merge[_if_false]
    )
    if _target in ('HomeGoal', 'AwayGoal'):
        # Goal counts go through float first, then are stored as integers.
        scores_df_merge[_target] = scores_df_merge[_target].astype(float).astype(int)

scores_df_merge

Unnamed: 0,Date,Time,Comp,Round,Day,Venue_team,Result_team,GF_team,GA_team,Opponent_team,...,HomeGoal,xG_Home,AwayGoal,xG_Away,HomeFormation,AwayFormation,HomePossession,AwayPossession,HomeCaptain,AwayCaptain
0,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Real Madrid,...,2,,1,,,,,,,
1,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,0,Athletic Club,...,2,,0,,,,,,,
2,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,1,2,Barcelona,...,2,,1,,,,,,,
3,2000-09-09,,La Liga,Matchweek 1,Sat,Away,L,0,2,La Coruña,...,2,,0,,,,,,,
4,2000-09-09,,La Liga,Matchweek 1,Sat,Home,W,2,1,Málaga,...,2,,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,L,0,1,Sevilla,...,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain
16716,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,W,1,0,Athletic Club,...,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain
16717,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,1,2,Atlético Madrid,...,1,2.3,2,0.8,4-4-2◆,3-5-2,61.0,39.0,Asier Illarramendi,Koke
16718,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Barcelona,...,0,0.7,2,0.7,4-3-3,4-4-2,70.0,30.0,Sergio Busquets,Mario Gaspar


In [9]:
# Keep only the match-level columns, in presentation order; the per-perspective
# `*_team` / `*_opponent` columns are left behind.
_match_columns = [
    "Date", "Time", "Comp", "Round", "Day", "Attendance", "Referee",
    "Home", "Away",
    "HomeGoal", "xG_Home", "AwayGoal", "xG_Away",
    "HomeFormation", "AwayFormation",
    "HomePossession", "AwayPossession",
    "HomeCaptain", "AwayCaptain",
]
scores_df_merge = scores_df_merge.loc[:, _match_columns]
scores_df_merge

Unnamed: 0,Date,Time,Comp,Round,Day,Attendance,Referee,Home,Away,HomeGoal,xG_Home,AwayGoal,xG_Away,HomeFormation,AwayFormation,HomePossession,AwayPossession,HomeCaptain,AwayCaptain
0,2000-09-09,,La Liga,Matchweek 1,Sat,,Daudén Ibáñez,Real Madrid,Valencia,2,,1,,,,,,,
1,2000-09-09,,La Liga,Matchweek 1,Sat,,Andradas Asurmendi,La Coruña,Athletic Club,2,,0,,,,,,,
2,2000-09-09,,La Liga,Matchweek 1,Sat,,Mejuto González,Barcelona,Málaga,2,,1,,,,,,,
3,2000-09-09,,La Liga,Matchweek 1,Sat,,Andradas Asurmendi,La Coruña,Athletic Club,2,,0,,,,,,,
4,2000-09-09,,La Liga,Matchweek 1,Sat,,Mejuto González,Barcelona,Málaga,2,,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,31305.0,Carlos del Cerro,Sevilla,Athletic Club,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain
16716,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,31305.0,Carlos del Cerro,Sevilla,Athletic Club,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain
16717,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,23586.0,Jesús Gil,Real Sociedad,Atlético Madrid,1,2.3,2,0.8,4-4-2◆,3-5-2,61.0,39.0,Asier Illarramendi,Koke
16718,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,54850.0,José Luis Munuera,Barcelona,Villarreal,0,0.7,2,0.7,4-3-3,4-4-2,70.0,30.0,Sergio Busquets,Mario Gaspar


In [10]:
# Each fixture appears once per participating club; after the Home/Away
# re-expression those rows are expected to be identical, so keeping the first
# occurrence of every fully-equal row should leave one row per match.
scores_df_merge_final = scores_df_merge.drop_duplicates(subset=None, keep="first")
scores_df_merge_final

Unnamed: 0,Date,Time,Comp,Round,Day,Attendance,Referee,Home,Away,HomeGoal,xG_Home,AwayGoal,xG_Away,HomeFormation,AwayFormation,HomePossession,AwayPossession,HomeCaptain,AwayCaptain
0,2000-09-09,NaT,La Liga,Matchweek 1,Sat,,Daudén Ibáñez,Real Madrid,Valencia,2,,1,,,,,,,
1,2000-09-09,NaT,La Liga,Matchweek 1,Sat,,Andradas Asurmendi,La Coruña,Athletic Club,2,,0,,,,,,,
2,2000-09-09,NaT,La Liga,Matchweek 1,Sat,,Mejuto González,Barcelona,Málaga,2,,1,,,,,,,
6,2000-09-09,NaT,La Liga,Matchweek 1,Sat,,Prados García,Real Sociedad,Racing Sant,2,,2,,,,,,,
7,2000-09-09,NaT,La Liga,Matchweek 1,Sat,,Turienzo Álvarez,Zaragoza,Espanyol,1,,2,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16709,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,11138.0,José Sánchez,Alavés,Cádiz,0,0.4,1,1.9,4-2-3-1,4-4-2,49.0,51.0,Rubén Duarte,Álex Fernández
16711,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,17951.0,Alejandro Hernández,Granada,Espanyol,0,2.0,0,0.4,4-3-3,4-4-2,40.0,60.0,Víctor Díaz,David López
16714,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,23586.0,Jesús Gil,Real Sociedad,Atlético Madrid,1,2.3,2,0.8,4-4-2◆,3-5-2,61.0,39.0,Asier Illarramendi,Koke
16715,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,31305.0,Carlos del Cerro,Sevilla,Athletic Club,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain


In [11]:
# Load the overall scores table (rows already carry Home / Away columns), order
# it chronologically and drop the export artefacts. Written without
# `inplace=True` so nothing is mutated behind an earlier reference.
SCORES_OVERALL_DATA = os.path.join(FILE_PATH, "scores_overall.csv")
scores_overall_df = (
    pd.read_csv(SCORES_OVERALL_DATA)
    .sort_values(["Date", "Time"])
    .drop(columns=["Unnamed: 0", "Match Report", "Notes"])
)

# Discard rows that have no week number.
# NOTE(review): presumably these are separator / non-fixture rows -- confirm.
scores_overall_df = scores_overall_df.loc[scores_overall_df['Wk'].notnull()]

# Same date window as the per-team table: start exclusive, end inclusive.
# `Date` is still text here (parsed with "%Y-%m-%d" in the next cell), and
# zero-padded ISO dates compare chronologically as strings.
start_date = '2000-08-29'
end_date = '2022-05-30'
mask = (scores_overall_df['Date'] > start_date) & (scores_overall_df['Date'] <= end_date)

# `.copy()` so the following cells add/overwrite columns on an independent
# frame rather than on a slice (avoids SettingWithCopyWarning).
scores_overall_df = scores_overall_df.loc[mask].copy()
scores_overall_df

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
4613,1.0,Sat,2000-09-09,,Zaragoza,,1–2,,Espanyol,,La Romareda,Turienzo Álvarez
4614,1.0,Sat,2000-09-09,,Barcelona,,2–1,,Málaga,,Nou Camp,Mejuto González
4615,1.0,Sat,2000-09-09,,La Coruña,,2–0,,Athletic Club,,Nuevo Riazor,Andradas Asurmendi
4616,1.0,Sat,2000-09-09,,Real Madrid,,2–1,,Valencia,,Santiago Bernabéu,Daudén Ibáñez
4617,1.0,Sat,2000-09-09,,Real Sociedad,,2–2,,Racing Sant,,Anoeta,Prados García
...,...,...,...,...,...,...,...,...,...,...,...,...
1255,38.0,Sun,2022-05-22,20:00,Alavés,0.4,0–1,1.9,Cádiz,11138.0,Estadio de Mendizorroza,José Sánchez
1256,38.0,Sun,2022-05-22,20:00,Granada,2.0,0–0,0.4,Espanyol,17951.0,Estadio Nuevo Los Cármenes,Alejandro Hernández
1257,38.0,Sun,2022-05-22,22:00,Real Sociedad,2.3,1–2,0.8,Atlético Madrid,23586.0,Estadio Municipal de Anoeta,Jesús Gil
1258,38.0,Sun,2022-05-22,22:00,Barcelona,0.7,0–2,0.7,Villarreal,54850.0,Camp Nou,José Luis Munuera


In [12]:
# Parse the text columns: `Date` becomes a datetime column, `Time` is reduced
# to plain time-of-day objects (rows without a time stay null).
scores_overall_df = scores_overall_df.assign(
    Date=pd.to_datetime(scores_overall_df['Date'], format="%Y-%m-%d"),
    Time=pd.to_datetime(scores_overall_df['Time']).dt.time,
)
scores_overall_df.columns

Index(['Wk', 'Day', 'Date', 'Time', 'Home', 'xG', 'Score', 'xG.1', 'Away',
       'Attendance', 'Venue', 'Referee'],
      dtype='object')

In [13]:

# `Score` is text such as "2–1": home goals, an en dash (U+2013), away goals.
_goal_parts = scores_overall_df['Score'].str.split('–', expand=True)

# Store the goals as numbers rather than strings. `scores_df_merge_final` keeps
# HomeGoal / AwayGoal as integers, and pandas will not join text keys against
# integer keys, so the two tables need matching dtypes before they can be merged.
scores_overall_df[['HomeGoal', 'AwayGoal']] = _goal_parts.apply(pd.to_numeric)

# Align the xG column names with `scores_df_merge_final` (xG -> home side,
# xG.1 -> away side, following the Home / Score / Away column order).
scores_overall_df = scores_overall_df.rename(columns={'xG': 'xG_Home', 'xG.1': 'xG_Away'})
scores_overall_df

Unnamed: 0,Wk,Day,Date,Time,Home,xG_Home,Score,xG_Away,Away,Attendance,Venue,Referee,HomeGoal,AwayGoal
4613,1.0,Sat,2000-09-09,,Zaragoza,,1–2,,Espanyol,,La Romareda,Turienzo Álvarez,1,2
4614,1.0,Sat,2000-09-09,,Barcelona,,2–1,,Málaga,,Nou Camp,Mejuto González,2,1
4615,1.0,Sat,2000-09-09,,La Coruña,,2–0,,Athletic Club,,Nuevo Riazor,Andradas Asurmendi,2,0
4616,1.0,Sat,2000-09-09,,Real Madrid,,2–1,,Valencia,,Santiago Bernabéu,Daudén Ibáñez,2,1
4617,1.0,Sat,2000-09-09,,Real Sociedad,,2–2,,Racing Sant,,Anoeta,Prados García,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,38.0,Sun,2022-05-22,20:00:00,Alavés,0.4,0–1,1.9,Cádiz,11138.0,Estadio de Mendizorroza,José Sánchez,0,1
1256,38.0,Sun,2022-05-22,20:00:00,Granada,2.0,0–0,0.4,Espanyol,17951.0,Estadio Nuevo Los Cármenes,Alejandro Hernández,0,0
1257,38.0,Sun,2022-05-22,22:00:00,Real Sociedad,2.3,1–2,0.8,Atlético Madrid,23586.0,Estadio Municipal de Anoeta,Jesús Gil,1,2
1258,38.0,Sun,2022-05-22,22:00:00,Barcelona,0.7,0–2,0.7,Villarreal,54850.0,Camp Nou,José Luis Munuera,0,2


# Merge Tables
Merge the overall scores table (`scores_overall_df`) with the one-row-per-match table built from the per-team scores (`scores_df_merge_final`).


In [14]:
# Show both column sets, separated by a blank line, to compare the candidate merge keys.
print(scores_overall_df.columns, scores_df_merge_final.columns, sep="\n\n")

Index(['Wk', 'Day', 'Date', 'Time', 'Home', 'xG_Home', 'Score', 'xG_Away',
       'Away', 'Attendance', 'Venue', 'Referee', 'HomeGoal', 'AwayGoal'],
      dtype='object')

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Attendance', 'Referee', 'Home',
       'Away', 'HomeGoal', 'xG_Home', 'AwayGoal', 'xG_Away', 'HomeFormation',
       'AwayFormation', 'HomePossession', 'AwayPossession', 'HomeCaptain',
       'AwayCaptain'],
      dtype='object')


In [15]:
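# NOTE(review): this merge is currently disabled. Before re-enabling it, verify that
# every key column has the same dtype in both frames (e.g. HomeGoal / AwayGoal, Time);
# pandas refuses to merge text (object) keys against integer keys.
# scores_merge = scores_overall_df.merge(scores_df_merge_final, on = ["Date", "Time", "Day", "Referee", "Attendance", "Home", "Away",
#                                                        "HomeGoal", "AwayGoal", "xG_Home", "xG_Away"], how = "inner")
# scores_merge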