# Transform the Scores By Team
We want to get rid of the duplicate match results so they align with our scores_overall data.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
import sys

# Make the project's `src` directory importable from the notebook's working directory.
# The relative path is assembled with os.path.join so it works on any OS and does not
# rely on the invalid "\s" escape sequence inside a plain string literal.
sys.path.insert(0, os.path.realpath(os.path.join('..', 'src')))
from database.database_config import DB_NAME, DB_USER, DB_PASSWORD, DB_HOST

In [2]:
# Assemble the PostgreSQL (psycopg2 driver) URL from the imported database settings,
# create the engine, and open the connection used by the query in the next cell.
connection_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"
db = create_engine(connection_url)
conn = db.connect()

In [3]:
# Read every row of the scores_by_team table, ordered by match date.
scores_query = "select * from laligadb.laliga.scores_by_team order by _date_"
scores_by_team_df = pd.read_sql(scores_query, con=conn)
scores_by_team_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,team
0,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Home,D,1.0,1.0,Valencia,,,47.0,29352.0,,,Carlos del Cerro,Sevilla
1,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Away,L,1.0,2.0,Granada,,,47.0,14800.0,,,Iñaki Bikandi,Deportivo La Coruna
2,2014-08-23,23:00:00,La Liga,Matchweek 1,Sat,Home,D,1.0,1.0,Espanyol,,,57.0,12000.0,,,Alejandro Hernández,Almeria
3,2014-08-23,19:00:00,La Liga,Matchweek 1,Sat,Home,W,1.0,0.0,Athletic Club,,,40.0,24500.0,,,Antonio Matéu Lahoz,Malaga
4,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Home,W,2.0,1.0,La Coruña,,,53.0,14800.0,,,Iñaki Bikandi,Granada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295,2022-10-23,21:00:00,La Liga,Matchweek 11,Sun,Away,,,,Barcelona,,,,,,,,Athletic Club
6296,2022-10-23,18:30:00,La Liga,Matchweek 11,Sun,Home,,,,Osasuna,,,,,,,,Girona
6297,2022-10-23,18:30:00,La Liga,Matchweek 11,Sun,Away,,,,Girona,,,,,,,,Osasuna
6298,2022-10-24,21:00:00,La Liga,Matchweek 11,Mon,Home,,,,Getafe,,,,,,,,Celta Vigo


In [4]:
# Show the dtype pandas assigned to each column of the frame read from the database.
scores_by_team_df.dtypes

_date_         object
_time_         object
comp           object
round          object
_day_          object
venue          object
_result_       object
gf            float64
ga            float64
opponent       object
xg            float64
xga           float64
poss          float64
attendance    float64
captain        object
formation      object
referee        object
team           object
dtype: object

## Select Season Data
We want to keep only matches played after 2017-08-01 and up to 2022-05-30, which removes the older seasons as well as any data from the current season.

In [5]:
# Date window for the rows we keep: strictly after start_date, up to and including end_date.
start_date = pd.to_datetime('2017-08-01').date()
end_date = pd.to_datetime('2022-05-30').date()
mask = (scores_by_team_df['_date_'] > start_date) & (scores_by_team_df['_date_'] <= end_date)
# Take an explicit copy: later cells assign columns on this frame, and doing that on a
# filtered slice triggers pandas' SettingWithCopyWarning (silenced by the warnings filter
# in the import cell) and leaves it ambiguous whether a view or a copy is being modified.
scores_by_team_df = scores_by_team_df.loc[mask].copy()
scores_by_team_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,team
2280,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1.0,0.0,Alavés,1.4,1.2,54.0,9231.0,Martín Mantovani,4-2-3-1,José Munuera,Leganes
2281,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0.0,1.0,Valencia,0.3,1.9,52.0,35971.0,Jonathan Viera,4-5-1,Jesús Gil,Las Palmas
2282,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0.0,1.0,Leganés,1.2,1.4,46.0,9231.0,Manu García,4-4-2,José Munuera,Alaves
2283,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1.0,0.0,Las Palmas,1.9,0.3,48.0,35971.0,Daniel Parejo,4-4-2,Jesús Gil,Valencia
2284,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1.0,1.0,Sevilla,1.1,2.1,38.0,30487.0,Gerard Moreno,4-4-2,Alejandro Hernández,Espanyol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,D,0.0,0.0,Espanyol,2.0,0.4,40.0,17951.0,Víctor Díaz,4-3-3,Alejandro Hernández,Granada
6076,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,1.0,2.0,Atlético Madrid,2.3,0.8,61.0,23586.0,Asier Illarramendi,4-4-2◆,Jesús Gil,Real Sociedad
6077,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0.0,2.0,Villarreal,0.7,0.7,70.0,54850.0,Sergio Busquets,4-3-3,José Luis Munuera,Barcelona
6078,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0.0,1.0,Cádiz,0.4,1.9,49.0,11138.0,Rubén Duarte,4-2-3-1,José Sánchez,Alaves


## Convert data types
Make sure our data types in pandas are consistent with the database store

In [6]:
# Convert the columns in one `assign` call, which returns a new frame instead of writing
# column-by-column into the filtered frame from the previous cell (that pattern raises
# SettingWithCopyWarning, which this notebook silences). Column order is unchanged because
# `assign` replaces existing columns in place.
#   _date_      -> datetime64[ns]
#   _time_      -> datetime.time objects (object dtype)
#   gf / ga     -> platform integer (requires no missing goals in the selected seasons)
#   attendance  -> nullable Int64; unparseable values become <NA>
# `downcast='integer'` was dropped: the result is cast to Int64 straight away, so the
# intermediate downcast had no effect on the final dtype or values.
scores_by_team_df = scores_by_team_df.assign(
    _date_=pd.to_datetime(scores_by_team_df['_date_'], format="%Y-%m-%d"),
    _time_=pd.to_datetime(scores_by_team_df['_time_'], format="%H:%M:%S").dt.time,
    gf=scores_by_team_df['gf'].astype('int'),
    ga=scores_by_team_df['ga'].astype('int'),
    attendance=pd.to_numeric(scores_by_team_df['attendance'], errors='coerce').astype('Int64'),
)
scores_by_team_df.dtypes

_date_        datetime64[ns]
_time_                object
comp                  object
round                 object
_day_                 object
venue                 object
_result_              object
gf                     int32
ga                     int32
opponent              object
xg                   float64
xga                  float64
poss                 float64
attendance             Int64
captain               object
formation             object
referee               object
team                  object
dtype: object

## Check team names
Some of our team names are not consistent. We have to fix these

For example,

Notice how `Atletico Madrid` in the `team` column is spelled differently from `Atlético Madrid` in the `opponent` column.

In [7]:
# Count rows per spelling in the `team` column so it can be compared with `opponent`.
scores_by_team_df['team'].value_counts()

Real Madrid            190
Atletico Madrid        190
Villarreal             190
Real Betis             190
Athletic Club          190
Getafe                 190
Barcelona              190
Levante                190
Celta Vigo             190
Real Sociedad          190
Sevilla                190
Valencia               190
Alaves                 190
Espanyol               152
Eibar                  152
Osasuna                114
Granada                114
Valladolid             114
Leganes                114
Rayo Vallecano          76
Huesca                  76
Mallorca                76
Girona                  76
Cadiz                   76
Elche                   76
Malaga                  38
Las Palmas              38
Deportivo La Coruna     38
Name: team, dtype: int64

In [8]:
# Count rows per spelling in the `opponent` column; these are the spellings `team` is aligned to.
scores_by_team_df['opponent'].value_counts()

Alavés             190
Real Sociedad      190
Villarreal         190
Levante            190
Barcelona          190
Valencia           190
Getafe             190
Athletic Club      190
Real Madrid        190
Betis              190
Celta Vigo         190
Atlético Madrid    190
Sevilla            190
Espanyol           152
Eibar              152
Granada            114
Leganés            114
Valladolid         114
Osasuna            114
Rayo Vallecano      76
Cádiz               76
Mallorca            76
Elche               76
Huesca              76
Girona              76
Málaga              38
Las Palmas          38
La Coruña           38
Name: opponent, dtype: int64

In [9]:
# Spellings found in the `team` column mapped to the accented / shortened forms
# (the forms the `opponent` column uses for the clubs listed in the counts above).
team_name_fixes = {
    'Alaves': 'Alavés',
    'Almeria': 'Almería',
    'Atletico Madrid': 'Atlético Madrid',
    'Real Betis': 'Betis',
    'Cadiz': 'Cádiz',
    'Cordoba': 'Córdoba',
    'Gimnastic': 'Gimnàstic',
    'Deportivo La Coruna': 'La Coruña',
    'Hercules': 'Hércules',
    'Leganes': 'Leganés',
    'Malaga': 'Málaga',
    'Racing Santander': 'Racing Sant',
    'Sporting Gijon': 'Sporting Gijón',
}
scores_by_team_df['team'] = scores_by_team_df['team'].replace(team_name_fixes)

In [10]:

# Self-join: pair every team row with the row whose `opponent` is that team and whose
# fixture details (date, time, competition, day, round, referee, attendance) are equal.
# Non-key columns from the left row get the "_team" suffix, from the right row "_opponent".
fixture_keys_head = ["_date_", "_time_", "comp", "_day_", "round"]
fixture_keys_tail = ["referee", "attendance"]
scores_df_merge = scores_by_team_df.merge(
    scores_by_team_df,
    how="inner",
    left_on=fixture_keys_head + ["team"] + fixture_keys_tail,
    right_on=fixture_keys_head + ["opponent"] + fixture_keys_tail,
    suffixes=("_team", "_opponent"),
)
scores_df_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,_result__opponent,gf_opponent,ga_opponent,opponent_opponent,xg_opponent,xga_opponent,poss_opponent,captain_opponent,formation_opponent,team_opponent
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,L,0,1,Leganés,1.2,1.4,46.0,Manu García,4-4-2,Alavés
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,W,1,0,Las Palmas,1.9,0.3,48.0,Daniel Parejo,4-4-2,Valencia
2,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,W,1,0,Alavés,1.4,1.2,54.0,Martín Mantovani,4-2-3-1,Leganés
3,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,L,0,1,Valencia,0.3,1.9,52.0,Jonathan Viera,4-5-1,Las Palmas
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,D,1,1,Espanyol,2.1,1.1,62.0,Jesús Navas,4-1-4-1,Sevilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,D,0,0,Espanyol,...,D,0,0,Granada,0.4,2.0,60.0,David López,4-4-2,Espanyol
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,1,2,Atlético Madrid,...,W,2,1,Real Sociedad,0.8,2.3,39.0,Koke,3-5-2,Atlético Madrid
3797,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,W,2,0,Barcelona,0.7,0.7,30.0,Mario Gaspar,4-4-2,Villarreal
3798,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,1,Cádiz,...,W,1,0,Alavés,1.9,0.4,51.0,Álex Fernández,4-4-2,Cádiz


In [11]:
# Re-express each row from the fixture's point of view (home / away) instead of the
# left-hand team's point of view.
# The home side is the row's own team when its venue is 'Home', otherwise its opponent.
scores_df_merge['home'] = np.where(
    scores_df_merge['venue_team'] == 'Home',
    scores_df_merge['team_team'],
    scores_df_merge['opponent_team'],
)
team_is_home = scores_df_merge['team_team'] == scores_df_merge['home']
scores_df_merge['away'] = np.where(
    team_is_home, scores_df_merge['opponent_team'], scores_df_merge['team_team']
)
team_is_away = scores_df_merge['team_team'] == scores_df_merge['away']

# (new column, mask, source column when the mask is True, source column otherwise, cast to int?)
# Rows are listed in the order the columns must be appended to the frame.
side_column_specs = [
    ('homegoal', team_is_home, 'gf_team', 'ga_team', True),
    ('xg_home', team_is_home, 'xg_team', 'xga_team', False),
    ('awaygoal', team_is_away, 'gf_team', 'ga_team', True),
    ('xg_away', team_is_away, 'xg_team', 'xga_team', False),
    ('homeformation', team_is_home, 'formation_team', 'formation_opponent', False),
    ('awayformation', team_is_away, 'formation_team', 'formation_opponent', False),
    ('homepossession', team_is_home, 'poss_team', 'poss_opponent', False),
    ('awaypossession', team_is_away, 'poss_team', 'poss_opponent', False),
    ('homecaptain', team_is_home, 'captain_team', 'captain_opponent', False),
    ('awaycaptain', team_is_away, 'captain_team', 'captain_opponent', False),
]
for new_col, side_mask, col_if_true, col_if_false, cast_to_int in side_column_specs:
    scores_df_merge[new_col] = np.where(
        side_mask, scores_df_merge[col_if_true], scores_df_merge[col_if_false]
    )
    if cast_to_int:
        # Goal counts are stored as integers.
        scores_df_merge[new_col] = scores_df_merge[new_col].astype(float).astype(int)

scores_df_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,homegoal,xg_home,awaygoal,xg_away,homeformation,awayformation,homepossession,awaypossession,homecaptain,awaycaptain
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,1,1.4,0,1.2,4-2-3-1,4-4-2,54.0,46.0,Martín Mantovani,Manu García
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,1,1.9,0,0.3,4-4-2,4-5-1,48.0,52.0,Daniel Parejo,Jonathan Viera
2,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,1,1.4,0,1.2,4-2-3-1,4-4-2,54.0,46.0,Martín Mantovani,Manu García
3,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,1,1.9,0,0.3,4-4-2,4-5-1,48.0,52.0,Daniel Parejo,Jonathan Viera
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,1,2.1,1,1.1,4-1-4-1,4-4-2,62.0,38.0,Jesús Navas,Gerard Moreno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,D,0,0,Espanyol,...,0,2.0,0,0.4,4-3-3,4-4-2,40.0,60.0,Víctor Díaz,David López
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,1,2,Atlético Madrid,...,1,2.3,2,0.8,4-4-2◆,3-5-2,61.0,39.0,Asier Illarramendi,Koke
3797,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,0,0.7,2,0.7,4-3-3,4-4-2,70.0,30.0,Sergio Busquets,Mario Gaspar
3798,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,1,Cádiz,...,0,0.4,1,1.9,4-2-3-1,4-4-2,49.0,51.0,Rubén Duarte,Álex Fernández


In [12]:
# Keep only the fixture-level columns; the per-team "_team" / "_opponent" columns
# produced by the merge are no longer needed.
match_columns = [
    "_date_", "_time_", "comp", "round", "_day_", "attendance", "referee",
    "home", "away",
    "homegoal", "xg_home", "awaygoal", "xg_away",
    "homeformation", "awayformation",
    "homepossession", "awaypossession",
    "homecaptain", "awaycaptain",
]
scores_df_merge = scores_df_merge[match_columns]
scores_df_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,attendance,referee,home,away,homegoal,xg_home,awaygoal,xg_away,homeformation,awayformation,homepossession,awaypossession,homecaptain,awaycaptain
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,9231,José Munuera,Leganés,Alavés,1,1.4,0,1.2,4-2-3-1,4-4-2,54.0,46.0,Martín Mantovani,Manu García
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,35971,Jesús Gil,Valencia,Las Palmas,1,1.9,0,0.3,4-4-2,4-5-1,48.0,52.0,Daniel Parejo,Jonathan Viera
2,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,9231,José Munuera,Leganés,Alavés,1,1.4,0,1.2,4-2-3-1,4-4-2,54.0,46.0,Martín Mantovani,Manu García
3,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,35971,Jesús Gil,Valencia,Las Palmas,1,1.9,0,0.3,4-4-2,4-5-1,48.0,52.0,Daniel Parejo,Jonathan Viera
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,30487,Alejandro Hernández,Sevilla,Espanyol,1,2.1,1,1.1,4-1-4-1,4-4-2,62.0,38.0,Jesús Navas,Gerard Moreno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,17951,Alejandro Hernández,Granada,Espanyol,0,2.0,0,0.4,4-3-3,4-4-2,40.0,60.0,Víctor Díaz,David López
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,23586,Jesús Gil,Real Sociedad,Atlético Madrid,1,2.3,2,0.8,4-4-2◆,3-5-2,61.0,39.0,Asier Illarramendi,Koke
3797,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,54850,José Luis Munuera,Barcelona,Villarreal,0,0.7,2,0.7,4-3-3,4-4-2,70.0,30.0,Sergio Busquets,Mario Gaspar
3798,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,11138,José Sánchez,Alavés,Cádiz,0,0.4,1,1.9,4-2-3-1,4-4-2,49.0,51.0,Rubén Duarte,Álex Fernández


In [13]:
# After the home/away reshaping, the two per-team rows of a fixture carry identical
# values (see rows 0 and 2 in the output above), so dropping exact duplicate rows should
# leave one row per match. The original index labels are kept (not reset).
scores_df_merge_final = scores_df_merge.drop_duplicates()
scores_df_merge_final

Unnamed: 0,_date_,_time_,comp,round,_day_,attendance,referee,home,away,homegoal,xg_home,awaygoal,xg_away,homeformation,awayformation,homepossession,awaypossession,homecaptain,awaycaptain
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,9231,José Munuera,Leganés,Alavés,1,1.4,0,1.2,4-2-3-1,4-4-2,54.0,46.0,Martín Mantovani,Manu García
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,35971,Jesús Gil,Valencia,Las Palmas,1,1.9,0,0.3,4-4-2,4-5-1,48.0,52.0,Daniel Parejo,Jonathan Viera
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,30487,Alejandro Hernández,Sevilla,Espanyol,1,2.1,1,1.1,4-1-4-1,4-4-2,62.0,38.0,Jesús Navas,Gerard Moreno
5,2017-08-19,20:15:00,La Liga,Matchweek 1,Sat,11511,Juan Martínez,Girona,Atlético Madrid,2,2.1,2,0.8,3-4-3,4-4-2,53.0,47.0,Álex Granell,Gabi
7,2017-08-19,18:15:00,La Liga,Matchweek 1,Sat,16961,Antonio Matéu Lahoz,Celta Vigo,Real Sociedad,2,1.6,3,2.4,4-3-3,4-3-3,53.0,47.0,Hugo Mallo,Xabi Prieto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3788,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,31305,Carlos del Cerro,Sevilla,Athletic Club,1,1.4,0,0.7,4-3-3,4-4-2,65.0,35.0,Jesús Navas,Iker Muniain
3790,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,18717,César Soto,Osasuna,Mallorca,0,0.8,2,1.3,4-3-3,3-5-2,62.0,38.0,Oier Sanjurjo,Manolo Reina
3791,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,54850,José Luis Munuera,Barcelona,Villarreal,0,0.7,2,0.7,4-3-3,4-4-2,70.0,30.0,Sergio Busquets,Mario Gaspar
3794,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,17951,Alejandro Hernández,Granada,Espanyol,0,2.0,0,0.4,4-3-3,4-4-2,40.0,60.0,Víctor Díaz,David López


In [14]:
# Sanity check: number of matches per calendar year of the match date.
match_year = scores_df_merge_final['_date_'].dt.year
scores_df_merge_final.groupby(match_year)['_date_'].agg(['count'])

Unnamed: 0_level_0,count
_date_,Unnamed: 1_level_1
2017,169
2018,380
2019,391
2020,355
2021,407
2022,198


In [15]:
# Show the resolved output directory. NOTE(review): the rendered output is an absolute
# local path (includes a user name) — consider clearing it before sharing the notebook.
os.path.realpath('../data/preprocessed/')

'C:\\Users\\ntlg4\\PycharmProjects\\laliga\\data\\preprocessed'

In [16]:
# Write the de-duplicated match table to a pickle file in the preprocessed-data directory.
preprocessed_dir = os.path.realpath('../data/preprocessed/')
pickle_path = os.path.join(preprocessed_dir, 'scores_by_team_merge.pkl')
scores_df_merge_final.to_pickle(pickle_path)