In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
import sys

# Put ``../src`` (relative to the working directory) at the front of the import
# path.  The path is assembled with os.path.join instead of the literal
# '..\src': that literal relied on the invalid escape sequence ``\s`` and only
# pointed at a directory on Windows, where backslash is a path separator.
sys.path.insert(0, os.path.realpath(os.path.join('..', 'src')))
from database.database_config import DB_NAME, DB_USER, DB_PASSWORD, DB_HOST

In [2]:
# Open a SQLAlchemy engine and a connection to the PostgreSQL database.
# The user name and password are percent-encoded before being placed in the
# URL so that characters such as '@', ':' or '/' inside them cannot be
# mistaken for URL delimiters.  Plain alphanumeric values are left unchanged.
# NOTE(review): assumes the config values are stored raw, i.e. not already
# percent-encoded -- confirm against database_config.
from urllib.parse import quote_plus

db = create_engine(
    f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}"
)
conn = db.connect()

In [3]:
# Load every row of the shooting table, ordered by the _date_ column.
shooting_query = "select * from laligadb.laliga.shooting order by _date_"
shooting_df = pd.read_sql(sql=shooting_query, con=conn)
shooting_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,...,distance,free_kicks,penalty_kicks,penalty_kicks_attempt,xg,nonpenalty_xg,nonpenalty_xg_per_shot,goals_minus_xg,nonpenalty_goals_minus_xg,team
0,2014-08-23,19:00:00,La Liga,Matchweek 1,Sat,Home,W,1,0,Athletic Club,...,,,0,1,,,,,,Malaga
1,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Home,D,1,1,Valencia,...,,,0,0,,,,,,Sevilla
2,2014-08-23,23:00:00,La Liga,Matchweek 1,Sat,Home,D,1,1,Espanyol,...,,,0,0,,,,,,Almeria
3,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,,,0,0,,,,,,Valencia
4,2014-08-23,23:00:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Almería,...,,,0,0,,,,,,Espanyol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6217,2022-10-02,18:30:00,La Liga,Matchweek 7,Sun,Home,L,3,5,Real Sociedad,...,16.7,0.0,0,0,1.1,1.1,0.07,1.9,1.9,Girona
6218,2022-10-03,21:00:00,La Liga,Matchweek 7,Mon,Home,W,2,1,Elche,...,18.5,0.0,0,0,0.8,0.8,0.05,1.2,1.2,Rayo Vallecano
6219,2022-10-03,21:00:00,La Liga,Matchweek 7,Mon,Away,L,1,2,Rayo Vallecano,...,16.3,0.0,0,0,0.6,0.6,0.11,0.4,0.4,Elche
6220,2022-10-07,21:00:00,La Liga,Matchweek 8,Fri,Away,W,2,1,Osasuna,...,19.8,0.0,0,1,2.2,1.5,0.10,-0.2,0.5,Valencia


In [4]:
# Keep only rows dated after 2017-08-01 (exclusive) and up to 2022-05-30
# (inclusive).
start_date = pd.to_datetime('2017-08-01').date()
end_date = pd.to_datetime('2022-05-30').date()

# Compare as timestamps rather than against datetime.date objects: this gives
# the same rows for a column of plain dates, and keeps working if the column
# has already been converted to datetime64 (e.g. when this cell is re-run).
match_dates = pd.to_datetime(shooting_df['_date_'])
mask = (match_dates > pd.Timestamp(start_date)) & (match_dates <= pd.Timestamp(end_date))

# .copy() makes the filtered frame independent of the full one, so later
# column assignments do not operate on a slice (SettingWithCopyWarning).
shooting_df = shooting_df.loc[mask].copy()
shooting_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,...,distance,free_kicks,penalty_kicks,penalty_kicks_attempt,xg,nonpenalty_xg,nonpenalty_xg_per_shot,goals_minus_xg,nonpenalty_goals_minus_xg,team
2280,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,20.8,2.0,0,0,1.4,1.4,0.10,-0.4,-0.4,Leganes
2281,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,15.9,2.0,0,0,1.9,1.9,0.09,-0.9,-0.9,Valencia
2282,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,22.9,1.0,0,0,0.3,0.3,0.06,-0.3,-0.3,Las Palmas
2283,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,21.7,0.0,0,1,1.2,0.4,0.05,-1.2,-0.4,Alaves
2284,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,16.1,0.0,0,0,1.1,1.1,0.13,-0.1,-0.1,Espanyol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,19.1,0.0,0,0,0.7,0.7,0.05,-0.7,-0.7,Barcelona
6076,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Barcelona,...,12.5,0.0,0,0,0.7,0.7,0.17,1.3,1.3,Villarreal
6077,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Mallorca,...,14.6,0.0,0,0,0.8,0.8,0.07,-0.8,-0.8,Osasuna
6078,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,20.3,1.0,0,0,1.3,1.3,0.16,-0.3,-0.3,Getafe


In [5]:
# Normalise column dtypes.
# All conversions go through a single ``assign`` call, which returns a new
# frame.  shooting_df may be a filtered slice of a larger frame, and assigning
# columns into such a slice one by one triggers pandas' SettingWithCopyWarning
# (silenced globally in this notebook, so it would go unnoticed).

# Count columns stored with the nullable Int64 dtype; unparseable values are
# coerced to missing.  No ``downcast`` is needed here because the result is
# cast to Int64 regardless of the intermediate integer width.
nullable_count_cols = ['shots', 'shots_on_target', 'free_kicks',
                       'penalty_kicks', 'penalty_kicks_attempt']

shooting_df = shooting_df.assign(
    _date_=pd.to_datetime(shooting_df['_date_'], format="%Y-%m-%d"),
    _time_=pd.to_datetime(shooting_df['_time_'], format="%H:%M:%S").dt.time,
    # Goals for / against: downcast to the smallest integer type that fits.
    gf=pd.to_numeric(shooting_df['gf'], errors='coerce', downcast='integer'),
    ga=pd.to_numeric(shooting_df['ga'], errors='coerce', downcast='integer'),
    **{
        col: pd.to_numeric(shooting_df[col], errors='coerce').astype('Int64')
        for col in nullable_count_cols
    },
)
shooting_df.dtypes

_date_                       datetime64[ns]
_time_                               object
comp                                 object
round                                object
_day_                                object
venue                                object
_result_                             object
gf                                     int8
ga                                     int8
opponent                             object
goals                                 int64
shots                                 Int64
shots_on_target                       Int64
shots_on_target_percent             float64
goals_per_shot                      float64
goals_per_shot_on_target            float64
distance                            float64
free_kicks                            Int64
penalty_kicks                         Int64
penalty_kicks_attempt                 Int64
xg                                  float64
nonpenalty_xg                       float64
nonpenalty_xg_per_shot          

In [6]:
# Replace unaccented / long-form team names in the ``team`` column with the
# spellings listed below (presumably the spellings used in the ``opponent``
# column, so the two can be joined -- verify against the data).
team_name_fixes = {
    'Alaves': 'Alavés',
    'Almeria': 'Almería',
    'Atletico Madrid': 'Atlético Madrid',
    'Real Betis': 'Betis',
    'Cadiz': 'Cádiz',
    'Cordoba': 'Córdoba',
    'Gimnastic': 'Gimnàstic',
    'Deportivo La Coruna': 'La Coruña',
    'Hercules': 'Hércules',
    'Leganes': 'Leganés',
    'Malaga': 'Málaga',
    'Racing Santander': 'Racing Sant',
    'Sporting Gijon': 'Sporting Gijón',
}
shooting_df['team'] = shooting_df['team'].replace(to_replace=team_name_fixes)

In [7]:
# Self-join the table so each row also carries the other side's figures for
# the same fixture: a row's ``team`` is matched to the row whose ``opponent``
# is that team, on the same date, time, competition, weekday and round.
# Columns from the left row get the suffix "_team", from the right row
# "_opponent".
fixture_keys = ["_date_", "_time_", "comp", "_day_", "round"]
shooting_merge = pd.merge(
    shooting_df,
    shooting_df,
    how="inner",
    left_on=fixture_keys + ["team"],
    right_on=fixture_keys + ["opponent"],
    suffixes=("_team", "_opponent"),
)
shooting_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,distance_opponent,free_kicks_opponent,penalty_kicks_opponent,penalty_kicks_attempt_opponent,xg_opponent,nonpenalty_xg_opponent,nonpenalty_xg_per_shot_opponent,goals_minus_xg_opponent,nonpenalty_goals_minus_xg_opponent,team_opponent
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,21.7,0,0,1,1.2,0.4,0.05,-1.2,-0.4,Alavés
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,22.9,1,0,0,0.3,0.3,0.06,-0.3,-0.3,Las Palmas
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,15.9,2,0,0,1.9,1.9,0.09,-0.9,-0.9,Valencia
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,20.8,2,0,0,1.4,1.4,0.10,-0.4,-0.4,Leganés
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,11.6,0,0,0,2.1,2.1,0.24,-1.1,-1.1,Sevilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,12.5,0,0,0,0.7,0.7,0.17,1.3,1.3,Villarreal
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Barcelona,...,19.1,0,0,0,0.7,0.7,0.05,-0.7,-0.7,Barcelona
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Mallorca,...,18.7,1,0,0,1.3,1.3,0.10,0.7,0.7,Mallorca
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,16.7,0,0,0,2.5,2.5,0.14,0.5,0.5,Elche


In [8]:
# Re-express every merged row in home/away terms.  A row describes the fixture
# from ``team_team``'s point of view; ``venue_team`` says whether that team
# played at home.
row_team = shooting_merge['team_team']
row_opponent = shooting_merge['opponent_team']

shooting_merge['home'] = np.where(shooting_merge['venue_team'] == 'Home', row_team, row_opponent)
team_is_home = row_team == shooting_merge['home']
shooting_merge['away'] = np.where(team_is_home, row_opponent, row_team)
team_is_away = row_team == shooting_merge['away']

# (source statistic, home output column, away output column)
side_stats = [
    ('xg', 'xg_home', 'xg_away'),
    ('shots', 'homeshots', 'awayshots'),
    ('shots_on_target', 'homeshots_on_target', 'awayshots_on_target'),
    ('distance', 'home_distance', 'away_distance'),
    ('free_kicks', 'home_freekicks', 'away_freekicks'),
    ('penalty_kicks', 'home_penkicks', 'away_penkicks'),
    ('penalty_kicks_attempt', 'home_penkicksattempt', 'away_penkicksattempt'),
]

# Home columns are written first, then away columns, so the resulting column
# order is: goal column followed by the statistics, per side.
for side_mask, goal_col, out_idx in ((team_is_home, 'homegoal', 1), (team_is_away, 'awaygoal', 2)):
    # Goals: the row team's own goals (gf) when it is the side in question,
    # otherwise the goals it conceded (ga).
    shooting_merge[goal_col] = np.where(side_mask, shooting_merge['gf_team'], shooting_merge['ga_team'])
    shooting_merge[goal_col] = shooting_merge[goal_col].astype(float).astype(int)
    # Other statistics: the row team's value when it is the side in question,
    # otherwise the value from the joined opponent row.
    for stat in side_stats:
        shooting_merge[stat[out_idx]] = np.where(
            side_mask,
            shooting_merge[stat[0] + '_team'],
            shooting_merge[stat[0] + '_opponent'],
        )

shooting_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,home_penkicks,home_penkicksattempt,awaygoal,xg_away,awayshots,awayshots_on_target,away_distance,away_freekicks,away_penkicks,away_penkicksattempt
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,0,0,0,1.2,8,3,21.7,0,0,1
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,0,0,0,0.3,5,4,22.9,1,0,0
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,0,0,0,0.3,5,4,22.9,1,0,0
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,0,0,0,1.2,8,3,21.7,0,0,1
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,0,0,1,1.1,9,6,16.1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,0,0,2,0.7,4,3,12.5,0,0,0
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Barcelona,...,0,0,2,0.7,4,3,12.5,0,0,0
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Mallorca,...,0,0,2,1.3,14,6,18.7,1,0,0
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,0,0,1,1.3,8,2,20.3,1,0,0


In [9]:
# Keep only the fixture identifiers and the home/away columns; everything else
# from the merge is discarded.
fixture_cols = ["_date_", "_time_", "comp", "round", "_day_", "home", "away"]
home_cols = ["homegoal", "xg_home", "homeshots", "homeshots_on_target", "home_distance",
             "home_freekicks", "home_penkicks", "home_penkicksattempt"]
away_cols = ["awaygoal", "xg_away", "awayshots", "awayshots_on_target", "away_distance",
             "away_freekicks", "away_penkicks", "away_penkicksattempt"]
shooting_merge = shooting_merge[fixture_cols + home_cols + away_cols]
shooting_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,home,away,homegoal,xg_home,homeshots,...,home_penkicks,home_penkicksattempt,awaygoal,xg_away,awayshots,awayshots_on_target,away_distance,away_freekicks,away_penkicks,away_penkicksattempt
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,1.4,14,...,0,0,0,1.2,8,3,21.7,0,0,1
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,1.9,22,...,0,0,0,0.3,5,4,22.9,1,0,0
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,1.9,22,...,0,0,0,0.3,5,4,22.9,1,0,0
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,1.4,14,...,0,0,0,1.2,8,3,21.7,0,0,1
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Sevilla,Espanyol,1,2.1,9,...,0,0,1,1.1,9,6,16.1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Barcelona,Villarreal,0,0.7,15,...,0,0,2,0.7,4,3,12.5,0,0,0
3796,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Barcelona,Villarreal,0,0.7,15,...,0,0,2,0.7,4,3,12.5,0,0,0
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Osasuna,Mallorca,0,0.8,11,...,0,0,2,1.3,14,6,18.7,1,0,0
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Elche,Getafe,3,2.5,19,...,0,0,1,1.3,8,2,20.3,1,0,0


In [10]:
# Remove rows that are exact repeats across all columns, keeping the first
# occurrence of each.  The original index labels are retained.
shooting_merge_final = shooting_merge.drop_duplicates(subset=None, keep='first')
shooting_merge_final

Unnamed: 0,_date_,_time_,comp,round,_day_,home,away,homegoal,xg_home,homeshots,...,home_penkicks,home_penkicksattempt,awaygoal,xg_away,awayshots,awayshots_on_target,away_distance,away_freekicks,away_penkicks,away_penkicksattempt
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,1.4,14,...,0,0,0,1.2,8,3,21.7,0,0,1
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,1.9,22,...,0,0,0,0.3,5,4,22.9,1,0,0
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Sevilla,Espanyol,1,2.1,9,...,0,0,1,1.1,9,6,16.1,0,0,0
5,2017-08-19,20:15:00,La Liga,Matchweek 1,Sat,Girona,Atlético Madrid,2,2.1,14,...,0,0,2,0.8,9,3,18.0,0,0,0
6,2017-08-19,18:15:00,La Liga,Matchweek 1,Sat,Celta Vigo,Real Sociedad,2,1.6,16,...,0,0,3,2.4,13,5,17.5,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3788,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Granada,Espanyol,0,2.0,14,...,0,1,0,0.4,10,5,19.5,0,0,0
3789,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Alavés,Cádiz,0,0.4,9,...,0,0,1,1.9,13,6,17.9,0,0,0
3790,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Elche,Getafe,3,2.5,19,...,0,0,1,1.3,8,2,20.3,1,0,0
3794,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Osasuna,Mallorca,0,0.8,11,...,0,0,2,1.3,14,6,18.7,1,0,0


In [11]:
# Number of rows per calendar year of ``_date_``.
# Only the ``_date_`` column is counted (rather than counting every column and
# discarding all but one), and the aggregation is given as a list instead of
# an unordered set.
match_year = shooting_merge_final['_date_'].dt.year
shooting_merge_final.groupby(match_year)['_date_'].agg(['count'])

Unnamed: 0_level_0,count
_date_,Unnamed: 1_level_1
2017,169
2018,380
2019,391
2020,355
2021,407
2022,198


In [12]:
# Persist the de-duplicated frame as a pickle under ../data/preprocessed
# (resolved relative to the working directory).
pickle_dir = os.path.realpath('../data/preprocessed/')
pickle_path = os.path.join(pickle_dir, 'shooting_merge.pkl')
shooting_merge_final.to_pickle(pickle_path)

In [13]:
# Also write the frame as CSV under ../data/preprocessed (resolved relative to
# the working directory); the index is not written.
csv_dir = os.path.realpath('../data/preprocessed/')
csv_path = os.path.join(csv_dir, 'shooting_merge.csv')
shooting_merge_final.to_csv(csv_path, index=False)