In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
import sys

sys.path.insert(0, os.path.realpath('..\src'))
from database.database_config import DB_NAME, DB_USER, DB_PASSWORD, DB_HOST

In [2]:
db = create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")
conn = db.connect()

In [3]:
possession_df = pd.read_sql("select * from laligadb.laliga.possession order by _date_", con=conn)
possession_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,...,carries_prog,carries_one_third,carries_cpa,carries_miss,carries_dis,receiving_target,receiving_rec,receiving_rec_percent,receiving_prog,team
0,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,,,,,,,,,,Valencia
1,2014-08-23,23:00:00,La Liga,Matchweek 1,Sat,Home,D,1,1,Espanyol,...,,,,,,,,,,Almeria
2,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Away,L,1,2,Granada,...,,,,,,,,,,Deportivo La Coruna
3,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Home,D,1,1,Valencia,...,,,,,,,,,,Sevilla
4,2014-08-23,21:00:00,La Liga,Matchweek 1,Sat,Home,W,2,1,La Coruña,...,,,,,,,,,,Granada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5457,2022-10-02,14:00:00,La Liga,Matchweek 7,Sun,Home,D,2,2,Valencia,...,14.0,9.0,2.0,8.0,10.0,311.0,228.0,73.3,27.0,Espanyol
5458,2022-10-03,21:00:00,La Liga,Matchweek 7,Mon,Home,W,2,1,Elche,...,65.0,21.0,1.0,18.0,11.0,553.0,493.0,89.2,53.0,Rayo Vallecano
5459,2022-10-03,21:00:00,La Liga,Matchweek 7,Mon,Away,L,1,2,Rayo Vallecano,...,18.0,8.0,3.0,14.0,15.0,202.0,167.0,82.7,14.0,Elche
5460,2022-10-07,21:00:00,La Liga,Matchweek 8,Fri,Home,L,1,2,Valencia,...,9.0,3.0,1.0,26.0,11.0,276.0,247.0,89.5,18.0,Osasuna


In [4]:
start_date = pd.to_datetime('2017-08-01').date()
end_date = pd.to_datetime('2022-05-30').date()
mask = (possession_df['_date_'] > start_date) & (possession_df['_date_'] <= end_date)
possession_df = possession_df.loc[mask]
possession_df

Unnamed: 0,_date_,_time_,comp,round,_day_,venue,_result_,gf,ga,opponent,...,carries_prog,carries_one_third,carries_cpa,carries_miss,carries_dis,receiving_target,receiving_rec,receiving_rec_percent,receiving_prog,team
1520,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,21.0,5.0,1.0,15.0,15.0,309.0,250.0,80.9,23.0,Alaves
1521,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,52.0,15.0,2.0,15.0,12.0,499.0,451.0,90.4,22.0,Las Palmas
1522,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,61.0,21.0,12.0,13.0,6.0,468.0,402.0,85.9,41.0,Valencia
1523,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,24.0,8.0,1.0,24.0,17.0,352.0,305.0,86.6,26.0,Leganes
1524,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,32.0,17.0,3.0,5.0,12.0,401.0,297.0,74.1,18.0,Espanyol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,88.0,25.0,12.0,14.0,5.0,693.0,647.0,93.4,45.0,Barcelona
5316,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Osasuna,...,27.0,9.0,4.0,14.0,5.0,270.0,211.0,78.1,23.0,Mallorca
5317,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,1,Cádiz,...,41.0,8.0,4.0,12.0,10.0,402.0,356.0,88.6,26.0,Alaves
5318,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,25.0,7.0,2.0,16.0,13.0,366.0,316.0,86.3,13.0,Getafe


In [5]:
possession_df['_date_'] = pd.to_datetime(possession_df['_date_'], format = "%Y-%m-%d")
possession_df['_time_'] = pd.to_datetime(possession_df['_time_'],format = "%H:%M:%S").dt.time
possession_df['gf'] = pd.to_numeric(possession_df['gf'], errors = 'coerce', downcast= 'integer')
possession_df['ga'] = pd.to_numeric(possession_df['ga'], errors = 'coerce', downcast= 'integer')
possession_df['poss'] = pd.to_numeric(possession_df['poss'], errors = 'coerce', downcast= 'integer')
possession_df['touches'] = pd.to_numeric(possession_df['touches'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_def_pen'] = pd.to_numeric(possession_df['touches_def_pen'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_def_third'] = pd.to_numeric(possession_df['touches_def_third'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_mid_third'] = pd.to_numeric(possession_df['touches_mid_third'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_att_third'] = pd.to_numeric(possession_df['touches_att_third'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_att_pen'] = pd.to_numeric(possession_df['touches_att_pen'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['touches_live'] = pd.to_numeric(possession_df['touches_live'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['dribbles_succ'] = pd.to_numeric(possession_df['dribbles_succ'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['dribbles_att'] = pd.to_numeric(possession_df['dribbles_att'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['dribbles_players_num'] = pd.to_numeric(possession_df['dribbles_players_num'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['dribbles_megs'] = pd.to_numeric(possession_df['dribbles_megs'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries'] = pd.to_numeric(possession_df['carries'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_tot_dist'] = pd.to_numeric(possession_df['carries_tot_dist'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_prog_dist'] = pd.to_numeric(possession_df['carries_prog_dist'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_prog'] = pd.to_numeric(possession_df['carries_prog'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_one_third'] = pd.to_numeric(possession_df['carries_one_third'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_cpa'] = pd.to_numeric(possession_df['carries_cpa'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_miss'] = pd.to_numeric(possession_df['carries_miss'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['carries_dis'] = pd.to_numeric(possession_df['carries_dis'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['receiving_target'] = pd.to_numeric(possession_df['receiving_target'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['receiving_rec'] = pd.to_numeric(possession_df['receiving_rec'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df['receiving_prog'] = pd.to_numeric(possession_df['receiving_prog'], errors = 'coerce', downcast= 'integer').astype('Int64')
possession_df.dtypes

_date_                   datetime64[ns]
_time_                           object
comp                             object
round                            object
_day_                            object
venue                            object
_result_                         object
gf                                 int8
ga                                 int8
opponent                         object
poss                               int8
touches                           Int64
touches_def_pen                   Int64
touches_def_third                 Int64
touches_mid_third                 Int64
touches_att_third                 Int64
touches_att_pen                   Int64
touches_live                      Int64
dribbles_succ                     Int64
dribbles_att                      Int64
dribbles_succ_percent           float64
dribbles_players_num              Int64
dribbles_megs                     Int64
carries                           Int64
carries_tot_dist                  Int64


In [6]:
possession_df['team'] = possession_df['team'].replace({'Alaves': 'Alavés', 'Almeria': 'Almería', 'Atletico Madrid' : 'Atlético Madrid', 'Real Betis' : 'Betis', 
                                               'Cadiz' : 'Cádiz', 'Cordoba': 'Córdoba', 'Gimnastic' : 'Gimnàstic', 'Deportivo La Coruna' : 'La Coruña',
                                              'Hercules' : 'Hércules', 'Leganes' : 'Leganés', 'Malaga' : 'Málaga', 'Racing Santander' : 'Racing Sant', 'Sporting Gijon' : 'Sporting Gijón'})

In [7]:
possession_merge = possession_df.merge(possession_df, left_on = ["_date_", "_time_", "comp", "_day_", "round", "team"], right_on = ["_date_", "_time_", "comp", "_day_", "round", "opponent"],
                                 suffixes = ("_team", "_opponent"), how = "inner")
possession_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,carries_prog_opponent,carries_one_third_opponent,carries_cpa_opponent,carries_miss_opponent,carries_dis_opponent,receiving_target_opponent,receiving_rec_opponent,receiving_rec_percent_opponent,receiving_prog_opponent,team_opponent
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,24,8,1,24,17,352,305,86.6,26,Leganés
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,61,21,12,13,6,468,402,85.9,41,Valencia
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,52,15,2,15,12,499,451,90.4,22,Las Palmas
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,21,5,1,15,15,309,250,80.9,23,Alavés
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,51,9,4,7,13,666,556,83.5,36,Sevilla
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,14,2,4,12,5,300,254,84.7,18,Villarreal
3796,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Osasuna,...,50,14,1,9,14,492,406,82.5,40,Osasuna
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,1,Cádiz,...,33,18,3,12,9,408,357,87.5,24,Cádiz
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,58,18,5,11,6,514,472,91.8,43,Elche


In [8]:
possession_merge['home'] = np.where(possession_merge['venue_team'] == 'Home', possession_merge['team_team'], possession_merge['opponent_team'])
possession_merge['homegoal'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['gf_team'], possession_merge['ga_team'])
possession_merge['homegoal'] = possession_merge['homegoal'].astype(float).astype(int)
possession_merge['homepossession'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['poss_team'], possession_merge['poss_opponent'])
possession_merge['home_touches'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_team'], possession_merge['touches_opponent'])
possession_merge['home_touches_def_pen'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_def_pen_team'], possession_merge['touches_def_pen_opponent'])
possession_merge['home_touches_def_third'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_def_third_team'], possession_merge['touches_def_third_opponent'])
possession_merge['home_touches_mid_third'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_mid_third_team'], possession_merge['touches_mid_third_opponent'])
possession_merge['home_touches_att_third'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_att_third_team'], possession_merge['touches_att_third_opponent'])
possession_merge['home_touches_att_pen'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_att_pen_team'], possession_merge['touches_att_pen_opponent'])
possession_merge['home_touches_live'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['touches_live_team'], possession_merge['touches_live_opponent'])
possession_merge['home_dribbles_success'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['dribbles_succ_team'], possession_merge['dribbles_succ_opponent'])
possession_merge['home_dribbles_att'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['dribbles_att_team'], possession_merge['dribbles_att_opponent'])
possession_merge['home_dribbles_players_number'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['dribbles_players_num_team'], possession_merge['dribbles_players_num_opponent'])
possession_merge['home_dribbles_megs'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['dribbles_megs_team'], possession_merge['dribbles_megs_opponent'])
possession_merge['home_carries'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_team'], possession_merge['carries_opponent'])
possession_merge['home_carries_tot_dist'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_tot_dist_team'], possession_merge['carries_tot_dist_opponent'])
possession_merge['home_carries_prog_dist'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_prog_dist_team'], possession_merge['carries_prog_dist_opponent'])
possession_merge['home_carries_prog'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_prog_team'], possession_merge['carries_prog_opponent'])
possession_merge['home_carries_one_third'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_one_third_team'], possession_merge['carries_one_third_opponent'])
possession_merge['home_carries_cpa'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_cpa_team'], possession_merge['carries_cpa_opponent'])
possession_merge['home_carries_miss'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_miss_team'], possession_merge['carries_miss_opponent'])
possession_merge['home_carries_dis'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['carries_dis_team'], possession_merge['carries_dis_opponent'])
possession_merge['home_receiving_target'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['receiving_target_team'], possession_merge['receiving_target_opponent'])
possession_merge['home_receiving_rec'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['receiving_rec_team'], possession_merge['receiving_rec_opponent'])
possession_merge['home_receiving_prog'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['receiving_rec_team'], possession_merge['receiving_rec_opponent'])


possession_merge['away'] = np.where(possession_merge['team_team'] == possession_merge['home'], possession_merge['opponent_team'], possession_merge['team_team'])
possession_merge['awaygoal'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['gf_team'], possession_merge['ga_team'])
possession_merge['awaygoal'] = possession_merge['awaygoal'].astype(float).astype(int)
possession_merge['awaypossession'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['poss_team'], possession_merge['poss_opponent'])
possession_merge['away_touches'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_team'], possession_merge['touches_opponent'])
possession_merge['away_touches_def_pen'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_def_pen_team'], possession_merge['touches_def_pen_opponent'])
possession_merge['away_touches_def_third'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_def_third_team'], possession_merge['touches_def_third_opponent'])
possession_merge['away_touches_mid_third'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_mid_third_team'], possession_merge['touches_mid_third_opponent'])
possession_merge['away_touches_att_third'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_att_third_team'], possession_merge['touches_att_third_opponent'])
possession_merge['away_touches_att_pen'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_att_pen_team'], possession_merge['touches_att_pen_opponent'])
possession_merge['away_touches_live'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['touches_live_team'], possession_merge['touches_live_opponent'])
possession_merge['away_dribbles_success'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['dribbles_succ_team'], possession_merge['dribbles_succ_opponent'])
possession_merge['away_dribbles_att'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['dribbles_att_team'], possession_merge['dribbles_att_opponent'])
possession_merge['away_dribbles_players_number'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['dribbles_players_num_team'], possession_merge['dribbles_players_num_opponent'])
possession_merge['away_dribbles_megs'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['dribbles_megs_team'], possession_merge['dribbles_megs_opponent'])
possession_merge['away_carries'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_team'], possession_merge['carries_opponent'])
possession_merge['away_carries_tot_dist'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_tot_dist_team'], possession_merge['carries_tot_dist_opponent'])
possession_merge['away_carries_prog_dist'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_prog_dist_team'], possession_merge['carries_prog_dist_opponent'])
possession_merge['away_carries_prog'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_prog_team'], possession_merge['carries_prog_opponent'])
possession_merge['away_carries_one_third'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_one_third_team'], possession_merge['carries_one_third_opponent'])
possession_merge['away_carries_cpa'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_cpa_team'], possession_merge['carries_cpa_opponent'])
possession_merge['away_carries_miss'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_miss_team'], possession_merge['carries_miss_opponent'])
possession_merge['away_carries_dis'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['carries_dis_team'], possession_merge['carries_dis_opponent'])
possession_merge['away_receiving_target'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['receiving_target_team'], possession_merge['receiving_target_opponent'])
possession_merge['away_receiving_rec'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['receiving_rec_team'], possession_merge['receiving_rec_opponent'])
possession_merge['away_receiving_prog'] = np.where(possession_merge['team_team'] == possession_merge['away'], possession_merge['receiving_rec_team'], possession_merge['receiving_rec_opponent'])

possession_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,venue_team,_result__team,gf_team,ga_team,opponent_team,...,away_carries_tot_dist,away_carries_prog_dist,away_carries_prog,away_carries_one_third,away_carries_cpa,away_carries_miss,away_carries_dis,away_receiving_target,away_receiving_rec,away_receiving_prog
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Leganés,...,1542,758,21,5,1,15,15,309,250,250
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Away,L,0,1,Valencia,...,2691,1439,52,15,2,15,12,499,451,451
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Las Palmas,...,2691,1439,52,15,2,15,12,499,451,451
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Home,W,1,0,Alavés,...,1542,758,21,5,1,15,15,309,250,250
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Away,D,1,1,Sevilla,...,1531,755,32,17,3,5,12,401,297,297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Home,L,0,2,Villarreal,...,1158,503,14,2,4,12,5,300,254,254
3796,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Away,W,2,0,Osasuna,...,1164,595,27,9,4,14,5,270,211,211
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Home,L,0,1,Cádiz,...,2204,1126,33,18,3,12,9,408,357,357
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Away,L,1,3,Elche,...,1458,752,25,7,2,16,13,366,316,316


In [9]:
# for i in possession_merge.columns:
#     print(i)

In [10]:
# Drop unnecessary columns
possession_merge = possession_merge.loc[:, ["_date_", "_time_", "comp", "round", "_day_", "home", "away", "homegoal",
                        "homepossession", "home_touches", "home_touches_def_pen", "home_touches_def_third", "home_touches_mid_third", "home_touches_att_third", "home_touches_att_pen",
                        "home_touches_live", "home_dribbles_success", "home_dribbles_att", "home_dribbles_players_number", "home_dribbles_megs", "home_carries", "home_carries_tot_dist",
                        "home_carries_prog_dist", "home_carries_prog", "home_carries_one_third", "home_carries_cpa", "home_carries_miss", "home_carries_dis", "home_receiving_target", "home_receiving_rec",
                        "home_receiving_prog",
                        "awaygoal", "awaypossession", "away_touches", "away_touches_def_pen", "away_touches_def_third", "away_touches_mid_third", "away_touches_att_third", "away_touches_att_pen",
                        "away_touches_live", "away_dribbles_success", "away_dribbles_att", "away_dribbles_players_number", "away_dribbles_megs", "away_carries", "away_carries_tot_dist",
                        "away_carries_prog_dist", "away_carries_prog", "away_carries_one_third", "away_carries_cpa", "away_carries_miss", "away_carries_dis", "away_receiving_target", "away_receiving_rec",
                        "away_receiving_prog"]]
possession_merge

Unnamed: 0,_date_,_time_,comp,round,_day_,home,away,homegoal,homepossession,home_touches,...,away_carries_tot_dist,away_carries_prog_dist,away_carries_prog,away_carries_one_third,away_carries_cpa,away_carries_miss,away_carries_dis,away_receiving_target,away_receiving_rec,away_receiving_prog
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,54,566,...,1542,758,21,5,1,15,15,309,250,250
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,48,597,...,2691,1439,52,15,2,15,12,499,451,451
2,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,48,597,...,2691,1439,52,15,2,15,12,499,451,451
3,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,54,566,...,1542,758,21,5,1,15,15,309,250,250
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Sevilla,Espanyol,1,62,787,...,1531,755,32,17,3,5,12,401,297,297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Barcelona,Villarreal,0,70,815,...,1158,503,14,2,4,12,5,300,254,254
3796,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Osasuna,Mallorca,0,62,616,...,1164,595,27,9,4,14,5,270,211,211
3797,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Alavés,Cádiz,0,49,563,...,2204,1126,33,18,3,12,9,408,357,357
3798,2022-05-22,17:30:00,La Liga,Matchweek 38,Sun,Elche,Getafe,3,59,654,...,1458,752,25,7,2,16,13,366,316,316


In [11]:
possession_merge_final = possession_merge.drop_duplicates()
possession_merge_final.sort_values(['_date_', '_time_'])

Unnamed: 0,_date_,_time_,comp,round,_day_,home,away,homegoal,homepossession,home_touches,...,away_carries_tot_dist,away_carries_prog_dist,away_carries_prog,away_carries_one_third,away_carries_cpa,away_carries_miss,away_carries_dis,away_receiving_target,away_receiving_rec,away_receiving_prog
0,2017-08-18,20:15:00,La Liga,Matchweek 1,Fri,Leganés,Alavés,1,54,566,...,1542,758,21,5,1,15,15,309,250,250
1,2017-08-18,22:15:00,La Liga,Matchweek 1,Fri,Valencia,Las Palmas,1,48,597,...,2691,1439,52,15,2,15,12,499,451,451
5,2017-08-19,18:15:00,La Liga,Matchweek 1,Sat,Celta Vigo,Real Sociedad,2,53,591,...,1896,1013,30,10,3,14,7,419,352,352
7,2017-08-19,20:15:00,La Liga,Matchweek 1,Sat,Girona,Atlético Madrid,2,53,619,...,1892,1169,44,15,3,5,8,406,347,347
4,2017-08-19,22:15:00,La Liga,Matchweek 1,Sat,Sevilla,Espanyol,1,62,787,...,1531,755,32,17,3,5,12,401,297,297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3790,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Alavés,Cádiz,0,49,563,...,2204,1126,33,18,3,12,9,408,357,357
3792,2022-05-22,20:00:00,La Liga,Matchweek 38,Sun,Osasuna,Mallorca,0,62,616,...,1164,595,27,9,4,14,5,270,211,211
3786,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Real Sociedad,Atlético Madrid,1,61,741,...,1390,530,20,6,0,19,11,363,281,281
3788,2022-05-22,22:00:00,La Liga,Matchweek 38,Sun,Sevilla,Athletic Club,1,65,827,...,1287,722,28,14,5,11,10,307,292,292


In [12]:
possession_merge_final.groupby(possession_merge_final['_date_'].dt.year).agg({'count'})['_date_']

Unnamed: 0_level_0,count
_date_,Unnamed: 1_level_1
2017,169
2018,380
2019,391
2020,355
2021,407
2022,198


In [13]:
possession_merge_final.to_pickle(os.path.join(os.path.realpath('../data/preprocessed/'), 'possession_merge.pkl'))