In [1]:
import os
import sys
# Add '../' to the module search path and move the working directory up one level.
# NOTE(review): this cell is not idempotent — every re-run of `os.chdir('../')`
# moves the working directory up one more level, so run it once per kernel.
# NOTE(review): the relative '../data/...' paths used by later cells are resolved
# against the directory set here — confirm they point where intended.
sys.path.append('../')
os.chdir('../')

In [2]:
import duckdb 
import pandas as pd
import duck_db_helper

In [3]:
# Load every table of the exploitation-zone database into a pandas DataFrame.
# `df_list[i]` is the content of the table named `df_list_names[i]`.
conn = duckdb.connect('../data/exploitation_zone/exploitation_zone.db')
df_list = []
df_list_names = []
try:
    tables = duck_db_helper.get_tables(conn)
    for table_name in tables:
        # The table name is double-quoted so it is passed as a quoted identifier.
        df = conn.sql(f"SELECT * FROM \"{table_name}\";").df()
        df_list.append(df)
        df_list_names.append(table_name)
finally:
    # Always release the connection, even when listing or reading a table
    # fails, so the kernel does not keep the database file open after an error.
    conn.close()

In [4]:
# Positional lookup: the first loaded table is taken as Matches, the second as Players.
# NOTE(review): this relies on the order in which the tables were loaded — confirm
# against `df_list_names`.
dff, dfp = df_list[0], df_list[1]

# Join the matches and football players tables
Now we merge the match data into each football player's statistics for that match.

In [5]:
# Inner-join the player rows with the match rows twice: once where the player's
# team is the home team, once where it is the away team (same date in both cases).
dfr_home = dfp.merge(
    dff,
    how='inner',
    left_on=['team_x', 'match_date'],
    right_on=['HomeTeam', 'Date'],
)
dfr_away = dfp.merge(
    dff,
    how='inner',
    left_on=['team_x', 'match_date'],
    right_on=['AwayTeam', 'Date'],
)

In [6]:
# Stack the home-side and away-side joins into a single frame with a fresh index.
dfr = pd.concat((dfr_home, dfr_away), axis=0, ignore_index=True)

# Project only the wanted columns for the analysis

First we drop all the columns that are redundant or have no real value for the analysis. We also remove all the betting odds columns and keep only the average odds (which were calculated in previous steps).

In [7]:
# Columns that are not used in the analysis.
unused_cols = [
    'match_date', 'fixture', 'kickoff_time',
    'transfers_balance', 'transfers_in', 'transfers_out',
    'Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam',
    'HTHG', 'HTAG', 'HTR', 'Referee',
    'HS', 'AS', 'HST', 'AST', 'HF', 'AF',
    'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
]

# Keep only the names that exist in `dfr`, so dropping never fails on a missing column.
unused_cols = [name for name in unused_cols if name in dfr.columns]
dfr = dfr.drop(columns=unused_cols)

# Betting odds columns to discard (AvgH / AvgD / AvgA are not listed, so they stay).
unused_betting_odds = [
    'B365H', 'B365D', 'B365A',
    'BWH', 'BWD', 'BWA',
    'IWH', 'IWD', 'IWA',
    'PSH', 'PSD', 'PSA',
    'WHH', 'WHD', 'WHA',
    'VCH', 'VCD', 'VCA',
    'MaxH', 'MaxD', 'MaxA',
    'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5',
    'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5',
    'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA',
    'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA',
    'B365CH', 'B365CD', 'B365CA',
    'BWCH', 'BWCD', 'BWCA',
    'IWCH', 'IWCD', 'IWCA',
    'PSCH', 'PSCD', 'PSCA',
    'WHCH', 'WHCD', 'WHCA',
    'VCCH', 'VCCD', 'VCCA',
    'MaxCH', 'MaxCD', 'MaxCA',
    'AvgCH', 'AvgCD', 'AvgCA',
    'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5',
    'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5',
    'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA',
    'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA',
]

# Same existence check as above, evaluated against the already-reduced frame.
unused_betting_odds = [name for name in unused_betting_odds if name in dfr.columns]
dfr = dfr.drop(columns=unused_betting_odds)


# Select the valid players

For this analysis we select only the players who have played at least 10 minutes in at least 10 matches.

In [8]:
# Rows where the player's `minutes` value is 10 or more.
dfr_10 = dfr.loc[dfr['minutes'].ge(10)]
dfr_10

Unnamed: 0,team_x,opp_team_name,was_home,name,position,assists,bonus,bps,clean_sheets,creativity,element,goals_conceded,goals_scored,ict_index,influence,minutes,own_goals,penalties_missed,penalties_saved,yellow_cards,red_cards,round,saves,selected,threat,total_points,value,FTHG,FTAG,FTR,AvgH,AvgD,AvgA,PRESS,WDIR,WSPD,CLOUD,TEMP,TDEW
1,Southampton,Leeds,1,Che Adams,FWD,0,0,6,0,13.5,411,2,0,2.2,5.2,90,0,0,0,0,0,2,0,53680,3,2,64,2,2,D,2.32,3.51,3.10,1015.6,NW,3.0,0.0,16.4,11.1
3,Southampton,Leeds,1,Roméo Lavia,MID,0,0,10,0,11.2,321,2,0,2.1,9.6,90,0,0,0,0,0,2,0,35569,0,2,45,2,2,D,2.32,3.51,3.10,1015.6,NW,3.0,0.0,16.4,11.1
4,Southampton,Leeds,1,Joe Ayodele-Aribo,MID,0,1,23,0,2.1,512,0,1,9.5,39.2,29,0,0,0,0,0,2,0,54303,54,7,55,2,2,D,2.32,3.51,3.10,1015.6,NW,3.0,0.0,16.4,11.1
5,Southampton,Leeds,1,Stuart Armstrong,MID,0,0,5,0,17.4,405,2,0,4.2,1.4,60,0,0,0,0,0,2,0,10063,23,2,50,2,2,D,2.32,3.51,3.10,1015.6,NW,3.0,0.0,16.4,11.1
7,Southampton,Leeds,1,Mohammed Salisu,DEF,0,0,6,0,0.7,420,2,0,2.0,19.6,90,0,0,0,1,0,2,0,14622,0,0,45,2,2,D,2.32,3.51,3.10,1015.6,NW,3.0,0.0,16.4,11.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16164,Man City,Brentford,0,Aymeric Laporte,DEF,0,0,17,0,0.0,309,1,0,0.0,0.0,90,0,0,0,0,0,38,0,63072,0,2,56,1,0,H,3.88,4.01,1.87,1023.4,NE,3.0,0.0,12.1,8.1
16169,Man City,Brentford,0,Rico Lewis,DEF,0,0,11,0,0.0,573,1,0,0.0,0.0,90,0,0,0,0,0,38,0,132924,0,2,38,1,0,H,3.88,4.01,1.87,1023.4,NE,3.0,0.0,12.1,8.1
16174,Man City,Brentford,0,Cole Palmer,MID,0,0,11,0,0.0,316,1,0,0.0,0.0,90,0,0,0,0,0,38,0,17425,0,2,42,1,0,H,3.88,4.01,1.87,1023.4,NE,3.0,0.0,12.1,8.1
16177,Man City,Brentford,0,Riyad Mahrez,MID,0,0,8,0,0.0,303,1,0,0.0,0.0,90,0,0,0,0,0,38,0,1014626,0,2,75,1,0,H,3.88,4.01,1.87,1023.4,NE,3.0,0.0,12.1,8.1


In [10]:
# Count the qualifying (10+ minute) rows per player name, then keep the names
# that reach 10 or more of them.
name_counts = dfr_10.groupby(by='name').size()
at_least_10_games = name_counts.loc[name_counts.ge(10)]

In [11]:
# Keep every row of `dfr` (not only the 10+ minute rows) whose player name
# is among the selected players.
wanted_players = at_least_10_games.index.tolist()
dfr_only_wanted_players = dfr.loc[dfr['name'].isin(wanted_players)]

In [12]:
# Hand the filtered dataset to `duck_db_helper.create_table` under the name
# 'football_matches', using a connection to the analytical sandbox database.
analytical_sandbox_db = '../data/analytical_sandboxes/analytical_sandbox_zone.db'
conn = duckdb.connect(analytical_sandbox_db)
try:
    duck_db_helper.create_table('football_matches', dfr_only_wanted_players, conn)
finally:
    # Close the connection even if the helper raises, so the database file is
    # not left open by the kernel.
    conn.close()