In [1]:
import os
import sys
# Point both the import search path and the working directory one level up.
# Both '../' values are relative, so running this cell a second time in the same
# kernel would climb one more directory and append a duplicate sys.path entry.
# The flag below makes the cell safe to re-run; a kernel restart clears it.
if not globals().get('_PARENT_DIR_ACTIVATED', False):
    sys.path.append('../')
    os.chdir('../')
    _PARENT_DIR_ACTIVATED = True

In [2]:
import duckdb 
import pandas as pd
import duck_db_helper

In [3]:
# Show every column and use a wide console when frames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)

# Load every table of the exploitation-zone database into memory.
# df_list[i] holds the contents of the table named df_list_names[i].
conn = duckdb.connect('../data/exploitation_zone/exploitation_zone.db')
try:
    tables = duck_db_helper.get_tables(conn)
    df_list = []
    df_list_names = []
    for table_name in tables:
        # The table name is double-quoted so unusual identifiers are still valid SQL.
        df_list.append(conn.sql(f'SELECT * FROM "{table_name}";').df())
        df_list_names.append(table_name)
finally:
    # Always release the database file, even if listing or reading a table fails.
    conn.close()

In [4]:
# Display the names of the loaded tables (same order as the frames in df_list).
df_list_names

['football_matches', 'players']

In [5]:
# Pick the frames by table name rather than by list position, so a different
# table order (or an extra table in the database) cannot silently swap them.
# A missing table raises a KeyError right here instead of failing later.
frames_by_table = dict(zip(df_list_names, df_list))
dfm = frames_by_table['football_matches']  # Matches
dfp = frames_by_table['players']  # Players

# Project only the wanted columns for the analysis

First we drop all the unused columns, i.e. those that are either redundant or don't have any real value for the analysis. We also remove all the betting odds and keep just an average (which was calculated in previous steps).

## Players Projections
We will get rid of some of the stats, such as fixture, kickoff time and transfers.

In [6]:
# Player columns that are not needed for the analysis.
unused_players_cols = [
    'fixture',
    'kickoff_time',
    'transfers_balance',
    'transfers_in',
    'transfers_out',
]

# Keep only the names that are really present, so the drop never fails on a missing column.
unused_players_cols = list(filter(lambda col: col in dfp.columns, unused_players_cols))
dfp = dfp.drop(columns=unused_players_cols)

## Matches Projections
Here we will get rid of some other statistics and some redundant betting odds data.

In [7]:
# Match columns that are not needed for the analysis.
unused_matches_cols = [
    'Div', 'Time', 'HTHG', 'HTAG', 'HTR', 'Referee',
    'HS', 'AS', 'HST', 'AST', 'HF', 'AF',
    'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
]
# Restrict the list to columns that exist before dropping them.
unused_matches_cols = list(filter(lambda col: col in dfm.columns, unused_matches_cols))
dfm = dfm.drop(columns=unused_matches_cols)


# Betting-odds columns to remove.
unused_matches_betting_odds = [
    'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA',
    'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
    'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA',
    'MaxH', 'MaxD', 'MaxA',
    'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5',
    'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5',
    'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA',
    'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA',
    'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA',
    'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA',
    'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA',
    'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA',
    'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5',
    'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5',
    'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA',
    'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA',
]
# Same existence check as above, evaluated against the already reduced frame.
unused_matches_betting_odds = list(
    filter(lambda col: col in dfm.columns, unused_matches_betting_odds)
)
dfm = dfm.drop(columns=unused_matches_betting_odds)

# Selections of the only wanted rows for the analysis

## Selections on Players

### Players active in the last two years (2022-2023)

First we are only going to select the entries belonging to the last two years, as that is the only period for which we have data in the matches table.


In [8]:
# Parse the whole Date column first and only then take the extremes. The pandas
# reductions are vectorised and skip missing values, and text dates are compared
# as dates rather than alphabetically.
match_dates = pd.to_datetime(dfm['Date'])
start = match_dates.min()
end = match_dates.max()

print(f"Between {start} and {end}")

# Keep only the player rows whose match_date lies inside the window covered by
# the matches table (both ends inclusive).
dfp = dfp[dfp['match_date'].between(start, end)]

Between 2022-01-09 00:00:00 and 2023-12-03 00:00:00


### Players that played at least 10 minutes in 10 different matches

For this analysis we are going to select only the players that have played at least 10 minutes in at least 10 matches, as players who played less than that will probably have less robust statistics.

In [9]:
# Player rows with at least 10 minutes played.
dfp_10 = dfp.loc[dfp['minutes'].ge(10)]

In [10]:
# Number of qualifying rows (10+ minutes) per player name.
name_counts = dfp_10.groupby('name').size()

# Names that reach the 10-row threshold.
at_least_10_games = name_counts.loc[name_counts.ge(10)]
wanted_players = list(at_least_10_games.index)

# Keep every row of those players, including the rows with fewer than 10 minutes.
dfp = dfp.loc[dfp['name'].isin(wanted_players)]

# Saving data
We are going to save the data in DuckDB.

In [11]:
analytical_sandbox_db = '../data/analytical_sandboxes/analytical_sandbox_zone.db'

# Make sure the target folder exists; connecting fails if the parent directory is missing.
os.makedirs(os.path.dirname(analytical_sandbox_db), exist_ok=True)

# Write the projected and filtered frames to the analytical sandbox database.
conn = duckdb.connect(analytical_sandbox_db)
try:
    duck_db_helper.create_table('matches', dfm, conn)
    duck_db_helper.create_table('players', dfp, conn)
finally:
    # Release the database file even if one of the writes fails.
    conn.close()