In [1]:
# packages
import pandas as pd
import os
import re
import duckdb 

In [2]:
formatted_zone_db = '../data/formatted_zone/formatted_zone.db'
trusted_zone_db = '../data/trusted_zone/trusted_zone.db'

In [34]:
def get_tables(conn):
    """Return the table names reported by ``SHOW TABLES`` on ``conn``.

    Args:
        conn: an object exposing ``sql(query)`` whose result has ``fetchall()``
            (a DuckDB connection in this notebook).

    Returns:
        list: the first column of every row returned by ``SHOW TABLES``.
    """
    names = []
    for row in conn.sql("SHOW TABLES").fetchall():
        # each row is a tuple-like record; the table name is its first field
        names.append(row[0])
    return names

def table_exists(table_name, conn):
    """Tell whether ``table_name`` is among the tables listed for ``conn``.

    Args:
        table_name: name to look for (compared for exact equality).
        conn: connection passed through to ``get_tables``.

    Returns:
        bool: True when ``get_tables(conn)`` contains ``table_name``.
    """
    known_tables = get_tables(conn)
    return table_name in known_tables

def get_table_df(table_name, conn):
    """Read every row of ``table_name`` and return it via the result's ``.df()``.

    Args:
        table_name: table to read; it is wrapped in double quotes in the query.
        conn: an object exposing ``sql(query)`` whose result has ``df()``.

    Returns:
        Whatever ``.df()`` yields for the ``SELECT *`` query
        (presumably a pandas DataFrame for a DuckDB connection).
    """
    query = 'SELECT * FROM "{}";'.format(table_name)
    relation = conn.sql(query)
    return relation.df()

def drop_table(table_name, conn):
    """Drop ``table_name`` on ``conn`` if it exists; do nothing otherwise.

    A single ``DROP TABLE IF EXISTS`` statement is used instead of listing the
    tables first, so the existence check and the drop happen in one statement.

    Args:
        table_name: table to remove; it is wrapped in double quotes in the query.
        conn: an object exposing ``sql(query)`` (a DuckDB connection here).

    Returns:
        None
    """
    conn.sql(f"DROP TABLE IF EXISTS \"{table_name}\"")
    
def create_table(table_name, df, conn, replace=True):
    """Create ``table_name`` on ``conn`` from the contents of ``df``.

    Args:
        table_name: table to create; it is wrapped in double quotes in the query.
        df: the data to load. It is not passed to ``conn.sql`` explicitly: the
            query text refers to it by the name ``df``, so this parameter must
            keep that name (presumably resolved by DuckDB from the local
            Python variable - confirm against the DuckDB Python docs).
        conn: an object exposing ``sql(query)`` (a DuckDB connection here).
        replace: when truthy, an existing table of the same name is replaced;
            when falsy, a plain ``CREATE TABLE`` is issued and the database
            decides what happens if the table already exists.

    Returns:
        None
    """
    # `replace` is evaluated for truthiness (the previous bitwise `&` silently
    # treated even integers such as 2 as "do not replace"), and the swap is a
    # single CREATE OR REPLACE statement rather than a separate drop + create.
    create_clause = "CREATE OR REPLACE TABLE" if replace else "CREATE TABLE"
    conn.sql(f"{create_clause} \"{table_name}\" AS SELECT * FROM df")

def append_table(table_name, df, conn):
    """Insert all rows of ``df`` into the existing table ``table_name``.

    Args:
        table_name: target table; it is wrapped in double quotes in the query.
        df: the rows to insert. The query text refers to it by the name ``df``
            rather than receiving it as an argument, so the parameter name
            matters.
        conn: an object exposing ``sql(query)`` (a DuckDB connection here).

    Returns:
        None
    """
    insert_stmt = 'INSERT INTO "{}" SELECT * FROM df'.format(table_name)
    conn.sql(insert_stmt)

In [4]:
# Gets the table name with the most recent date suffix (the name must end in yyyy-MM-dd),
# ex: for tables named cleaned_merged_seasons_<yyyy-MM-dd> would return cleaned_merged_seasons_2023-09-22
from datetime import datetime
def get_last_table(table_names, fileformat="csv"):
    """Return the name whose trailing ``yyyy-MM-dd`` date is the most recent.

    The last 10 characters of every name are parsed with ``%Y-%m-%d``; the
    name with the latest parsed date wins (the first one, if several tie).

    Args:
        table_names: iterable of names ending in a ``yyyy-MM-dd`` date. Lazy
            iterables such as ``filter`` objects are accepted.
        fileformat: currently unused; kept so existing callers keep working.

    Returns:
        str: the element of ``table_names`` with the latest date suffix.

    Raises:
        ValueError: if ``table_names`` is empty, or if a name does not end in
            a date matching ``%Y-%m-%d``.
    """
    date_format = "%Y-%m-%d"
    suffix_len = len("yyyy-MM-dd")

    # Materialise first so an empty input can be reported clearly instead of
    # surfacing as "max() arg is an empty sequence".
    candidates = list(table_names)
    if not candidates:
        raise ValueError("get_last_table: no table names to choose from")

    return max(
        candidates,
        key=lambda name: datetime.strptime(name[-suffix_len:], date_format),
    )

In [5]:
# get all the tables in the formatted zone
# List every table in the formatted zone once; later cells filter this list by prefix.
conn = duckdb.connect(formatted_zone_db)
try:
    formatted_zone_tables = get_tables(conn)
finally:
    # release the database handle as soon as the listing is done
    conn.close()

In [29]:
# Copy the newest `cleaned_merged_seasons_<yyyy-MM-dd>` table from the formatted
# zone into the trusted zone under the fixed name `cleaned_merged_seasons`.
trusted_zone_table = 'cleaned_merged_seasons'
# a list (not a one-shot `filter` iterator) so the candidates stay inspectable after this cell runs
cleaned_merged_seasons_tables = [t for t in formatted_zone_tables if t.startswith(trusted_zone_table)]
# the data from all seasons is all in one table so we only need to find the latest version of the table
latest_table_name = get_last_table(cleaned_merged_seasons_tables)

conn = duckdb.connect(formatted_zone_db)
try:
    df = get_table_df(latest_table_name, conn)
finally:
    conn.close()
# TODO: add data quality checks before publishing to the trusted zone

conn = duckdb.connect(trusted_zone_db)
try:
    create_table(trusted_zone_table, df, conn)
finally:
    # the trusted-zone connection used to be left open; close it even on failure
    conn.close()

In [30]:
# Copy the newest `master_team_list_<yyyy-MM-dd>` table from the formatted zone
# into the trusted zone under the fixed name `master_team_list`.
trusted_zone_table = 'master_team_list'
# a list (not a one-shot `filter` iterator) so the candidates stay inspectable after this cell runs
master_team_list_tables = [t for t in formatted_zone_tables if t.startswith(trusted_zone_table)]
# a single table holds the whole team list, so only its latest version is needed
latest_table_name = get_last_table(master_team_list_tables)

conn = duckdb.connect(formatted_zone_db)
try:
    df = get_table_df(latest_table_name, conn)
finally:
    conn.close()
# TODO: add data quality checks before publishing to the trusted zone

conn = duckdb.connect(trusted_zone_db)
try:
    create_table(trusted_zone_table, df, conn)
finally:
    # the trusted-zone connection used to be left open; close it even on failure
    conn.close()

In [33]:
# Show which tables the trusted zone currently holds.
conn = duckdb.connect(trusted_zone_db)
try:
    trusted_zone_tables = get_tables(conn)
finally:
    # do not keep the trusted-zone database open after the listing
    conn.close()
trusted_zone_tables

['cleaned_merged_seasons', 'master_team_list']

In [36]:
# Build the trusted-zone `football-data` table: for every season keep only the
# newest formatted-zone snapshot and stack those snapshots into one table.
trusted_zone_table = 'football-data'
date_suffix_len = len("_yyyy-MM-dd")
football_data_tables = [t for t in formatted_zone_tables if t.startswith(trusted_zone_table)]
# Example of the expected naming:
#   ['football-data_2223_2024-01-01', 'football-data_2223_2023-12-31', 'football-data_2223_2024-02-16',
#    'football-data_2324_2025-02-03', 'football-data_2324_2025-06-07']
# the football-data has a table per season, denoted by football-data_yyYY
# group all tables for the same season; sorted so the append order is the same on every run
unique_seasons = sorted({t[:-date_suffix_len] for t in football_data_tables})

trusted_conn = duckdb.connect(trusted_zone_db)
# one formatted-zone connection serves all seasons instead of reconnecting per iteration
formatted_conn = duckdb.connect(formatted_zone_db)
try:
    # start from a clean slate so re-running the cell does not duplicate rows
    drop_table(trusted_zone_table, trusted_conn)

    for season in unique_seasons:
        # exact match on the name without its date suffix, so one season can never
        # pick up the tables of another season that merely shares a prefix
        season_tables = [t for t in football_data_tables if t[:-date_suffix_len] == season]
        # pick the newest table for the season
        latest_table_name = get_last_table(season_tables)

        # retrieve the formatted zone df
        df = get_table_df(latest_table_name, formatted_conn)

        # append all selected tables to a trusted zone df
        # NOTE(review): assumes every season table has the same column layout - confirm
        if table_exists(trusted_zone_table, trusted_conn):
            append_table(trusted_zone_table, df, trusted_conn)
        else:
            create_table(trusted_zone_table, df, trusted_conn)
finally:
    # close both handles even if a season fails to load
    formatted_conn.close()
    trusted_conn.close()

In [None]:
# Build the trusted-zone `Metoffice` table: for every group of snapshots that
# share a name (everything before the `_yyyy-MM-dd` suffix) keep only the
# newest one and stack those snapshots into one table.
trusted_zone_table = 'Metoffice'
date_suffix_len = len("_yyyy-MM-dd")
metoffice_tables = [t for t in formatted_zone_tables if t.startswith(trusted_zone_table)]
# group all tables for the same month; sorted so the append order is the same on every run
unique_met_months = sorted({t[:-date_suffix_len] for t in metoffice_tables})

trusted_conn = duckdb.connect(trusted_zone_db)
# one formatted-zone connection serves all months instead of reconnecting per iteration
formatted_conn = duckdb.connect(formatted_zone_db)
try:
    # start from a clean slate so re-running the cell does not duplicate rows
    drop_table(trusted_zone_table, trusted_conn)

    for month in unique_met_months:
        # exact match on the name without its date suffix: a plain prefix test
        # would let a key ending in "_1" also collect the tables of "_10", "_11", ...
        month_tables = [t for t in metoffice_tables if t[:-date_suffix_len] == month]

        # pick the newest table for the month
        latest_table_name = get_last_table(month_tables)

        df = get_table_df(latest_table_name, formatted_conn)

        # append all selected tables to a trusted zone df
        # (the former extra `df.to_sql(..., trusted_zone_conn, ...)` call used an
        # undefined connection and would have appended every frame a second time)
        # NOTE(review): assumes every monthly table has the same column layout - confirm
        if table_exists(trusted_zone_table, trusted_conn):
            append_table(trusted_zone_table, df, trusted_conn)
        else:
            create_table(trusted_zone_table, df, trusted_conn)
finally:
    # close the connections that were actually opened, even if a month fails
    formatted_conn.close()
    trusted_conn.close()