In [4]:
# packages
import pandas as pd
import os
import re
import sqlite3

In [None]:
# Locations of the SQLite databases used by this notebook. Both paths are
# relative, so they resolve against the kernel's current working directory.
formatted_zone_db = "../data/formatted_zone/formatted_zone.db"  # source of the dated tables
trusted_zone_db = "../data/trusted_zone/trusted_zone.db"  # destination database

In [19]:
from datetime import datetime


def get_last_table(table_names, fileformat="csv"):
    """Return the name that carries the most recent trailing date stamp.

    Every name is expected to end in a ``yyyy-MM-dd`` date, optionally
    followed by a ``.<fileformat>`` extension. For example, given
    ``cleaned_merged_seasons_2023-09-22`` and
    ``cleaned_merged_seasons_2023-10-13`` the latter is returned.

    Parameters
    ----------
    table_names : iterable of str
        Candidate table (or file) names. One-shot iterators such as a
        ``filter`` object are accepted.
    fileformat : str, optional
        Extension (without the leading dot) that is ignored when locating
        the date stamp, so ``name_2023-09-22.csv`` is handled as well.

    Returns
    -------
    str
        The element of ``table_names`` with the latest date, unchanged.
        On ties the first such element is returned.

    Raises
    ------
    ValueError
        If ``table_names`` is empty, or a name does not end in a valid date.
    """
    date_format = "%Y-%m-%d"
    date_length = len("yyyy-MM-dd")
    extension = f".{fileformat}" if fileformat else ""

    def trailing_date(name):
        # Drop the optional extension so the date is the last thing in the name.
        if extension and name.endswith(extension):
            name = name[: -len(extension)]
        return datetime.strptime(name[-date_length:], date_format)

    most_recent = max(table_names, key=trailing_date, default=None)
    if most_recent is None:
        raise ValueError("get_last_table() received no table names to choose from")
    return most_recent

In [20]:
# Collect the names of all tables currently stored in the formatted zone.
conn = sqlite3.connect(formatted_zone_db)
try:
    c = conn.cursor()
    c.execute("SELECT name FROM sqlite_master WHERE type='table';")
    # fetchall() returns 1-tuples; keep just the table name of each row
    formatted_zone_tables = [row[0] for row in c.fetchall()]
finally:
    # release the database handle even when the query fails
    conn.close()

In [22]:
trusted_zone_table = 'cleaned_merged_seasons'
cleaned_merged_seasons_tables = [
    t for t in formatted_zone_tables if t.startswith(trusted_zone_table)
]
# the data from all seasons lives in a single table, so only the latest
# dated version of that table is needed
latest_table_name = get_last_table(cleaned_merged_seasons_tables)

# Read the selected table from the formatted zone.
# pd.read_sql_table does not support plain DBAPI (sqlite3) connections, so a
# SELECT is issued instead; the name is quoted because it contains hyphens.
conn = sqlite3.connect(formatted_zone_db)
try:
    df = pd.read_sql_query(
        'SELECT * FROM "{}"'.format(latest_table_name.replace('"', '""')),
        conn,
    )
finally:
    conn.close()

# TODO: data quality checks go here

# Write the result to the trusted zone through a fresh, open connection
# (the formatted zone connection above is already closed at this point).
# NOTE(review): to_sql also stores the DataFrame index as a column by
# default — confirm that is wanted.
conn = sqlite3.connect(trusted_zone_db)
try:
    df.to_sql(trusted_zone_table, conn, if_exists='replace')
finally:
    conn.close()

cleaned_merged_seasons_2023-10-13


In [24]:
trusted_zone_table = 'master_team_list'
master_team_list_tables = [
    t for t in formatted_zone_tables if t.startswith(trusted_zone_table)
]
# master_team_list is also a single table; only its latest dated version is needed
latest_table_name = get_last_table(master_team_list_tables)

# Read the selected table from the formatted zone (the dated tables are
# listed from that database, not from the trusted zone).
# pd.read_sql_table does not support plain DBAPI (sqlite3) connections, so a
# SELECT is issued instead; the name is quoted because it contains hyphens.
conn = sqlite3.connect(formatted_zone_db)
try:
    df = pd.read_sql_query(
        'SELECT * FROM "{}"'.format(latest_table_name.replace('"', '""')),
        conn,
    )
finally:
    conn.close()

# TODO: data quality checks go here

# Write the result to the trusted zone through a fresh, open connection.
# NOTE(review): to_sql also stores the DataFrame index as a column by
# default — confirm that is wanted.
conn = sqlite3.connect(trusted_zone_db)
try:
    df.to_sql(trusted_zone_table, conn, if_exists='replace')
finally:
    conn.close()

master_team_list_2023-10-13


In [55]:
trusted_zone_table = 'football-data'
football_data_tables = [
    t for t in formatted_zone_tables if t.startswith(trusted_zone_table)
]
# TEST football_data_tables = ['football-data_2223_2024-01-01', 'football-data_2223_2023-12-31', 'football-data_2223_2024-02-16', 'football-data_2324_2025-02-03', 'football-data_2324_2025-06-07']
# football-data has one table per season, named football-data_yyYY_yyyy-MM-dd.
# Stripping the trailing date leaves the per-season key; sorting keeps the
# append order reproducible between runs.
date_suffix_len = len("_yyyy-MM-dd")
unique_seasons = sorted({t[:-date_suffix_len] for t in football_data_tables})

trusted_zone_conn = sqlite3.connect(trusted_zone_db)
try:
    # Start from an empty target table. The identifier must be quoted:
    # an unquoted hyphen in the name is a SQL syntax error.
    c = trusted_zone_conn.cursor()
    c.execute('DROP TABLE IF EXISTS "{}"'.format(trusted_zone_table.replace('"', '""')))

    conn = sqlite3.connect(formatted_zone_db)
    try:
        for season in unique_seasons:
            # exact match on the season key, so one season cannot pick up
            # tables of another season that merely shares its prefix
            season_tables = [
                t for t in football_data_tables if t[:-date_suffix_len] == season
            ]
            # pick the newest table for the season
            latest_table_name = get_last_table(season_tables)

            # pd.read_sql_table does not support sqlite3 connections
            df = pd.read_sql_query(
                'SELECT * FROM "{}"'.format(latest_table_name.replace('"', '""')),
                conn,
            )
            # append the season to the single trusted zone table
            df.to_sql(trusted_zone_table, trusted_zone_conn, if_exists='append')
    finally:
        conn.close()
finally:
    trusted_zone_conn.close()

football-data_2223_2023-10-13


In [56]:
trusted_zone_table = 'Metoffice'
metoffice_tables = [
    t for t in formatted_zone_tables if t.startswith(trusted_zone_table)
]
# Group the tables by month: stripping the trailing _yyyy-MM-dd date leaves
# the per-month key (e.g. Metoffice_02_22). Sorting keeps the append order
# reproducible between runs.
date_suffix_len = len("_yyyy-MM-dd")
unique_met_months = sorted({t[:-date_suffix_len] for t in metoffice_tables})

trusted_zone_conn = sqlite3.connect(trusted_zone_db)
try:
    # Start from an empty target table; the identifier is quoted so the
    # statement stays valid for any table name.
    c = trusted_zone_conn.cursor()
    c.execute('DROP TABLE IF EXISTS "{}"'.format(trusted_zone_table.replace('"', '""')))

    conn = sqlite3.connect(formatted_zone_db)
    try:
        for month in unique_met_months:
            # exact match on the month key rather than a prefix test
            month_tables = [
                t for t in metoffice_tables if t[:-date_suffix_len] == month
            ]
            # pick the newest table for the month
            latest_table_name = get_last_table(month_tables)

            # pd.read_sql_table does not support sqlite3 connections; the
            # name is quoted because it contains hyphens
            df = pd.read_sql_query(
                'SELECT * FROM "{}"'.format(latest_table_name.replace('"', '""')),
                conn,
            )
            # append the month to the single trusted zone table
            df.to_sql(trusted_zone_table, trusted_zone_conn, if_exists='append')
    finally:
        conn.close()
finally:
    trusted_zone_conn.close()

Metoffice_02_22_2023-10-13
Metoffice_03_23_2023-10-13
Metoffice_12_22_2023-10-13
Metoffice_05_23_2023-10-13
Metoffice_06_22_2023-10-13
Metoffice_08_22_2023-10-13
Metoffice_08_23_2023-10-13
Metoffice_04_23_2023-10-13
Metoffice_09_22_2023-10-13
Metoffice_01_23_2023-10-13
Metoffice_01_22_2023-10-13
Metoffice_03_22_2023-10-13
Metoffice_04_22_2023-10-13
Metoffice_07_23_2023-10-13
Metoffice_11_22_2023-10-13
Metoffice_05_22_2023-10-13
Metoffice_10_22_2023-10-13
Metoffice_06_23_2023-10-13
Metoffice_07_22_2023-10-13
Metoffice_02_23_2023-10-13
