In [7]:
import pandas as pd
import os
import sys
import glob
from typing import Tuple, Optional
from datetime import datetime
import re


In [8]:
# ----------- Configuration --------------------
# Timestamp taken when this cell runs.
now_date = datetime.now()

# --- Master events ---
# The regex inserts an extra "_" ahead of a suffix that already begins with
# "_", so it only matches names of the form "<8 digits>__master_events.csv"
# (two underscores). The save step in this notebook builds the file name the
# same way, so the two stay in agreement.
MASTER_EVENTS_DIR = "../Data/Master/Events"
MASTER_EVENTS_SUFFIX = "_master_events.csv"
MASTER_EVENTS_REGEX = r"^\d{8}" + re.escape("_") + re.escape(MASTER_EVENTS_SUFFIX) + "$"

# --- Intermediate events ---
# Matches "<8 digits>_intermediate_events.csv".
INTERMEDIATE_EVENTS_DIR = "../Data/Processed/Events/Intermediate/"
INTERMEDIATE_EVENTS_SUFFIX = "_intermediate_events.csv"
INTERMEDIATE_EVENTS_REGEX = r"^\d{8}" + re.escape(INTERMEDIATE_EVENTS_SUFFIX) + "$"

# --- Event sponsors (manually maintained table) ---
# Matches "<8 digits>_event_sponsors.csv".
EVENTS_SPONSORS_DIR = "../Data/Processed/Sponsors"
EVENTS_SPONSORS_SUFFIX = "_event_sponsors.csv"
EVENTS_SPONSORS_REGEX = r"^\d{8}" + re.escape(EVENTS_SPONSORS_SUFFIX) + "$"

# Columns of the empty placeholder DataFrame that the loaders hand back when
# no usable file is found; callers currently only test it with `.empty`.
MINIMAL_EVENT_COLUMNS = ["eventId"]





In [9]:
def get_latest_master_events(master_dir:str, master_regex) -> Tuple[pd.DataFrame,Optional[str]]:
    """
    Load the newest MASTER events CSV found in a directory.

    Every ``*.csv`` in ``master_dir`` whose base name matches ``master_regex``
    is a candidate. The candidate whose path sorts last lexicographically is
    read with ``pd.read_csv``. Progress and failures are reported via ``print``.

    Args:
        master_dir (str): Directory to search for master files.
        master_regex: Pattern applied with ``re.match`` to each file's base name.

    Returns:
        Tuple[pd.DataFrame, Optional[str]]: ``(data, path)`` of the file that
        was read. If the directory is missing, nothing matches, or reading
        fails, returns an empty DataFrame with ``MINIMAL_EVENT_COLUMNS`` as
        columns together with ``None``.
    """
    def _placeholder() -> Tuple[pd.DataFrame, Optional[str]]:
        # Fallback result shared by every failure path.
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    if not os.path.isdir(master_dir):
        print (f"‚ùå{master_dir} does not exist as a directory")
        return _placeholder()

    # Collect every CSV first, then narrow down to the ones named like a master file.
    csv_paths = glob.glob(f"{master_dir}/*.csv")
    if not csv_paths:
        print(f"‚ùå No existing *.csv files found in MASTER Events Directory: {master_dir} ")
        return _placeholder()

    candidates = [
        csv_path
        for csv_path in csv_paths
        if re.match(master_regex, os.path.basename(csv_path))
    ]
    if not candidates:
        print(f"‚ùå No existing MASTER files in format: {master_regex} in {master_dir}")
        return _placeholder()

    # All candidates share the same directory, so the greatest path string is
    # the one with the greatest file name.
    newest_path = max(candidates)

    try:
        events_df = pd.read_csv(newest_path)
        print(f"‚úÖ {len(events_df)} events found in latest MASTER: {newest_path} ")
    except Exception as err:
        print (f"‚ùå Error reading lastest MASTER, {newest_path}: {err}")
        return _placeholder()

    return events_df, newest_path

In [10]:
def get_latest_intermediate_events(intermediate_dir:str, intermediate_regex) -> Tuple[pd.DataFrame, Optional[str]]:
    """
    Load the newest INTERMEDIATE events CSV found in a directory.

    Every ``*.csv`` in ``intermediate_dir`` whose base name matches
    ``intermediate_regex`` is a candidate. The candidate whose path sorts last
    lexicographically is read with ``pd.read_csv``. Progress and failures are
    reported via ``print``.

    Args:
        intermediate_dir (str): Directory to search for intermediate files.
        intermediate_regex: Pattern applied with ``re.match`` to each file's
            base name.

    Returns:
        Tuple[pd.DataFrame, Optional[str]]: ``(data, path)`` of the file that
        was read. If the directory is missing, nothing matches, or reading
        fails, returns an empty DataFrame with ``MINIMAL_EVENT_COLUMNS`` as
        columns together with ``None`` (never a bare ``None``).
    """
    if not os.path.isdir(intermediate_dir):
        print (f"‚ùå{intermediate_dir} does not exist as a directory")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # Gather every CSV in the directory before filtering by name.
    files = glob.glob(f"{intermediate_dir}/*.csv")
    if not files:
        print(f"‚ùå No existing *.csv files found in INTERMEDIATE Events Directory: {intermediate_dir} ")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # Keep only files whose base name looks like "<date><suffix>".
    intermediate_files = [
        file for file in files
        if re.match(intermediate_regex, os.path.basename(file))
    ]
    if not intermediate_files:
        print(f"‚ùå No existing INTERMEDIATE files in format: {intermediate_regex} in {intermediate_dir}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # All candidates share one directory, so the greatest path string is the
    # one with the greatest file name.
    latest_intermediate = max(intermediate_files)

    try:
        latest_intermediate_df = pd.read_csv(latest_intermediate)
        print(f"‚úÖ {len(latest_intermediate_df)} events found in latest INTERMEDIATE: {latest_intermediate} ")
        return latest_intermediate_df, latest_intermediate

    except Exception as e:
        print (f"‚ùå Error reading lastest INTERMEDIATE, {latest_intermediate}: {e}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

In [11]:
def get_latest_event_sponsor(event_sponsor_dir:str, event_sponsor_regex) -> Tuple[pd.DataFrame, Optional[str]]:
    """
    Load the newest EVENTS_SPONSORS CSV found in a directory.

    Every ``*.csv`` in ``event_sponsor_dir`` whose base name matches
    ``event_sponsor_regex`` is a candidate. The candidate whose path sorts last
    lexicographically is read with ``pd.read_csv``. Progress and failures are
    reported via ``print``.

    Args:
        event_sponsor_dir (str): Directory to search for sponsor files.
        event_sponsor_regex: Pattern applied with ``re.match`` to each file's
            base name.

    Returns:
        Tuple[pd.DataFrame, Optional[str]]: ``(data, path)`` of the file that
        was read. If the directory is missing, nothing matches, or reading
        fails, returns an empty DataFrame with ``MINIMAL_EVENT_COLUMNS`` as
        columns together with ``None``. Every path returns a 2-tuple so callers
        can always unpack the result.
    """
    if not os.path.isdir(event_sponsor_dir):
        print (f"‚ùå{event_sponsor_dir} does not exist as a directory")
        # Must be a 2-tuple: callers unpack (df, path).
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # Gather every CSV in the directory before filtering by name.
    files = glob.glob(f"{event_sponsor_dir}/*.csv")
    if not files:
        print(f"‚ùå No existing *.csv files found in EVENTS_SPONSORS  Directory: {event_sponsor_dir} ")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # Keep only files whose base name looks like "<date><suffix>".
    event_sponsor_files = [
        file for file in files
        if re.match(event_sponsor_regex, os.path.basename(file))
    ]
    if not event_sponsor_files:
        print(f"‚ùå No existing EVENTS_SPONSORS files in format: {event_sponsor_regex} in {event_sponsor_dir}")
        # Must be a 2-tuple: callers unpack (df, path).
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

    # All candidates share one directory, so the greatest path string is the
    # one with the greatest file name.
    latest_event_sponsor = max(event_sponsor_files)

    try:
        latest_event_sponsor_df = pd.read_csv(latest_event_sponsor)
        print(f"‚úÖ {len(latest_event_sponsor_df)} events found in latest EVENTS_SPONSORS: {latest_event_sponsor} ")
        return latest_event_sponsor_df, latest_event_sponsor

    except Exception as e:
        # Report the file that was actually being read, not whichever file the
        # directory scan happened to visit last.
        print (f"‚ùå Error reading lastest EVENTS_SPONSORS, {latest_event_sponsor}: {e}")
        return pd.DataFrame(columns=MINIMAL_EVENT_COLUMNS), None

In [12]:
if __name__ == "__main__":
    # Build a fresh MASTER events file by left-joining the latest INTERMEDIATE
    # events with the latest sponsors table, unless the existing MASTER is
    # already strictly newer than both inputs.

    # --- Latest MASTER (optional: it may not exist yet on a first run) ---
    latest_master_df, latest_master_file = get_latest_master_events(MASTER_EVENTS_DIR, MASTER_EVENTS_REGEX)
    latest_master_date = None
    if (not latest_master_df.empty) and bool(latest_master_file):
        # File names start with the date stamp, followed by "_".
        latest_master_date_str = os.path.basename(latest_master_file).split("_")[0]
        latest_master_date = pd.to_datetime(latest_master_date_str)

    # --- Latest INTERMEDIATE (required) ---
    latest_intermediate_df, latest_intermediate_file = get_latest_intermediate_events(INTERMEDIATE_EVENTS_DIR, INTERMEDIATE_EVENTS_REGEX)
    if (not latest_intermediate_df.empty) and bool(latest_intermediate_file):
        latest_intermediate_date_str = os.path.basename(latest_intermediate_file).split("_")[0]
        latest_intermediate_date = pd.to_datetime(latest_intermediate_date_str)
    else:
        # Print before exiting; sys.exit raises, so nothing after it would run.
        print("‚ùå No intermediate events file found")
        sys.exit(1)

    # --- Latest sponsors table (required) ---
    latest_sponsors_df, latest_sponsors_file = get_latest_event_sponsor(EVENTS_SPONSORS_DIR,EVENTS_SPONSORS_REGEX)
    if (not latest_sponsors_df.empty) and bool(latest_sponsors_file):
        latest_sponsors_date_str = os.path.basename(latest_sponsors_file).split("_")[0]
        latest_sponsors_date = pd.to_datetime(latest_sponsors_date_str)
    else:
        print("‚ùå No Sponsors Data file found ")
        sys.exit(1)

    # The master is current only when it exists AND is strictly newer than both
    # inputs. A missing or empty master always triggers a rebuild, so the very
    # first master can be created.
    master_is_current = (
        latest_master_date is not None
        and latest_master_date > latest_intermediate_date
        and latest_master_date > latest_sponsors_date
    )

    if master_is_current:
        print("Latest Master is already updated")
    else:
        print("--- üü¢ Enriching Intermdiate file with sponsors üü¢---")
        # Left join keeps every intermediate event, with or without a sponsor row.
        enriched_df = pd.merge(how="left",
            left=latest_intermediate_df,
            right=latest_sponsors_df,
            on="eventId")

        try:
            now_date = datetime.now()
            date_string = now_date.strftime("%Y%m%d")
            # MASTER_EVENTS_SUFFIX already starts with "_", so this yields
            # "<date>__master_events.csv"; MASTER_EVENTS_REGEX expects exactly
            # that double underscore, so the naming is kept as is.
            new_master_name = f"{date_string}_{MASTER_EVENTS_SUFFIX}"
            new_master_path = os.path.join(MASTER_EVENTS_DIR,new_master_name)

            # Make sure the target directory exists for a first-time write.
            os.makedirs(MASTER_EVENTS_DIR, exist_ok=True)

            # Single write, without the index column.
            enriched_df.to_csv(new_master_path, index=False)
            print(f"--- üü¢ Successfully saved {len(enriched_df)} total events to {new_master_path} üü¢---")
        except Exception as e:
            print(f"--- ‚ùå FAILED to new Mater Eventss File  {e} ---")


    

    
    

‚úÖ 184 events found in latest MASTER: ../Data/Master/Events/20251101__master_events.csv 
‚úÖ 184 events found in latest INTERMEDIATE: ../Data/Processed/Events/Intermediate/20251101_intermediate_events.csv 
‚úÖ 184 events found in latest EVENTS_SPONSORS: ../Data/Processed/Sponsors/20251030_event_sponsors.csv 
--- üü¢ Enriching Intermdiate file with sponsors üü¢---
--- üü¢ Successfully saved 184 total events to ../Data/Master/Events/20251101__master_events.csv üü¢---
