In [None]:
import sqlite3
import pandas as pd
import os
from src.data_preprocessing import get_data_from_db_or_csv  # Assuming the function is in this file

In [None]:
# Connect to an on-disk SQLite database file instead of an in-memory database.
# The parent directory `/data/raw/` must already exist; sqlite3 creates only the file itself.
# NOTE(review): this path is absolute, while the CSV directory used later in the notebook
# is relative ('data/csv/') — confirm the leading slash is intended.
conn = sqlite3.connect('/data/raw/my_database.db')  # Creates or opens `my_database.db` in the `/data/raw/` directory
cursor = conn.cursor()

In [None]:
# Directory holding the per-person CSV files, named person.1.csv, person.2.csv, ...
base_csv_dir = 'data/csv/'

# Collects one DataFrame per person, in id order
all_dfs = []

# Walk consecutive person ids starting at 1 and stop at the first id whose CSV
# file is missing. Note that the CSV's existence drives the loop even when the
# helper ends up serving the data from the database.
idx = 1
while True:
    file_name = os.path.join(base_csv_dir, f"person.{idx}.csv")
    if not os.path.exists(file_name):
        break  # First gap in the numbering ends the scan

    # Project helper (defined in src.data_preprocessing, not shown here); it is
    # expected to try the database first and fall back to the CSV file.
    df = get_data_from_db_or_csv(person_id=idx, conn=conn, csv_filename=file_name)
    all_dfs.append(df)

    idx += 1

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; fail early with a message that says which file was expected and where.
if not all_dfs:
    raise FileNotFoundError(
        f"No person CSV files found: expected "
        f"{os.path.join(base_csv_dir, 'person.1.csv')!r} "
        f"relative to working directory {os.getcwd()!r}"
    )

# Concatenate all loaded DataFrames into one, renumbering the index from 0
all_persons_df = pd.concat(all_dfs, ignore_index=True)

In [None]:


# Write the concatenated DataFrame to the 'all_persons' table in the database.
# The database is a file on disk, so the table persists between notebook runs.
# With to_sql's default if_exists='fail', re-running this cell (or the whole
# notebook) would raise ValueError because the table already exists; replacing
# the table keeps the cell re-runnable and the table in sync with the DataFrame.
all_persons_df.to_sql('all_persons', conn, index=False, if_exists='replace')

# Note: We are not closing the connection here as we want to use it later in the notebook.


In [None]:
# Close the database connection once no further cells need to read from or
# write to the database. Any later use of `conn` requires reconnecting first.
conn.close()