In [1]:
import pandas as pd
from sqlalchemy import create_engine
import json

In [2]:
def read_excel_to_dataframe(file_path, worksheet_name):
    """
    Reads the specified worksheet from an Excel file and returns a Pandas data frame.

    Arguments:
        file_path (str): The path to the Excel file.
        worksheet_name (str): The name of the worksheet to read.

    Returns:
        pandas.DataFrame: The data frame containing the data from the specified worksheet.
    """
    # Open the workbook in a context manager so its file handle is closed
    # as soon as the sheet has been read, including when the read raises.
    with pd.ExcelFile(file_path) as xls:
        # Pass the sheet by keyword so it cannot be confused with another argument.
        return pd.read_excel(xls, sheet_name=worksheet_name)


In [3]:
# Convert string columns to lowercase
# to make future where clauses easier
def convert_all_strings_to_lc(df):
    """
    Lower-cases every string value held in the object-dtype columns of ``df``.

    Only cells that are actual ``str`` instances are changed. Any other value
    in an object column (numbers, dates, None/NaN, ...) is kept as it is,
    instead of being replaced by NaN or raising as the ``.str`` accessor does.

    Arguments:
        df (pandas.DataFrame): The data frame to normalise. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, after the in-place update.
    """
    string_cols = df.select_dtypes(include=['object']).columns
    for col in string_cols:
        # astype(object) undoes any dtype inference done by map(), so the
        # column keeps the object dtype it was selected with.
        df[col] = df[col].map(
            lambda value: value.lower() if isinstance(value, str) else value
        ).astype(object)
    return df


In [67]:
# convert string field of yyyy-mm-dd to datetime field.
def convert_date_column_to_datetime(table_name, column_name, db_name="timelycare"):
    """
    Rebuilds a SQLite table so that one of its columns is declared as a date.

    The existing table is renamed to ``<table_name>_old``. A new table with the
    original name is then created: its column names are lower-cased, the column
    called ``column_name`` is declared as ``Date`` and every other column is
    declared as ``String`` regardless of its original type. The rows are copied
    with a single INSERT ... SELECT and the renamed table is dropped.

    The values themselves are copied as stored; nothing is parsed or
    reformatted here. NOTE(review): this presumably relies on the column
    already holding yyyy-mm-dd text -- confirm against the data.

    Arguments:
        table_name (str): The name of the table to rebuild.
        column_name (str): The name of the column to declare as a date
            (matched exactly against the existing column names).
        db_name (str): The SQLite database file name, without the ``.db`` suffix.

    Raises:
        ValueError: If ``column_name`` is not a column of ``table_name``.
    """
    from sqlalchemy import create_engine, MetaData, Table, Column, Date, String, text

    engine = create_engine(f'sqlite:///{db_name}.db', echo=False)
    # Table names cannot be bound as parameters, so quote them with the dialect's rules.
    quote = engine.dialect.identifier_preparer.quote
    old_name = f"{table_name}_old"

    try:
        # engine.begin() commits on success and releases the connection on
        # every exit path, including when one of the statements raises.
        with engine.begin() as conn:
            # Check the column exists before anything is renamed, so a typo
            # does not leave the database with only the "_old" table.
            existing = Table(table_name, MetaData(), autoload_with=conn)
            if column_name not in {c.name for c in existing.columns}:
                raise ValueError(
                    f"Column {column_name!r} not found in table {table_name!r}"
                )

            # Rename the original table
            conn.execute(
                text(f"ALTER TABLE {quote(table_name)} RENAME TO {quote(old_name)}")
            )

            # Create a new table with the same columns, but with the date column as a date type
            metadata = MetaData()
            old_table = Table(old_name, metadata, autoload_with=conn)
            new_table = Table(
                table_name,
                metadata,
                *[
                    Column(c.name.lower(), Date() if c.name == column_name else String())
                    for c in old_table.columns
                ],
            )
            new_table.create(conn)

            # Copy the data from the old table to the new table. The new table
            # has no keys or unique constraints, so no conflict handling is needed.
            conn.execute(
                new_table.insert().from_select(
                    [c.name for c in new_table.columns], old_table.select()
                )
            )

            # Drop the old table
            old_table.drop(conn)
    finally:
        # Release the pooled connections (and the database file) held by this engine.
        engine.dispose()
    

In [68]:
def parse_upload_json_format(i, df, table_name, engine, json_columns):
    """
    Parses the JSON column configured for data frame number ``i`` and writes it to its own table.

    Only ``json_columns[i]`` is considered. If that column is present in ``df``,
    each non-null cell is decoded with ``json.loads``, lists are exploded into
    one record per element, the records are flattened with
    ``pandas.json_normalize`` and the result replaces the table named after the
    column. ``df`` itself is not modified.

    Nothing is written when ``json_columns`` is None, has no entry for ``i``,
    the entry is None, the column is missing from ``df``, or no records remain
    after dropping nulls.

    Arguments:
        i (int): Position of ``df`` in the caller's list; used to index ``json_columns``.
        df (pandas.DataFrame): The data frame that may contain the JSON column.
        table_name (str): Name of the table ``df`` is written to (currently unused here).
        engine: Connectable passed to ``DataFrame.to_sql`` as ``con``.
        json_columns (list of str or None): One JSON column name (or None) per data frame.
    """
    # No configuration, or no entry for this data frame.
    if json_columns is None or i >= len(json_columns):
        return

    json_col = json_columns[i]
    if json_col is None or json_col not in df.columns:
        return

    # Parse the JSON data; empty cells are skipped because json.loads cannot decode them
    json_data = df[json_col].dropna().apply(json.loads)

    # One row per list element; an empty JSON list explodes to NaN, so drop those too
    records = json_data.explode().dropna()
    if records.empty:
        return

    json_df = pd.json_normalize(records.tolist())

    # Write the JSON data frame to a new table named after the column
    json_df.to_sql(json_col, con=engine, if_exists='replace', index=False)


In [69]:
def write_dataframes_to_database(data_frames, table_names, json_columns=None, db_name="timelycare"):
    """
    Writes a list of Pandas data frames to local database tables using SQLAlchemy. If a column name is provided as
    json_column, the contents of that column will be parsed as a JSON file and uploaded to a new table with
    the same name as the column.

    String values are lower-cased before anything is written; this includes the text of JSON
    columns, so it happens before that text is parsed. The JSON column is not removed from the
    data frame: it is also written, as text, to the main table.

    Args:
        data_frames (list of pandas.DataFrame): The data frames to write to the database.
        table_names (list of str): The names of the tables to create or overwrite in the database.
        json_columns (list of str): The names of the columns to parse as JSON files and upload to new tables
            (use None for a data frame without one).
        db_name (str): The SQLite database file name, without the ``.db`` suffix.

    Raises:
        ValueError: If ``data_frames`` and ``table_names`` have different lengths.
    """
    # zip() would silently drop the unmatched tail, so fail loudly instead.
    if len(data_frames) != len(table_names):
        raise ValueError(
            f"Got {len(data_frames)} data frames but {len(table_names)} table names"
        )

    # Create a SQLAlchemy engine for the local database
    engine = create_engine(f'sqlite:///{db_name}.db')

    try:
        # Iterate over each data frame and table name
        for i, (df, table_name) in enumerate(zip(data_frames, table_names)):

            # Lower-case a copy so the caller's data frames are left untouched.
            df = convert_all_strings_to_lc(df.copy())

            # If this data frame has a JSON column, upload its parsed contents to a separate table.
            parse_upload_json_format(i, df, table_name, engine, json_columns)

            # Write the data frame to the specified database table
            df.to_sql(table_name, con=engine, if_exists='replace', index=False)
    finally:
        # Release the pooled connections held by this engine.
        engine.dispose()


# Source workbook, plus one entry per worksheet: the sheet to read, the table
# to write it to, and the JSON column (if any) to split into its own table.
file_path = "../data/prompt.xlsx"
worksheet_names = [
    "visit_table",
    "member_table",
    "provider_table",
]
table_names = [
    "visit",
    "member",
    "provider",
]
json_columns = [
    None,
    None,
    "License",
]
tables_to_create = []

# Load every worksheet, in the order listed above.
data_frames = list(
    map(lambda sheet: read_excel_to_dataframe(file_path, sheet), worksheet_names)
)

# Persist the frames; configured JSON columns are also written to their own tables.
write_dataframes_to_database(data_frames, table_names, json_columns)



The next cell queries the database with the ipython-sql extension; see https://jupyter-tutorial.readthedocs.io/en/stable/data-processing/postgresql/ipython-sql.html

In [71]:
%load_ext sql

# Connect to the local database
%sql sqlite:///timelycare.db

%%sql
SELECT * FROM visit_table


SyntaxError: invalid syntax (988781342.py, line 7)