In [11]:
import polars as pl
import duckdb as db
import os

# Transforming Data

In [12]:
def clean_referralfile(file_path: str) -> pl.DataFrame:
    """Read a raw referral CSV and return it as a cleaned Polars DataFrame.

    The steps run in this order:

    1. Read the CSV at ``file_path`` and rename the shortened column headers.
    2. Discard the first data row.
    3. Add ``Update_DT``: each row receives the ``Last Update`` value of the
       row directly below it.
    4. Add ``file_source`` holding ``file_path`` on every row.
    5. Split ``Diagnosis`` on commas and spread the pieces into separate
       columns; the first piece keeps the name ``Diagnosis``.
    6. Keep only the rows whose ``Center`` is not null.

    Args:
        file_path: Full path to the CSV file to load.

    Returns:
        The cleaned Polars DataFrame.
    """
    header_renames = {
        'Referring': 'Referring Provider',
        'Referring_duplicated_0': 'Referring Provider NPI',
        'Referral': 'Referral Date',
        'pat': 'pat Status',
        'Referred to': 'Referred to Specialist',
        'Specialist': 'Specialist NPI',
        'Visit': 'Visit Status',
        'Health': 'Health Plan',
    }

    # The path is passed positionally so the call does not depend on the name
    # of read_csv's first parameter, which has changed between Polars releases.
    raw = pl.read_csv(file_path)
    renamed = raw.rename(header_renames)

    # Drop the first row (presumably a continuation of the header - confirm).
    body = renamed[1:, :]

    # New column 'Update_DT': the 'Last Update' value found on the next row.
    with_update = body.with_columns(Update_DT=pl.col('Last Update').shift(-1))

    # Record which file each row came from.
    with_source = with_update.with_columns(file_source=pl.lit(file_path))

    # Comma-separated string -> list -> struct, one field per list position.
    # NOTE(review): to_struct() runs with its defaults, so the number of
    # fields (and of surviving `field_N` columns) follows the library's
    # default strategy - confirm later diagnoses are not silently dropped.
    with_struct = with_source.with_columns(
        pl.col('Diagnosis').str.split(',').arr.to_struct()
    )

    # Spread the struct into columns; the first field takes back the original name.
    unnested = with_struct.unnest('Diagnosis').rename({'field_0': 'Diagnosis'})

    # Row-wise filter: drop rows with no 'Center' (presumably the date-only
    # rows whose value was copied upward by the shift above - confirm).
    return unnested.filter(pl.col('Center').is_not_null())

In [13]:
def write_parquet(spark_df: pl.DataFrame, file_path: str):
    """Save a Polars DataFrame to disk in Parquet format.

    Despite the parameter name, ``spark_df`` is annotated as a Polars
    DataFrame and is written through its own ``write_parquet`` method.

    Args:
        spark_df: The frame to persist.
        file_path: Destination path, forwarded as the ``file`` argument.
    """
    destination = file_path
    spark_df.write_parquet(file=destination)

In [14]:
def clean_write_file(inc_file_path: str, out_file_path):
    """Run one referral file through the clean-then-save pipeline.

    Args:
        inc_file_path: Path of the incoming raw file, handed to
            ``clean_referralfile``.
        out_file_path: Path the cleaned result is written to via
            ``write_parquet``.
    """
    # Clean first, then pass the resulting frame straight to the writer.
    write_parquet(clean_referralfile(inc_file_path), out_file_path)

In [19]:
def clean_file_names(in_path: str = 'data/raw_referrals/'):
    """Normalise the name of every entry in ``in_path`` by renaming it on disk.

    Each name (extension included) is upper-cased, ``' - '`` is collapsed to
    ``'-'`` and ``'TEXAS'`` is shortened to ``'TX'``. Entries whose name is
    already in that form are left untouched.

    Args:
        in_path: Directory whose entries are renamed. A trailing path
            separator is optional.

    Raises:
        OSError: If the directory cannot be listed or a rename fails.
    """
    for file_name in os.listdir(in_path):
        f_name = file_name.upper().replace(' - ', '-').replace('TEXAS', 'TX')
        if f_name == file_name:
            # Already normalised; nothing to rename.
            continue
        # os.path.join keeps this correct whether or not in_path ends with a
        # separator (plain concatenation silently built a wrong path).
        os.rename(os.path.join(in_path, file_name), os.path.join(in_path, f_name))

In [20]:
def iter_clean_write_files(in_path: str = 'data/raw_referrals/', out_path: str = 'data/clean_referrals/'):
    """Clean every CSV file in ``in_path`` and write each one to ``out_path`` as Parquet.

    Every regular file whose extension is ``.csv`` (case-insensitive) is
    passed to ``clean_write_file``; the output keeps the input's base name
    with a ``.parquet`` extension. Sub-directories and files with any other
    extension are skipped. Files are processed in sorted name order.

    Args:
        in_path: Directory holding the raw CSV files. A trailing path
            separator is optional.
        out_path: Directory the Parquet files are written to. A trailing path
            separator is optional.
    """
    for file_name in sorted(os.listdir(in_path)):
        source = os.path.join(in_path, file_name)
        # splitext copes with extensions of any length, unlike slicing off
        # the last four characters.
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != '.csv' or not os.path.isfile(source):
            continue
        destination = os.path.join(out_path, stem + '.parquet')
        clean_write_file(source, destination)

# Output Files

## Parquet Files

In [50]:
# (raw CSV, cleaned Parquet destination) pairs, processed in the order listed.
referral_exports = [
    (
        'data/raw_referrals/FL referrals 3.1-3.29.csv',
        'data/clean_referrals/fl_2023_03_01-2023_03_29.parquet',
    ),
    (
        'data/raw_referrals/TX Referrals 3.1-3.29.csv',
        'data/clean_referrals/tx_2023_03_01-2023_03_29.parquet',
    ),
]

for raw_csv_path, parquet_path in referral_exports:
    clean_write_file(raw_csv_path, parquet_path)

# Add Data to DuckDB

In [53]:
# Location of the DuckDB database file this notebook connects to.
referral_db_path = "data/referral.db"
connection = db.connect(referral_db_path)