In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np


In [2]:
# Create a basic cleaning function
def clean_data(file_path, ted_variables, services_values, reason_values):
    """Load one TEDS discharge .csv file and return a cleaned, integer-coded DataFrame.

    Steps, in order:
      1. Read the file and keep only the columns listed in ``ted_variables``.
      2. Drop rows whose REASON code is 4 (transferred), 5 (incarcerated),
         6 (death) or 7 (other).
      3. Add a binary SUCCESSFUL column: 1 where REASON is in
         ``reason_values``, 0 everywhere else.
      4. Keep rows with AGE > 2 (18 and older according to the codebook).
      5. Drop every row that holds -9 (missing/unknown/not collected/invalid)
         or a missing value in any column, then cast all columns to int.
      6. Collapse RACE codes 1, 3, 6 and 9 (each under 1% of rows) into code 10.
      7. Keep rows whose SERVICES code is in ``services_values``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the .csv file to read.
    ted_variables : list of str
        Columns to keep. Must include 'REASON', 'AGE', 'RACE' and 'SERVICES'.
    services_values : list of int
        SERVICES codes to keep.
    reason_values : list of int
        REASON codes that count as a successful outcome. Only codes that
        survive step 2 (i.e. not 4-7) can have any effect.

    Returns
    -------
    pandas.DataFrame
        The columns of ``ted_variables`` followed by SUCCESSFUL, all cast to
        int. The index of the input file is preserved (not reset).

    Raises
    ------
    KeyError
        If ``ted_variables`` lacks a column this function needs, or names a
        column that is not present in the file.
    """
    # Fail fast, before the (potentially large) file is read.
    required_columns = {'REASON', 'AGE', 'RACE', 'SERVICES'}
    missing_columns = required_columns.difference(ted_variables)
    if missing_columns:
        raise KeyError(
            f"ted_variables is missing required columns: {sorted(missing_columns)}"
        )

    # Load the .csv file and keep only the requested columns
    selected = pd.read_csv(file_path)[list(ted_variables)]

    # Remove values 4 (transferred), 5 (incarcerated), 6 (death), 7 (other)
    excluded_reasons = [4, 5, 6, 7]
    outcome_rows = selected[~selected['REASON'].isin(excluded_reasons)]

    # Label the target directly from membership in reason_values.  Copying
    # REASON and replacing values in place would leave REASON == 1 labelled
    # successful even when 1 is not in reason_values.  assign() returns a new
    # frame, so no SettingWithCopyWarning is raised on the filtered slice.
    labelled = outcome_rows.assign(
        SUCCESSFUL=outcome_rows['REASON'].isin(list(reason_values)).astype(int)
    )

    # Filter for ages 18 and older (codes > 2 based on the codebook)
    adults = labelled[labelled['AGE'] > 2]

    # Drop rows with -9 (or an already-missing value) in any column; the
    # int cast undoes the float promotion caused by the temporary NaNs.
    teds_clean = adults.replace({-9: np.nan}).dropna().astype(int)

    # Combine rare race codes into the single code 10; 2, 4, 5, 7, 8 stay as is
    rare_races = [1, 3, 6, 9]
    teds_clean['RACE'] = teds_clean['RACE'].replace(rare_races, 10)

    # Keep only the requested treatment service settings
    teds_clean = teds_clean[teds_clean['SERVICES'].isin(services_values)]

    return teds_clean

In [3]:
# Input datasets: one TEDS discharge .csv file per year
file_path_2015 = Path('Resources/tedsd_2015_puf.csv')
file_path_2016 = Path('Resources/tedsd_2016_puf.csv')
file_path_2017 = Path('Resources/tedsd_puf_2017.csv')
file_path_2018 = Path('Resources/tedsd_puf_2018.csv')
file_path_2019 = Path('Resources/tedsd_puf_2019.csv')
file_paths = [file_path_2015, file_path_2016, file_path_2017, file_path_2018, file_path_2019]

# Output .csv files for the cleaned dataframes, in the same year order as file_paths
output_file_path_2015 = Path('Resources/teds_2015_cleaned.csv')
output_file_path_2016 = Path('Resources/teds_2016_cleaned.csv')
output_file_path_2017 = Path('Resources/teds_2017_cleaned.csv')
output_file_path_2018 = Path('Resources/teds_2018_cleaned.csv')
output_file_path_2019 = Path('Resources/teds_2019_cleaned.csv')
output_file_paths = [output_file_path_2015, output_file_path_2016, output_file_path_2017, output_file_path_2018, output_file_path_2019]

# Features to use for analysis.  clean_data filters on 'REASON', 'AGE', 'RACE' and 'SERVICES',
# so those four must be included; 'DISYR' is used below to report the year of each file.
ted_variables = ['DISYR', 'VET', 'REGION', 'FREQ_ATND_SELF_HELP', 'PSYPROB', 'DSMCRIT', 'ALCDRUG', 'PSOURCE', 'NOPRIOR', 'AGE',
                'RACE', 'GENDER', 'EDUC', 'MARSTAT', 'EMPLOY', 'LIVARAG', 'SERVICES', 'SUB1', 'SUB2','ROUTE1', 'FRSTUSE1', 'ALCFLG', 
                'COKEFLG', 'MARFLG', 'MTHAMFLG', 'LOS', 'OPSYNFLG', 'HERFLG', 'FREQ1', 'REASON']

# Treatment services to analyse.  Values 6,7 are outpatient.  Values 3,4,5 are inpatient.  Values 1,2 are 24 hour detox
services_values = [4,5] 

# REASON values that count as a successful outcome (SUCCESSFUL = 1) in the target column.
# clean_data drops REASON codes 4-7 before labelling, so only codes 1, 2 and 3 can have any effect here.
reason_values = [1]

# Clean each yearly dataset with clean_data and export the result to a .csv file
# for the machine learning model code
for (file_path, output_file_path) in zip(file_paths, output_file_paths):
    # Clean data with clean_data function with specified variables 
    teds_cleaned_df = clean_data(file_path, ted_variables, services_values, reason_values)
    # Report the discharge year after each read.  Read DISYR by name rather than by
    # position, and guard the empty case: indexing row 0 of an empty frame raises
    # IndexError and would abort the remaining exports.
    if teds_cleaned_df.empty:
        print(f'{file_path}: no rows left after cleaning')
    else:
        print(teds_cleaned_df['DISYR'].iat[0])
    # Export cleaned dataframe to .csv file
    teds_cleaned_df.to_csv(output_file_path, encoding='utf-8', index=False)



# # Code for exporting to SQL database commented out
# # Import dependencies for SQL database export
# from sqlalchemy import create_engine
# from config import db_password
# # Create connection to PostgreSQL database
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/TEDS"
# engine = create_engine(db_string)
# # Set SQL output table names
# table_names = ['Teds_2015', 'Teds_2016', 'Teds_2017','Teds_2018','Teds_2019']

# # Loop through the dataset .csv files, call the clean_data function to get each cleaned dataframe, and
# # write the dataframes to the SQL database for the machine learning model code
# for (file_path, table_name) in zip(file_paths, table_names):
#     # Clean data with clean_data function with specified variables 
#     teds_cleaned_df = clean_data(file_path, ted_variables, services_values, reason_values)
#     # Print DISYR column for year after each read 
#     print(teds_cleaned_df.iat[0,0])
#     # Add teds_clean dataframe to a SQL database
#     teds_cleaned_df.to_sql(name=table_name, con=engine, index=False, if_exists='replace')




2015
2016
2017
2018
2019
