In [2]:
# Import dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
import sklearn
import keras




In [3]:
# Create a basic cleaning function
def clean_data(file_path, ted_variables, services_values, reason_values):

    # Load .csv file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Select columns
    teds_reduced_df = df[ted_variables]

    # Remove value 6 'Death' from Reason column 
    teds_reduced_df = teds_reduced_df[teds_reduced_df.REASON != 6]
    teds_reduced_df = teds_reduced_df[teds_reduced_df.REASON != 5]
    teds_reduced_df = teds_reduced_df[teds_reduced_df.REASON != 7]
    # Remove value 4 'Transferred to another treatment program' from Reason column 
    teds_reduced_df = teds_reduced_df[teds_reduced_df.REASON != 4]

    # Add sucessful column equal copied from REASON columnto 1 based on passed list reasons_values and the rest to 0
    teds_reduced_df['SUCCESSFUL'] = teds_reduced_df['REASON']

    # Change values in SUCCESSFUL column to 1 for passed list of reason_values chosen to indicate successful outcome
    for reason in reason_values:
        teds_reduced_df['SUCCESSFUL'] = teds_reduced_df['SUCCESSFUL'].replace({reason: 1}).astype(int) 
    
    # Change all other values in SUCCESSFUL column that aren't 1 to un sucessful 0.
    teds_reduced_df.loc[teds_reduced_df.SUCCESSFUL != 1, 'SUCCESSFUL'] = 0

    # Filter for AGES 18 and older.  Values > 2 based on codebook
    teds_clean = teds_reduced_df[teds_reduced_df.AGE > 2]
    
    # Take out all rows with value -9 (Missing/unknown/not collected/invalid) in any column
    teds_clean = teds_clean.replace({-9: np.nan}).dropna().astype(int)

    # Comnine race values 1,3,6,9 that are less than 1% to a new value of 10. Keep values 2, 4, 5, 7, 8 as is.
    races = [1,3,6,9]
    for race in races:
        teds_clean['RACE'] = teds_clean['RACE'].replace({race: 10}).astype(int) 

    # SERVICES column: select outpatient treatment, values 6 and 7,  Rhab values 2, 4, 5, 
    teds_clean = teds_clean[teds_clean["SERVICES"].isin(services_values)]

    # Return clean data frame
    return teds_clean

In [4]:
# # Function to import dataset
# def load_data(file_path):

#     # Read dataset into dataframe 
#     teds_cleaned_df = pd.read_csv(file_path)

#     return teds_cleaned_df    

In [5]:
# Function to split our preprocessed data into our features and target arrays.  Get training and testing values and scale.
def prepare_data(teds_cleaned_df):
    
    # Split our preprocessed data into our features and target arrays
    y = teds_cleaned_df["SUCCESSFUL"].values
    X = teds_cleaned_df.drop(["REASON","SUCCESSFUL"],axis=1).values

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

    # Create a StandardScaler instance
    scaler = StandardScaler()

    # Fit the StandardScaler
    X_scaler = scaler.fit(X_train)

    # Scale the traingin and test data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Scale the full dataset 
    X_test_full_scaled = X_scaler.transform(X)

    return X_train_scaled, y_train, X_test_scaled, y_test, X_test_full_scaled, y


In [6]:
# Define the model function  - deep neural net
def define_model(number_input_features, hidden_nodes_layer1, hidden_nodes_layer2):
    keras.backend.clear_session()
    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
    )

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))  

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    return nn

# Load and Clean the Data

In [7]:
# Get clean data 

# Create file path list for input datasets
file_path_2015 = Path('Resources/tedsd_2015_puf.csv')
file_path_2016 = Path('Resources/tedsd_2016_puf.csv')
file_path_2017 = Path('Resources/tedsd_puf_2017.csv')
file_path_2018 = Path('Resources/tedsd_puf_2018.csv')
file_path_2019 = Path('Resources/tedsd_puf_2019.csv')
file_paths = [file_path_2015, file_path_2016, file_path_2017, file_path_2018, file_path_2019]

# Create output file path list to export cleaned dataframes to .csv files 
output_file_path_2015 = Path('Resources/teds_2015_cleaned.csv')
output_file_path_2016 = Path('Resources/teds_2016_cleaned.csv')
output_file_path_2017 = Path('Resources/teds_2017_cleaned.csv')
output_file_path_2018 = Path('Resources/teds_2018_cleaned.csv')
output_file_path_2019 = Path('Resources/teds_2019_cleaned.csv')
output_file_paths = [output_file_path_2015, output_file_path_2016, output_file_path_2017, output_file_path_2018, output_file_path_2019]

# Select features to use for analysis.  Must include 'SERVICES and 'REASON' 
ted_variables = ['DISYR', 'VET', 'REGION', 'FREQ_ATND_SELF_HELP', 'PSYPROB', 'DSMCRIT', 'ALCDRUG', 'PSOURCE', 'NOPRIOR', 'AGE',
                'RACE', 'GENDER', 'EDUC', 'MARSTAT', 'EMPLOY', 'LIVARAG', 'SERVICES', 'SUB1', 'SUB2','ROUTE1', 'FRSTUSE1', 'ALCFLG', 
                'COKEFLG', 'MARFLG', 'MTHAMFLG', 'LOS', 'OPSYNFLG', 'HERFLG', 'FREQ1', 'REASON']

# Select treatment services for analysis.  Values 6,7 are outpatient.  Values 3,4,5 are in patient.  Values 1,2 are 24 hour detox
services_values = [4,5] 

# Select values from REASON column to combine to a value of 1 for the target column SUCCESSFUL.  Must be a combination of 1,4,7.
reason_values = [1]

# Clean data with clean_data function with specified variables 
teds_cleaned_df = clean_data(file_path_2019, ted_variables, services_values, reason_values)
teds_cleaned_df.head()

Unnamed: 0,DISYR,VET,REGION,FREQ_ATND_SELF_HELP,PSYPROB,DSMCRIT,ALCDRUG,PSOURCE,NOPRIOR,AGE,...,ALCFLG,COKEFLG,MARFLG,MTHAMFLG,LOS,OPSYNFLG,HERFLG,FREQ1,REASON,SUCCESSFUL
248,2019,2,4,1,1,5,2,3,1,6,...,0,0,1,1,12,0,1,3,2,0
251,2019,2,4,5,1,5,3,1,1,5,...,1,0,0,1,32,0,1,2,1,1
257,2019,2,4,1,1,4,1,1,1,9,...,1,0,0,0,32,0,0,3,2,0
261,2019,2,4,1,1,4,1,1,1,6,...,1,0,0,0,14,0,0,3,2,0
277,2019,2,4,1,2,4,3,1,1,11,...,1,0,1,0,8,0,0,3,2,0


In [8]:
teds_cleaned_df['SUCCESSFUL'].value_counts()

1    74939
0    44388
Name: SUCCESSFUL, dtype: int64

# Loop thru Epochs and Class Weights to optimize the model

In [9]:
# Prepare dataset for model
X_train_scaled, y_train, X_test_scaled, y_test, X_test_full_scaled, y = prepare_data(teds_cleaned_df)

# Define neural network model parameters
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

# Get the model based defined parameters
nn = define_model(number_input_features, hidden_nodes_layer1, hidden_nodes_layer2)

# Check the structure of the model
# nn.summary()

# Loop to evaluate model performance vs epoch number and weight values
# Set the values of the epochs list to see the effect on model perfomance
epoch_list = [5]

# Define empty dataframe to hold model evaluation results for training and testing results
nn_model_results_wt = pd.DataFrame(columns = ['Weight', 'Training loss', 'Training accuracey', 'Testing loss',
                                'Testing accuracy', 'Testing Precision']) 

# Define empty data frame to hold model evaluation results on entire dataset
nn_model_results_wt_fd = pd.DataFrame(columns = ['Weight', 'Testing loss',
                                'Testing accuracy', 'Precision']) 

for ep in epoch_list:
    print(f"\nEpochs {ep}")

    # Define weight values to evaluate model 
    wts = [.3, .4, .5, .6]
    # wts = [.4]

    # Loop through weights list to determine optimal weighting for model fit
    for wt in wts:
        print(f"\n\nWeight {wt}")
        weights = {0:1, 1:wt}
        fit_model = nn.fit(X_train_scaled,y_train, class_weight=weights, epochs=ep, verbose=0)

        # Evaluate the model using the test data
        model_loss_test, model_accuracy_test = nn.evaluate(X_test_scaled,y_test,verbose=0)
        print(f"Testing Loss: {model_loss_test}, Testing Accuracy: {model_accuracy_test}")

        # Evaluate the model using the training data
        model_loss_train, model_accuracy_train = nn.evaluate(X_train_scaled,y_train,verbose=0)
        print(f"Training Loss: {model_loss_train}, Training Accuracy: {model_accuracy_train}")

        # Get confusion matrix
        y_pred = nn.predict(X_test_scaled)
        confusion_matrix = sklearn.metrics.confusion_matrix(y_test, np.rint(y_pred))
        
        # Create a DataFrame from the confusion matrix.
        nn_cm_df = pd.DataFrame(
            confusion_matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
        print(nn_cm_df)

        # Export confusion matrix to .csv file per weight
        nn_cm_wt_filename = 'Resources/ML_model_results/nn_cm_wt_' + str(wt) +'.csv'
        output_path = Path(nn_cm_wt_filename)
        nn_cm_df.to_csv(output_path, encoding='utf-8')
        
        # Calculate precision 
        tp = nn_cm_df.loc['Actual 1', 'Predicted 1']
        fp = nn_cm_df.loc['Actual 0', 'Predicted 1']
        precision = tp / (tp+fp)
        print(f"Precision: {precision}")

        # Store training and testing data to dataframe
        nn_model_results_wt.loc[len(nn_model_results_wt)] = [wt, model_loss_train, model_accuracy_train, 
                                                    model_loss_test, model_accuracy_test, precision]

        # Evaluate the model using the full dataset
        model_loss, model_accuracy = nn.evaluate(X_test_full_scaled,y,verbose=0)
        print(f"\nTesting Loss full dataset: {model_loss}, Testing Accuracy full dataset: {model_accuracy}")
        
        # Get confusion matrix
        y_pred = nn.predict(X_test_full_scaled)
        confusion_matrix = sklearn.metrics.confusion_matrix(y, np.rint(y_pred))

        # Create a DataFrame from the confusion matrix.
        nn_cm_df = pd.DataFrame(
            confusion_matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
        print(nn_cm_df)

        # Export confusion matrix for full dataset run per weight
        nn_cm_full_wt_filename = 'Resources/ML_model_results/nn_cm_full_wt' + str(wt) +'.csv'
        output_path = Path(nn_cm_full_wt_filename)
        nn_cm_df.to_csv(output_path, encoding='utf-8')

        tp = nn_cm_df.loc['Actual 1', 'Predicted 1']
        fp = nn_cm_df.loc['Actual 0', 'Predicted 1']
        precision = tp / (tp+fp)
        print(f"Precision: {precision}")

        nn_model_results_wt_fd.loc[len(nn_model_results_wt_fd)] = [wt, model_loss, model_accuracy, precision]



Epochs 5


Weight 0.3
Testing Loss: 0.5945817828178406, Testing Accuracy: 0.6898967623710632
Training Loss: 0.5925002694129944, Training Accuracy: 0.6906084418296814
          Predicted 0  Predicted 1
Actual 0         9524         1573
Actual 1         7678        11057
Precision: 0.8754552652414885

Testing Loss full dataset: 0.5930198431015015, Testing Accuracy full dataset: 0.6904305219650269
          Predicted 0  Predicted 1
Actual 0        38314         6074
Actual 1        30866        44073
Precision: 0.8788761042534947


Weight 0.4
Testing Loss: 0.5371071696281433, Testing Accuracy: 0.7292169332504272
Training Loss: 0.5323128700256348, Training Accuracy: 0.7339404225349426
          Predicted 0  Predicted 1
Actual 0         9078         2019
Actual 1         6059        12676
Precision: 0.8626063286832256

Testing Loss full dataset: 0.5335120558738708, Testing Accuracy full dataset: 0.7327595353126526
          Predicted 0  Predicted 1
Actual 0        36573         7815
Actua

In [10]:
# Show model results vs weight values
nn_model_results_wt.head()

Unnamed: 0,Weight,Training loss,Training accuracey,Testing loss,Testing accuracy,Testing Precision
0,0.3,0.5925,0.690608,0.594582,0.689897,0.875455
1,0.4,0.532313,0.73394,0.537107,0.729217,0.862606
2,0.5,0.506694,0.747539,0.512702,0.742726,0.852005
3,0.6,0.482808,0.769138,0.48849,0.768336,0.832583


In [11]:
# Show model results tested on full dataset
nn_model_results_wt_fd.head()

Unnamed: 0,Weight,Testing loss,Testing accuracy,Precision
0,0.3,0.59302,0.690431,0.878876
1,0.4,0.533512,0.73276,0.86682
2,0.5,0.508195,0.746336,0.855619
3,0.6,0.484228,0.768937,0.832834
