In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [2]:
# Create a basic cleaning function
def clean_data(file_path, ted_variables, services_values, reason_values):
    """Load a TEDS-D style CSV and return a cleaned, all-integer DataFrame.

    Processing order:
      1. Read the CSV and keep only the columns named in ``ted_variables``.
      2. Drop rows whose REASON is 6 ('Death').
      3. Add a binary SUCCESSFUL column: 1 when REASON is one of
         ``reason_values``, otherwise 0.
      4. Keep rows with AGE > 2 (18 and older according to the codebook).
      5. Drop every row containing -9 (missing/unknown/not collected/invalid)
         in any column, then cast the whole frame to int.
      6. Collapse RACE codes 1, 3, 6 and 9 into a single code 10.
      7. Keep rows whose SERVICES code is in ``services_values``.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts.
    ted_variables : list of str
        Columns to keep. Must contain 'REASON', 'AGE', 'RACE' and 'SERVICES'
        because the cleaning steps read them.
    services_values : list of int
        SERVICES codes to retain.
    reason_values : list of int
        REASON codes that count as a successful outcome.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus SUCCESSFUL, all integer typed, keeping the
        original row index of the surviving rows.

    Raises
    ------
    KeyError
        If ``ted_variables`` lacks a column the cleaning steps rely on, or
        names a column that is absent from the CSV.
    """
    # Fail early with one clear message instead of an error deep in the pipeline.
    required_columns = {"REASON", "AGE", "RACE", "SERVICES"}
    missing_columns = sorted(required_columns.difference(ted_variables))
    if missing_columns:
        raise KeyError(f"ted_variables must include {missing_columns}")

    # Load .csv file into a DataFrame and select the requested columns
    df = pd.read_csv(file_path)
    teds_reduced_df = df[ted_variables]

    # Remove value 6 'Death' from the REASON column. The explicit copy makes
    # the column assignment below act on an independent frame rather than a
    # view of ``df`` (avoids SettingWithCopyWarning).
    teds_reduced_df = teds_reduced_df[teds_reduced_df["REASON"] != 6].copy()

    # Flag success strictly by membership in reason_values. Rewriting the
    # chosen codes to 1 and zeroing everything else would also count an
    # original REASON of 1 as successful even when 1 was not requested.
    teds_reduced_df["SUCCESSFUL"] = (
        teds_reduced_df["REASON"].isin(reason_values).astype(int)
    )

    # Filter for ages 18 and older. Values > 2 based on codebook
    teds_clean = teds_reduced_df[teds_reduced_df["AGE"] > 2]

    # Take out all rows with value -9 (missing/unknown/not collected/invalid)
    # in any column; the NaN round-trip upcasts to float, so cast back to int.
    teds_clean = teds_clean.replace({-9: np.nan}).dropna().astype(int)

    # Combine race values 1, 3, 6, 9 (each under 1%) into a new value of 10.
    # Values 2, 4, 5, 7, 8 are kept as is.
    minority_race_codes = [1, 3, 6, 9]
    teds_clean["RACE"] = teds_clean["RACE"].replace(
        dict.fromkeys(minority_race_codes, 10)
    )

    # Keep only the requested treatment service settings
    teds_clean = teds_clean[teds_clean["SERVICES"].isin(services_values)]

    return teds_clean

In [3]:
# Path to the input dataset (CSV)
file_path = Path('Resources/tedsd_puf_2019.csv')

# Features used for the analysis, one group per line.
# clean_data reads 'REASON', 'AGE', 'RACE' and 'SERVICES', so they must be listed.
ted_variables = [
    'VET', 'REGION', 'FREQ_ATND_SELF_HELP', 'PSYPROB', 'DSMCRIT',
    'ALCDRUG', 'PSOURCE', 'NOPRIOR', 'AGE', 'RACE',
    'GENDER', 'EDUC', 'MARSTAT', 'EMPLOY', 'LIVARAG',
    'SERVICES', 'SUB1', 'SUB2', 'ROUTE1', 'FRSTUSE1',
    'ALCFLG', 'COKEFLG', 'MARFLG', 'MTHAMFLG', 'OPSYNFLG',
    'HERFLG', 'FREQ1', 'REASON',
]

# Treatment service codes to keep in the analysis
services_values = [6, 7]

# REASON codes that are merged into 1 in the SUCCESSFUL target column.
# Valid choices are any combination of 1, 4 and 7.
reason_values = [1]

# Run the cleaning function; its result feeds the machine learning cells below
teds_cleaned_df = clean_data(
    file_path,
    ted_variables,
    services_values,
    reason_values,
)
teds_cleaned_df.head()

Unnamed: 0,VET,REGION,FREQ_ATND_SELF_HELP,PSYPROB,DSMCRIT,ALCDRUG,PSOURCE,NOPRIOR,AGE,RACE,...,FRSTUSE1,ALCFLG,COKEFLG,MARFLG,MTHAMFLG,OPSYNFLG,HERFLG,FREQ1,REASON,SUCCESSFUL
0,2,4,1,1,19,1,1,0,4,10,...,5,1,0,0,0,0,0,2,1,1
1,2,4,1,1,4,3,1,0,5,10,...,3,1,0,1,0,0,0,2,3,0
3,2,4,4,1,4,3,1,1,5,10,...,3,1,0,1,0,0,1,1,1,1
4,2,4,1,1,4,1,1,1,11,8,...,7,1,0,0,0,0,0,1,3,0
5,2,4,1,1,19,3,1,0,8,10,...,2,1,0,0,0,0,0,2,3,0


In [4]:
# Split our preprocessed data into our features and target arrays.
y = teds_cleaned_df["SUCCESSFUL"].values

# REASON is removed together with the target because SUCCESSFUL is derived
# from it. The columns are named with the `columns=` keyword: passing the
# axis positionally (`drop(labels, 1)`) is deprecated and is rejected by
# pandas 2.x.
X = teds_cleaned_df.drop(columns=["SUCCESSFUL", "REASON"]).values

# Split the preprocessed data into a training and testing dataset,
# stratified on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

  after removing the cwd from sys.path.


In [5]:
# Build the scaler and learn its statistics from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [6]:
# Define the model - deep neural net

# Seed Python's `random`, NumPy and TensorFlow in one call so the initial
# weights are the same on every Restart & Run All. The value matches the
# random_state used for the train/test split.
tf.keras.utils.set_random_seed(78)

# One input per feature column; X_train is a 2-D NumPy array.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 25
hidden_nodes_layer2 = 20

nn = tf.keras.models.Sequential()

# Declare the input shape explicitly instead of passing the deprecated
# `input_dim` argument to the first Dense layer.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary SUCCESSFUL target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 25)                700       
                                                                 
 dense_1 (Dense)             (None, 20)                520       
                                                                 
 dense_2 (Dense)             (None, 1)                 21        
                                                                 
Total params: 1,241
Trainable params: 1,241
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Fit on the scaled training split for 50 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [8]:
# Score the trained network on the scaled test split (one summary line of output)
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3876/3876 - 1s - loss: 0.4656 - accuracy: 0.7852 - 1s/epoch - 332us/step
Loss: 0.46561330556869507, Accuracy: 0.7851613759994507
