In [1]:
# !pip install imblearn




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: C:\Users\ericm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from numpy import std
import joblib
from joblib import load

In [2]:
df = pd.read_csv("cleaned_data.csv")

# Take random sample of dataframe to reduce size
df = df.sample(n=2000000, random_state=42)

In [19]:
print(df.columns)
print(df.shape)

Index(['AGE', 'GENDER', 'RACE', 'MARITAL_STATUS', 'EDUCATION',
       'EMPLOYMENT_AT_ADMISSION', 'LIVING_ARRANGEMENT_AT_ADMISSION',
       'ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION', 'SERVICES_AT_ADMISSION',
       'REASON_FOR_DISCHARGE', 'PRIMARY_SOURCE_OF_REFERRAL',
       'PRIOR_TREATMENT_EPISODES', 'PRIMARY_SUBSTANCE_ABUSE',
       'FREQUENCY_OF_USE', 'AGE_AT_FIRST_USE', 'ALCOHOL_OR_DRUG_ABUSE',
       'DSM_DIAGNOSIS', 'PSYCHIATRIC_PROBLEM', 'HEALTH_INSURANCE',
       'PRIMARY_PAYMENT_METHOD', 'FREQUENCY_OF_SELF_HELP_ATTENDANCE', 'STATE'],
      dtype='object')
(2000000, 22)


## Feature Engineering

In [20]:
print(df['REASON_FOR_DISCHARGE'].value_counts(), '\n')
print(df['SERVICES_AT_ADMISSION'].value_counts(), '\n')
print(df['PRIOR_TREATMENT_EPISODES'].value_counts())

Treatment completed                                     836794
Dropped out of treatment                                499802
Transferred to another treatment program or facility    429350
Terminated by facility                                  113991
Other                                                    84383
Incarcerated                                             30904
Death                                                     4776
Name: REASON_FOR_DISCHARGE, dtype: int64 

Ambulatory, non-intensive outpatient                1004351
Detox, 24-hour, free-standing residential            298593
Ambulatory, intensive outpatient                     269067
Rehab/residential, short term (30 days or fewer)     206980
Rehab/residential, long term (more than 30 days)     151165
Detox, 24-hour, hospital inpatient                    49220
Ambulatory, detoxification                            15568
Rehab/residential, hospital (non-detox)                5056
Name: SERVICES_AT_ADMISSION, dtype: 

In [3]:
# Removing cases where the patient was transferred to another facility, we cannnot evaluate the effectiveness of the treatment if the patient was transferred to another facility
df = df[df['REASON_FOR_DISCHARGE'] != 'Transferred to another treatment program or facility']

## Create target variable

In [4]:
# Create the target variable. If the patient completed treatment and had no prior treatment episodes, they are considered a success. Otherwise, they are considered a failure.
df['SUCCESSFUL_TREATMENT'] = df.apply(lambda row: 1 if row['REASON_FOR_DISCHARGE'] == 'Treatment completed' and row['PRIOR_TREATMENT_EPISODES'] == "No prior treatment episode" else 0, axis=1)

print(df['SUCCESSFUL_TREATMENT'].value_counts())

0    1293487
1     277163
Name: SUCCESSFUL_TREATMENT, dtype: int64


In [5]:
# Get the initial splits
target = df['SUCCESSFUL_TREATMENT']
features = df.drop(['REASON_FOR_DISCHARGE', 'PRIOR_TREATMENT_EPISODES', 'SUCCESSFUL_TREATMENT'], axis=1)
features_one_hot = pd.get_dummies(features)

X_train, X_test, y_train, y_test = train_test_split(features_one_hot, target, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(1256520, 193) (314130, 193) (1256520,) (314130,)


## Balance the data 

In [24]:
# Outcomes are unbalanced, so balancing the data in this step via oversampling
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(y_train_res.value_counts())

0    1034928
1    1034928
Name: SUCCESSFUL_TREATMENT, dtype: int64


In [11]:
print(X_train_res.shape, y_train_res.shape)

(207054, 193) (207054,)


## Model baseline performance

In [34]:
# Define and train baseline models for a quick comparison to inform model selection and fine-tuning
models_to_train = {
    "naive_bayes": GaussianNB(),
    "logistic_regression": LogisticRegression(max_iter=1000),
    "random_forest": RandomForestClassifier()
}

for model_name, model in models_to_train.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    print(f"Accuracy score for {model_name}: {accuracy_score(y_test, y_pred)}")
    print(f"f1 score for {model_name}: {f1_score(y_test, y_pred, average='macro')}")
    print(f"Precision score for {model_name}: {precision_score(y_test, y_pred, average='macro')}")
    print(f"Recall score for {model_name}: {recall_score(y_test, y_pred, average='macro')}")
    print(f"Standard Deviation of predictions for {model_name}: {std(y_pred)}", '\n')

Accuracy score for naive_bayes: 0.5293381720943559
f1 score for naive_bayes: 0.505049770859473
Precision score for naive_bayes: 0.5990615461066588
Recall score for naive_bayes: 0.6630622027018243
Standard Deviation of predictions for naive_bayes: 0.48957431251156075 

Accuracy score for logistic_regression: 0.7063858911915449
f1 score for logistic_regression: 0.6357156619705722
Precision score for logistic_regression: 0.6381629152344367
Recall score for logistic_regression: 0.7241470781216843
Standard Deviation of predictions for logistic_regression: 0.48603250854283236 

Accuracy score for random_forest: 0.8226116575939898
f1 score for random_forest: 0.678373621008752
Precision score for random_forest: 0.6898927366395385
Recall score for random_forest: 0.6693832991067857
Standard Deviation of predictions for random_forest: 0.36039241201465727 



## Tuning

In [36]:
# Fine-tune the best performing model (Random Forest) using a grid search and 5 fold cross validation

# Define the parameter grid
param_grid = {
    'n_estimators': [100], #100, 200, 400, 
    'max_depth': [12], #5, 15, None
    'min_samples_split': [2], #2, 5, 10
    'min_samples_leaf': [1] #1, 4, 6
}

# Define model
rf = RandomForestClassifier()

# Grid
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)

# Fit the grid search to the data
grid_search.fit(X_train_res, y_train_res)

# Print the best parameters
print("Best parameters: ", grid_search.best_params_)

# Save the model
joblib.dump(grid_search.best_estimator_, 'model_final.pkl')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ericm\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ericm\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\Users\ericm\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache

Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


['model_final2.pkl']

In [38]:
# Load the model and evaluate performance on the test set
with open('model_final.pkl', 'rb') as file:
    model = load(file)

y_pred = model.predict(X_test)
print(f"Accuracy score for random forest: {accuracy_score(y_test, y_pred)}")
print(f"f1 score for random_forest: {f1_score(y_test, y_pred, average='macro')}")
print(f"Precision score for random forest: {precision_score(y_test, y_pred, average='macro')}")
print(f"Recall score for random_forest: {recall_score(y_test, y_pred, average='macro')}")
print(f"Standard Deviation of predictions for random_forest: {std(y_pred)}", '\n')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Accuracy score for random forest: 0.7101995988921784
f1 score for random_forest: 0.6386375807513665
Precision score for random forest: 0.6397266511451345
Recall score for random_forest: 0.7256373106835932
Standard Deviation of predictions for random_forest: 0.48490912154020654 



In [39]:
print(model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 12, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


## Feature Ablation

In [6]:
age_columns = ['AGE_12-14','AGE_15-17','AGE_18-20','AGE_21-24','AGE_25-29','AGE_30-34','AGE_35-39','AGE_40-44',
               'AGE_45-49','AGE_50-54','AGE_55-64','AGE_65+']

gender_columns = ['GENDER_Female','GENDER_Male','GENDER_Not known']

race_columns = ['RACE_Alaskan Native','RACE_American Indian','RACE_Asian','RACE_Asian or Pacific Islander',
                'RACE_Black or African American','RACE_Native Hawaiian or Other Pacific Islander','RACE_Not known',
                'RACE_Other single race','RACE_Two or more races','RACE_White']

marriage_columns = ['MARITAL_STATUS_Divorced, widowed','MARITAL_STATUS_Never married','MARITAL_STATUS_Not known',
                    'MARITAL_STATUS_Now Married','MARITAL_STATUS_Separated']

education_columns = ['EDUCATION_1-3 years of college, university, or vocational school',
'EDUCATION_4 years of college, university, BA/BS, some postgraduate study, or more',
'EDUCATION_Grade 12 (or GED)',
'EDUCATION_Grades 9 to 11',
'EDUCATION_Less than one school grade, no schooling, nursery school, or kindergarten to Grade 8',
'EDUCATION_Not known']

employ_columns = ['EMPLOYMENT_AT_ADMISSION_Full time','EMPLOYMENT_AT_ADMISSION_Not in labor force',
                  'EMPLOYMENT_AT_ADMISSION_Not known','EMPLOYMENT_AT_ADMISSION_Part time',
                  'EMPLOYMENT_AT_ADMISSION_Unemployed']

living_columns = ['LIVING_ARRANGEMENT_AT_ADMISSION_Dependent living','LIVING_ARRANGEMENT_AT_ADMISSION_Homeless',
                  'LIVING_ARRANGEMENT_AT_ADMISSION_Independent living','LIVING_ARRANGEMENT_AT_ADMISSION_Not known']

arrests_columns = ['ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION_None','ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION_Not known',
                   'ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION_Once','ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION_Two or more times']

referral_columns = ['PRIMARY_SOURCE_OF_REFERRAL_Alcohol/drug use care provider',
                    'PRIMARY_SOURCE_OF_REFERRAL_Court/criminal justice referral/DUI/DWI',
                    'PRIMARY_SOURCE_OF_REFERRAL_Employer/EAP', 
                    'PRIMARY_SOURCE_OF_REFERRAL_Individual (includes self-referral)',
                    'PRIMARY_SOURCE_OF_REFERRAL_Not known',
                    'PRIMARY_SOURCE_OF_REFERRAL_Other community referral',
                    'PRIMARY_SOURCE_OF_REFERRAL_Other health care provider',
                    'PRIMARY_SOURCE_OF_REFERRAL_School (educational)']

sub_columns = ['PRIMARY_SUBSTANCE_ABUSE_Alcohol','PRIMARY_SUBSTANCE_ABUSE_Barbiturates','PRIMARY_SUBSTANCE_ABUSE_Benzodiazepines',
               'PRIMARY_SUBSTANCE_ABUSE_Cocaine/crack','PRIMARY_SUBSTANCE_ABUSE_Hallucinogens','PRIMARY_SUBSTANCE_ABUSE_Heroin',
               'PRIMARY_SUBSTANCE_ABUSE_Inhalants','PRIMARY_SUBSTANCE_ABUSE_Marijuana/hashish','PRIMARY_SUBSTANCE_ABUSE_Methamphetamine/speed',
               'PRIMARY_SUBSTANCE_ABUSE_Non-prescription methadone','PRIMARY_SUBSTANCE_ABUSE_None','PRIMARY_SUBSTANCE_ABUSE_Not known',
               'PRIMARY_SUBSTANCE_ABUSE_Other amphetamines','PRIMARY_SUBSTANCE_ABUSE_Other drugs','PRIMARY_SUBSTANCE_ABUSE_Other opiates and synthetics',
               'PRIMARY_SUBSTANCE_ABUSE_Other sedatives or hypnotics','PRIMARY_SUBSTANCE_ABUSE_Other stimulants',
               'PRIMARY_SUBSTANCE_ABUSE_Other tranquilizers','PRIMARY_SUBSTANCE_ABUSE_Over-the-counter medications',
               'PRIMARY_SUBSTANCE_ABUSE_PCP']

freq_columns = ['FREQUENCY_OF_USE_Daily use','FREQUENCY_OF_USE_No use in the past month','FREQUENCY_OF_USE_Not known',
                'FREQUENCY_OF_USE_Some use']

first_use_columns = ['AGE_AT_FIRST_USE_11 years and under',
'AGE_AT_FIRST_USE_12-14 years',
'AGE_AT_FIRST_USE_15-17 years',
'AGE_AT_FIRST_USE_18-20 years',
'AGE_AT_FIRST_USE_21-24 years',
'AGE_AT_FIRST_USE_25-29 years',
'AGE_AT_FIRST_USE_30 years and older',
'AGE_AT_FIRST_USE_Not known']

alcohol_columns = ['ALCOHOL_OR_DRUG_ABUSE_Alcohol and other drugs',
'ALCOHOL_OR_DRUG_ABUSE_Alcohol only',
'ALCOHOL_OR_DRUG_ABUSE_None',
'ALCOHOL_OR_DRUG_ABUSE_Other drugs only']

dsm_columns = ['DSM_DIAGNOSIS_Alcohol abuse',
'DSM_DIAGNOSIS_Alcohol dependence',
'DSM_DIAGNOSIS_Alcohol intoxication',
'DSM_DIAGNOSIS_Alcohol-induced disorder',
'DSM_DIAGNOSIS_Anxiety disorders',
'DSM_DIAGNOSIS_Attention deficit/disruptive behavior disorders',
'DSM_DIAGNOSIS_Bipolar disorders',
'DSM_DIAGNOSIS_Cannabis abuse',
'DSM_DIAGNOSIS_Cannabis dependence',
'DSM_DIAGNOSIS_Cocaine abuse',
'DSM_DIAGNOSIS_Cocaine dependence',
'DSM_DIAGNOSIS_Depressive disorders',
'DSM_DIAGNOSIS_Not known',
'DSM_DIAGNOSIS_Opioid abuse',
'DSM_DIAGNOSIS_Opioid dependence',
'DSM_DIAGNOSIS_Other mental health condition',
'DSM_DIAGNOSIS_Other substance abuse',
'DSM_DIAGNOSIS_Other substance dependence',
'DSM_DIAGNOSIS_Schizophrenia/other psychotic disorders',
'DSM_DIAGNOSIS_Substance-induced disorder']

psych_columns = ['PSYCHIATRIC_PROBLEM_No','PSYCHIATRIC_PROBLEM_Not known','PSYCHIATRIC_PROBLEM_Yes']

insur_columns = ['HEALTH_INSURANCE_Medicaid',
'HEALTH_INSURANCE_Medicare, other (e.g. TRICARE, CHAMPUS)',
'HEALTH_INSURANCE_None',
'HEALTH_INSURANCE_Not known',
'HEALTH_INSURANCE_Private insurance, Blue Cross/Blue Shield, HMO']

pay_columns = ['PRIMARY_PAYMENT_METHOD_Medicaid',
'PRIMARY_PAYMENT_METHOD_Medicare',
'PRIMARY_PAYMENT_METHOD_No charge (free, charity, special research, teaching)',
'PRIMARY_PAYMENT_METHOD_Not known',
'PRIMARY_PAYMENT_METHOD_Other',
'PRIMARY_PAYMENT_METHOD_Other government payments',
'PRIMARY_PAYMENT_METHOD_Private insurance (Blue Cross/Blue Shield, other health insurance, workers compensation)',
'PRIMARY_PAYMENT_METHOD_Self-pay']

selfhelp_columns = ['FREQUENCY_OF_SELF_HELP_ATTENDANCE_1-3 times in the past month',
'FREQUENCY_OF_SELF_HELP_ATTENDANCE_4-7 times in the past month',
'FREQUENCY_OF_SELF_HELP_ATTENDANCE_8-30 times in the past month',
'FREQUENCY_OF_SELF_HELP_ATTENDANCE_No attendance',
'FREQUENCY_OF_SELF_HELP_ATTENDANCE_Not known',
'FREQUENCY_OF_SELF_HELP_ATTENDANCE_Some attendance, frequency is unknown']

state_columns = ['STATE_Alabama','STATE_Alaska','STATE_Arizona','STATE_Arkansas','STATE_California','STATE_Colorado',
                 'STATE_Connecticut','STATE_Delaware','STATE_District of Columbia','STATE_Florida','STATE_Georgia',
                 'STATE_Hawaii','STATE_Idaho','STATE_Illinois','STATE_Indiana','STATE_Iowa','STATE_Kansas','STATE_Kentucky',
                 'STATE_Louisiana','STATE_Maine','STATE_Maryland','STATE_Massachusetts','STATE_Michigan','STATE_Minnesota',
                 'STATE_Mississippi','STATE_Missouri','STATE_Montana','STATE_Nebraska','STATE_Nevada','STATE_New Hampshire',
                 'STATE_New Jersey','STATE_New Mexico','STATE_New York','STATE_North Carolina','STATE_North Dakota','STATE_Ohio',
                 'STATE_Oklahoma','STATE_Pennsylvania','STATE_Puerto Rico','STATE_Rhode Island','STATE_South Carolina',
                 'STATE_South Dakota','STATE_Tennessee','STATE_Texas','STATE_Utah','STATE_Vermont','STATE_Virginia',
                 'STATE_Washington','STATE_Wisconsin','STATE_Wyoming']

feature_lists = [age_columns, gender_columns, race_columns, marriage_columns, education_columns, employ_columns,
                living_columns, arrests_columns, referral_columns, sub_columns, freq_columns, first_use_columns,
                alcohol_columns, dsm_columns, psych_columns, insur_columns, pay_columns, selfhelp_columns, state_columns]

In [7]:
# Loop through each feature group and remove one at a time to see impact on model performance
for x in feature_lists:
    temp_df = features_one_hot.drop(columns = x)
    
    X_train_abl, X_test_abl, y_train_abl, y_test_abl = train_test_split(temp_df, target, test_size=0.2, random_state=42)
    ros = RandomOverSampler(random_state=42)
    X_train_res_abl, y_train_res_abl = ros.fit_resample(X_train_abl, y_train_abl)
    
    # Define the parameter grid
    param_grid = {
        'n_estimators': [100], #100, 200, 400, 
        'max_depth': [12], #5, 15, None
        'min_samples_split': [2], #2,10
        'min_samples_leaf': [1] #1,4
    }

    # Define model
    rf_abl = RandomForestClassifier()

    # Grid
    grid_search_abl = GridSearchCV(estimator=rf_abl, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)

    # Fit the grid search to the data
    grid_search_abl.fit(X_train_res_abl, y_train_res_abl)

    # Best params
    model = grid_search_abl.best_estimator_

    y_pred_abl = model.predict(X_test_abl)
    
    print(f"Accuracy score for random forest removing {x}: {accuracy_score(y_test_abl, y_pred_abl)}")
        
    

Fitting 5 folds for each of 1 candidates, totalling 5 fits
