In [1]:
#pip install imblearn

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import joblib

In [3]:

#%cd MADS-Capstone

In [4]:
df = pd.read_csv("cleaned_data.csv")

df.head()

Unnamed: 0,YEAR_OF_DISCHARGE,AGE,GENDER,RACE,MARITAL_STATUS,EDUCATION,EMPLOYMENT_AT_ADMISSION,EMPLOYMENT_AT_DISCHARGE,LIVING_ARRANGEMENT_AT_ADMISSION,LIVING_ARRANGEMENT_AT_DISCHARGE,...,PRIMARY_SUBSTANCE_ABUSE,FREQUENCY_OF_USE,AGE_AT_FIRST_USE,ALCOHOL_OR_DRUG_ABUSE,DSM_DIAGNOSIS,PSYCHIATRIC_PROBLEM,HEALTH_INSURANCE,PRIMARY_PAYMENT_METHOD,FREQUENCY_OF_SELF_HELP_ATTENDANCE,STATE
0,2017,45-49,Female,White,"Divorced, widowed",Grade 12 (or GED),Full time,Full time,Independent living,Independent living,...,Methamphetamine/speed,No use in the past month,30 years and older,Other drugs only,Depressive disorders,No,,Medicaid,8-30 times in the past month,Alaska
1,2017,35-39,Male,White,Never married,"4 years of college, university, BA/BS, some po...",Full time,Full time,Independent living,Independent living,...,Alcohol,Some use,18-20 years,Alcohol only,Alcohol abuse,No,"Medicare, other (e.g. TRICARE, CHAMPUS)","Private insurance (Blue Cross/Blue Shield, oth...",No attendance,Alaska
2,2017,30-34,Male,Two or more races,Never married,Grade 12 (or GED),Full time,Full time,Dependent living,Independent living,...,Alcohol,Some use,11 years and under,Alcohol and other drugs,Alcohol dependence,No,,Medicaid,No attendance,Alaska
3,2017,25-29,Female,Alaskan Native,Now Married,Grades 9 to 11,Unemployed,Unemployed,Independent living,Independent living,...,Other drugs,Daily use,18-20 years,Alcohol and other drugs,Alcohol dependence,No,Medicaid,Medicaid,No attendance,Alaska
4,2017,25-29,Female,Alaskan Native,Now Married,Grades 9 to 11,Unemployed,Unemployed,Independent living,Independent living,...,Alcohol,Some use,12-14 years,Alcohol only,Alcohol dependence,Yes,Medicaid,Medicaid,No attendance,Alaska


In [5]:
print(df.columns)
print(df.shape)

Index(['YEAR_OF_DISCHARGE', 'AGE', 'GENDER', 'RACE', 'MARITAL_STATUS',
       'EDUCATION', 'EMPLOYMENT_AT_ADMISSION', 'EMPLOYMENT_AT_DISCHARGE',
       'LIVING_ARRANGEMENT_AT_ADMISSION', 'LIVING_ARRANGEMENT_AT_DISCHARGE',
       'ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION',
       'ARRESTS_IN_30_DAYS_PRIOR_TO_DISCHARGE', 'SERVICES_AT_ADMISSION',
       'SERVICES_AT_DISCHARGE', 'REASON_FOR_DISCHARGE', 'LENGTH_OF_STAY',
       'PRIMARY_SOURCE_OF_REFERRAL', 'PRIOR_TREATMENT_EPISODES',
       'PRIMARY_SUBSTANCE_ABUSE', 'FREQUENCY_OF_USE', 'AGE_AT_FIRST_USE',
       'ALCOHOL_OR_DRUG_ABUSE', 'DSM_DIAGNOSIS', 'PSYCHIATRIC_PROBLEM',
       'HEALTH_INSURANCE', 'PRIMARY_PAYMENT_METHOD',
       'FREQUENCY_OF_SELF_HELP_ATTENDANCE', 'STATE'],
      dtype='object')
(1035841, 28)


## Feature Engineering

In [6]:
print(df['REASON_FOR_DISCHARGE'].value_counts(), '\n')
print(df['SERVICES_AT_DISCHARGE'].value_counts(), '\n')
print(df['PRIOR_TREATMENT_EPISODES'].value_counts())

Treatment completed                                     382503
Transferred to another treatment program or facility    345744
Dropped out of treatment                                180620
Terminated by facility                                   57002
Other                                                    52134
Incarcerated                                             16166
Death                                                     1672
Name: REASON_FOR_DISCHARGE, dtype: int64 

Ambulatory, non-intensive outpatient                465902
Ambulatory, intensive outpatient                    209535
Rehab/residential, short term (30 days or fewer)    139983
Detox, 24-hour, free-standing residential           131314
Rehab/residential, long term (more than 30 days)     57799
Ambulatory, detoxification                           18940
Detox, 24-hour, hospital inpatient                   11201
Rehab/residential, hospital (non-detox)               1167
Name: SERVICES_AT_DISCHARGE, dtype: int64 



In [7]:
df = df[df['REASON_FOR_DISCHARGE'] != 'Transferred to another treatment program or facility']

In [8]:
# Create target variable. If the patient completed treatment and had no prior treatment episodes, they are considered a success. Otherwise, they are considered a failure.
df['SUCCESSFUL_TREATMENT'] = df.apply(lambda row: 1 if row['REASON_FOR_DISCHARGE'] == 'Treatment completed' and row['PRIOR_TREATMENT_EPISODES'] == "No prior treatment episode" else 0, axis=1)

print(df['SUCCESSFUL_TREATMENT'].value_counts())

0    573270
1    116827
Name: SUCCESSFUL_TREATMENT, dtype: int64


## Modeling

### Initial evaluation/model baselines

In [9]:
df_one_hot = pd.get_dummies(df)
df_one_hot.shape

(690097, 224)

In [10]:
target = df['SUCCESSFUL_TREATMENT']
features = df.drop(['REASON_FOR_DISCHARGE', 'PRIOR_TREATMENT_EPISODES', 'SUCCESSFUL_TREATMENT'], axis=1)
features_one_hot = pd.get_dummies(features)

X_train, X_test, y_train, y_test = train_test_split(features_one_hot, target, test_size=0.2, random_state=42)


In [11]:
# models_to_train = {
#     "naive_bayes": GaussianNB(),
#     "logistic_regression": LogisticRegression(max_iter=1000),
#     "random_forest": RandomForestClassifier()
# }

# for model_name, model in models_to_train.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"Accuracy score for {model_name}: {accuracy_score(y_test, y_pred)}")
#     print(f"f1 score for {model_name}: {f1_score(y_test, y_pred)}", '\n')

## Balance the data 

In [12]:
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(y_train_res.value_counts())

0    458567
1    458567
Name: SUCCESSFUL_TREATMENT, dtype: int64


In [13]:
# for model_name, model in models_to_train.items():
#     model.fit(X_train_res, y_train_res)
#     y_pred = model.predict(X_test)
#     print(f"Accuracy score for {model_name}: {accuracy_score(y_test, y_pred)}")
#     print(f"Confusion matrix: \n{confusion_matrix(y_test, y_pred)}\n")

## Tuning

In [18]:
# Define the parameter grid
param_grid = {
    'n_estimators': [250, 500], #100, 200, 400, 
    'max_depth': [5,10], #5, 15, None
    'min_samples_split': [2, 5], #2,10
    'min_samples_leaf': [2,4] #1,4
}

# Define model
rf = RandomForestClassifier()

# Grid
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)

# Fit the grid search to the data
grid_search.fit(X_train_res, y_train_res)

# Print the best parameters
print("Best parameters: ", grid_search.best_params_)

# Save the model
joblib.dump(grid_search.best_estimator_, 'best_random_forest.pkl')

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 5/5] END max_depth=5, min_samples_leaf=2, min_samples_split=2, n_estimators=250;, score=0.655 total time= 5.3min
[CV 1/5] END max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=250;, score=0.657 total time= 5.0min
[CV 2/5] END max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=500;, score=0.659 total time=10.2min
[CV 3/5] END max_depth=5, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=0.659 total time= 9.7min
[CV 2/5] END max_depth=5, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.659 total time= 9.8min
[CV 1/5] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=500;, score=0.691 total time=19.0min
[CV 3/5] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=500;, score=0.690 total time=21.0min
[CV 5/5] END max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=500;, score=0.688 total time=20.5min


['best_random_forest.pkl']