In [1]:
#pip install imblearn


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
import joblib
from joblib import load

In [3]:

# %cd ..
# %cd MADS/Capstone/MADS-Capstone
%pwd

'c:\\Users\\ericm\\OneDrive\\Documents\\MADS\\Capstone\\MADS-Capstone'

In [4]:
df = pd.read_csv("cleaned_data.csv")

df.head()

Unnamed: 0,AGE,GENDER,RACE,MARITAL_STATUS,EDUCATION,EMPLOYMENT_AT_ADMISSION,LIVING_ARRANGEMENT_AT_ADMISSION,ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION,SERVICES_AT_ADMISSION,REASON_FOR_DISCHARGE,...,PRIMARY_SUBSTANCE_ABUSE,FREQUENCY_OF_USE,AGE_AT_FIRST_USE,ALCOHOL_OR_DRUG_ABUSE,DSM_DIAGNOSIS,PSYCHIATRIC_PROBLEM,HEALTH_INSURANCE,PRIMARY_PAYMENT_METHOD,FREQUENCY_OF_SELF_HELP_ATTENDANCE,STATE
0,25-29,Female,White,Never married,Grade 12 (or GED),Part time,Independent living,,"Ambulatory, non-intensive outpatient",Terminated by facility,...,Alcohol,Some use,15-17 years,Alcohol only,Not known,Yes,Medicaid,Medicaid,No attendance,Alaska
1,45-49,Female,White,"Divorced, widowed",Grade 12 (or GED),Part time,Independent living,,"Ambulatory, non-intensive outpatient",Treatment completed,...,Alcohol,Daily use,Not known,Alcohol and other drugs,Alcohol dependence,Yes,Medicaid,Medicaid,No attendance,Alaska
2,45-49,Female,White,"Divorced, widowed",Grade 12 (or GED),Full time,Independent living,,"Ambulatory, non-intensive outpatient",Terminated by facility,...,Methamphetamine/speed,No use in the past month,30 years and older,Other drugs only,Depressive disorders,No,,Medicaid,8-30 times in the past month,Alaska
3,35-39,Male,White,Never married,"4 years of college, university, BA/BS, some po...",Full time,Independent living,,"Ambulatory, non-intensive outpatient",Treatment completed,...,Alcohol,Some use,18-20 years,Alcohol only,Alcohol abuse,No,"Medicare, other (e.g. TRICARE, CHAMPUS)","Private insurance (Blue Cross/Blue Shield, oth...",No attendance,Alaska
4,65+,Male,Alaskan Native,"Divorced, widowed",Grade 12 (or GED),Not in labor force,Independent living,Once,"Ambulatory, non-intensive outpatient",Treatment completed,...,Marijuana/hashish,Daily use,Not known,Alcohol and other drugs,Alcohol dependence,Yes,Medicaid,Other,No attendance,Alaska


In [5]:
print(df.columns)
print(df.shape)

Index(['AGE', 'GENDER', 'RACE', 'MARITAL_STATUS', 'EDUCATION',
       'EMPLOYMENT_AT_ADMISSION', 'LIVING_ARRANGEMENT_AT_ADMISSION',
       'ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION', 'SERVICES_AT_ADMISSION',
       'REASON_FOR_DISCHARGE', 'PRIMARY_SOURCE_OF_REFERRAL',
       'PRIOR_TREATMENT_EPISODES', 'PRIMARY_SUBSTANCE_ABUSE',
       'FREQUENCY_OF_USE', 'AGE_AT_FIRST_USE', 'ALCOHOL_OR_DRUG_ABUSE',
       'DSM_DIAGNOSIS', 'PSYCHIATRIC_PROBLEM', 'HEALTH_INSURANCE',
       'PRIMARY_PAYMENT_METHOD', 'FREQUENCY_OF_SELF_HELP_ATTENDANCE', 'STATE'],
      dtype='object')
(6441469, 22)


## Feature Engineering

In [6]:
print(df['REASON_FOR_DISCHARGE'].value_counts(), '\n')
print(df['SERVICES_AT_ADMISSION'].value_counts(), '\n')
print(df['PRIOR_TREATMENT_EPISODES'].value_counts())

Treatment completed                                     2697876
Dropped out of treatment                                1609081
Transferred to another treatment program or facility    1380826
Terminated by facility                                   367063
Other                                                    271553
Incarcerated                                              99882
Death                                                     15188
Name: REASON_FOR_DISCHARGE, dtype: int64 

Ambulatory, non-intensive outpatient                3232487
Detox, 24-hour, free-standing residential            961470
Ambulatory, intensive outpatient                     868780
Rehab/residential, short term (30 days or fewer)     666899
Rehab/residential, long term (more than 30 days)     486575
Detox, 24-hour, hospital inpatient                   158846
Ambulatory, detoxification                            50272
Rehab/residential, hospital (non-detox)               16140
Name: SERVICES_AT_ADMISSION, 

In [7]:
df = df[df['REASON_FOR_DISCHARGE'] != 'Transferred to another treatment program or facility']

In [8]:
# Create target variable. If the patient completed treatment and had no prior treatment episodes, they are considered a success. Otherwise, they are considered a failure.
df['SUCCESSFUL_TREATMENT'] = df.apply(lambda row: 1 if row['REASON_FOR_DISCHARGE'] == 'Treatment completed' and row['PRIOR_TREATMENT_EPISODES'] == "No prior treatment episode" else 0, axis=1)

print(df['SUCCESSFUL_TREATMENT'].value_counts())

0    4166388
1     894255
Name: SUCCESSFUL_TREATMENT, dtype: int64


## Modeling

### Initial evaluation/model baselines

In [9]:
df_one_hot = pd.get_dummies(df)
df_one_hot.shape

# for col in df_one_hot.columns:
#     print(col)

(5060643, 203)

In [10]:
target = df['SUCCESSFUL_TREATMENT']
features = df.drop(['REASON_FOR_DISCHARGE', 'PRIOR_TREATMENT_EPISODES', 'SUCCESSFUL_TREATMENT'], axis=1)
features_one_hot = pd.get_dummies(features)

X_train, X_test, y_train, y_test = train_test_split(features_one_hot, target, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(4048514, 193) (1012129, 193) (4048514,) (1012129,)


In [11]:
# for col in X_train.columns:
#     print(col)

In [12]:
# models_to_train = {
#     "naive_bayes": GaussianNB(),
#     "logistic_regression": LogisticRegression(max_iter=1000),
#     "random_forest": RandomForestClassifier()
# }

# for model_name, model in models_to_train.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"Accuracy score for {model_name}: {accuracy_score(y_test, y_pred)}")
#     print(f"f1 score for {model_name}: {f1_score(y_test, y_pred)}", '\n')

## Balance the data 

In [13]:
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(y_train_res.value_counts())

0    3334133
1    3334133
Name: SUCCESSFUL_TREATMENT, dtype: int64


In [14]:
# for model_name, model in models_to_train.items():
#     model.fit(X_train_res, y_train_res)
#     y_pred = model.predict(X_test)
#     print(f"Accuracy score for {model_name}: {accuracy_score(y_test, y_pred)}")
#     print(f"Confusion matrix: \n{confusion_matrix(y_test, y_pred)}\n")

## Tuning

In [22]:
# Define the parameter grid
param_grid = {
    'n_estimators': [500], #100, 200, 400, 
    'max_depth': [10], #5, 15, None
    'min_samples_split': [2], #2,10
    'min_samples_leaf': [2] #1,4
}

# Define model
rf = RandomForestClassifier()

# Grid
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)

# Fit the grid search to the data
grid_search.fit(X_train_res, y_train_res)

# Print the best parameters
print("Best parameters: ", grid_search.best_params_)

# Save the model
joblib.dump(grid_search.best_estimator_, 'best_random_forest.pkl')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


MemoryError: Unable to allocate 1.20 GiB for an array with shape (193, 6668266) and data type uint8

In [16]:
%cd ..
%cd MADS-CAPSTONE

c:\Users\ericm\OneDrive\Documents\MADS\Capstone
c:\Users\ericm\OneDrive\Documents\MADS\Capstone\MADS-CAPSTONE


In [17]:
with open('best_random_forest.pkl', 'rb') as file:
    model = load(file)

y_pred = model.predict(X_test)
print(f"Accuracy score for random forest: {accuracy_score(y_test, y_pred)}")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Feature names unseen at fit time:
- AGE_AT_FIRST_USE_Not known
- ALCOHOL_OR_DRUG_ABUSE_None
- ARRESTS_IN_30_DAYS_PRIOR_TO_ADMISSION_Not known
- DSM_DIAGNOSIS_Not known
- EDUCATION_Not known
- ...
Feature names seen at fit time, yet now missing:
- ARRESTS_IN_30_DAYS_PRIOR_TO_DISCHARGE_None
- ARRESTS_IN_30_DAYS_PRIOR_TO_DISCHARGE_Once
- ARRESTS_IN_30_DAYS_PRIOR_TO_DISCHARGE_Two or more times
- EMPLOYMENT_AT_DISCHARGE_Full time
- EMPLOYMENT_AT_DISCHARGE_Not in labor force
- ...



ValueError: X has 193 features, but RandomForestClassifier is expecting 215 features as input.

In [18]:
print(model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
