In [12]:
# Import Required Packages
from types import SimpleNamespace

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


Unnamed: 0,DISYR,CASEID,STFIPS,SERVICES,LOS,NOPRIOR,SERVICES_D,REASON,AGE,ALCFLG,...,BARBFLG_Recoded,SEDHPFLG_Recoded,INHFLG_Recoded,OTCFLG_Recoded,OTHERFLG_Recoded,DIVISION_Recoded,ALCDRUG_Recoded,Completion_Status,Length_of_Stay,Age_Group
0,2020,1243074,2,7,35,0,7,3,6,1,...,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Pacific,Alcohol and other drugs,Incomplete,121-180 days,30–34 years
1,2020,1168758,2,7,29,0,7,1,6,1,...,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Pacific,Alcohol only,Completed,1-30 days,30–34 years
2,2020,1150846,2,7,36,0,7,7,1,1,...,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Pacific,Alcohol and other drugs,Incomplete,181-365 days,12–14 years
3,2020,1098248,2,7,35,0,7,3,8,1,...,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Substance reported,Pacific,Alcohol and other drugs,Incomplete,121-180 days,40–44 years
4,2020,1193448,2,7,35,0,7,3,7,0,...,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Substance not reported,Pacific,Other drugs only,Incomplete,121-180 days,35–39 years


In [None]:
# Reading clean file
#
# `teds_no_prior_state_pop_complete.eda` is an attribute reference, not a single
# identifier (Python names cannot contain "."). Assigning to it raises NameError
# unless `teds_no_prior_state_pop_complete` already exists, so create a plain
# namespace object first; every later cell keeps using the same dotted name.
teds_no_prior_state_pop_complete = SimpleNamespace()

# Load the cleaned CSV into a DataFrame stored on the namespace
teds_no_prior_state_pop_complete.eda = pd.read_csv("teds_no_prior_state_pop_complete.eda.csv")

# Preview the first rows as the cell output
teds_no_prior_state_pop_complete.eda.head()

In [6]:
# Summarize the loaded frame: column names, non-null counts and dtypes
teds_no_prior_state_pop_complete.eda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503107 entries, 0 to 503106
Data columns (total 61 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   DISYR                       503107 non-null  int64  
 1   CASEID                      503107 non-null  int64  
 2   STFIPS                      503107 non-null  int64  
 3   SERVICES                    503107 non-null  int64  
 4   LOS                         503107 non-null  int64  
 5   NOPRIOR                     503107 non-null  int64  
 6   SERVICES_D                  503107 non-null  int64  
 7   REASON                      503107 non-null  int64  
 8   AGE                         503107 non-null  int64  
 9   ALCFLG                      503107 non-null  int64  
 10  COKEFLG                     503107 non-null  int64  
 11  MARFLG                      503107 non-null  int64  
 12  HERFLG                      503107 non-null  int64  
 13  METHFLG       

In [8]:
# Summarize the frame again: column names, non-null counts and dtypes.
# NOTE(review): the saved output of this cell lists 62 columns and its execution
# count is later than that of the `pop_per_sq_mil` cell, so it appears to have
# been run after that column was added even though it sits before it in the
# notebook — confirm the intended cell order (Restart & Run All would show 61 here).
teds_no_prior_state_pop_complete.eda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503107 entries, 0 to 503106
Data columns (total 62 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   DISYR                       503107 non-null  int64  
 1   CASEID                      503107 non-null  int64  
 2   STFIPS                      503107 non-null  int64  
 3   SERVICES                    503107 non-null  int64  
 4   LOS                         503107 non-null  int64  
 5   NOPRIOR                     503107 non-null  int64  
 6   SERVICES_D                  503107 non-null  int64  
 7   REASON                      503107 non-null  int64  
 8   AGE                         503107 non-null  int64  
 9   ALCFLG                      503107 non-null  int64  
 10  COKEFLG                     503107 non-null  int64  
 11  MARFLG                      503107 non-null  int64  
 12  HERFLG                      503107 non-null  int64  
 13  METHFLG       

In [7]:
# Population density per row: state population divided by state area (square miles)
teds_no_prior_state_pop_complete.eda['pop_per_sq_mil'] = (
    teds_no_prior_state_pop_complete.eda['state_population']
    .div(teds_no_prior_state_pop_complete.eda['state_area_sq_miles'])
)


In [9]:
# Columns carried forward into teds_final, listed in the order they should appear
columns_to_keep = [
    # State-level context
    'state',
    'total_treatment_facilities',
    # Service setting labels
    'SERVICES_Recoded',
    'SERVICES_D_Recoded',
    # Substance flags ("Substance reported" / "Substance not reported")
    'ALCFLG_Recoded',
    'COKEFLG_Recoded',
    'MARFLG_Recoded',
    'HERFLG_Recoded',
    'METHFLG_Recoded',
    'OPSYNFLG_Recoded',
    'PCPFLG_Recoded',
    'HALLFLG_Recoded',
    'MTHAMFLG_Recoded',
    'AMPHFLG_Recoded',
    'STIMFLG_Recoded',
    'BENZFLG_Recoded',
    'TRNQFLG_Recoded',
    'BARBFLG_Recoded',
    'SEDHPFLG_Recoded',
    'INHFLG_Recoded',
    'OTCFLG_Recoded',
    'OTHERFLG_Recoded',
    # Region and substance-mix labels
    'DIVISION_Recoded',
    'ALCDRUG_Recoded',
    # Outcome and remaining descriptors
    'Completion_Status',
    'Length_of_Stay',
    'Age_Group',
    'pop_per_sq_mil',
]

# Keep every row, restricted to the columns named above
teds_final = teds_no_prior_state_pop_complete.eda.loc[:, columns_to_keep]


In [13]:
# Label-valued columns that get expanded into indicator (dummy) columns
categorical_columns = [
    'state',
    'SERVICES_Recoded',
    'SERVICES_D_Recoded',
    'ALCFLG_Recoded',
    'COKEFLG_Recoded',
    'MARFLG_Recoded',
    'HERFLG_Recoded',
    'METHFLG_Recoded',
    'OPSYNFLG_Recoded',
    'PCPFLG_Recoded',
    'HALLFLG_Recoded',
    'MTHAMFLG_Recoded',
    'AMPHFLG_Recoded',
    'STIMFLG_Recoded',
    'BENZFLG_Recoded',
    'TRNQFLG_Recoded',
    'BARBFLG_Recoded',
    'SEDHPFLG_Recoded',
    'INHFLG_Recoded',
    'OTCFLG_Recoded',
    'OTHERFLG_Recoded',
    'DIVISION_Recoded',
    'ALCDRUG_Recoded',
    'Length_of_Stay',
    'Age_Group',
]

# One-hot encode the columns above; drop_first=True leaves out the first level
# of each column so that level acts as the reference category
teds_final_with_dummies = pd.get_dummies(
    data=teds_final,
    columns=categorical_columns,
    drop_first=True,
)

# Continuous features to be standardized
numeric_columns = ['total_treatment_facilities', 'pop_per_sq_mil']

# Scaler object is kept at notebook level so later cells can reuse it
scaler = StandardScaler()

# Fit the scaler on the numeric columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the fitted statistics —
# confirm whether fitting on the training rows only is wanted.
teds_final_with_dummies[numeric_columns] = scaler.fit_transform(
    teds_final_with_dummies[numeric_columns]
)


In [28]:
teds_final_with_dummies.head()

Unnamed: 0,total_treatment_facilities,Completion_Status,pop_per_sq_mil,state_Alaska,state_Arizona,state_Arkansas,state_California,state_Colorado,state_Connecticut,state_District of Columbia,...,Age_Group_18–20 years,Age_Group_21–24 years,Age_Group_25–29 years,Age_Group_30–34 years,Age_Group_35–39 years,Age_Group_40–44 years,Age_Group_45–49 years,Age_Group_50–54 years,Age_Group_55–64 years,Age_Group_65–95 years
0,-1.09923,0,-0.558452,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,-1.09923,1,-0.558452,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,-1.09923,0,-0.558452,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.09923,0,-0.558452,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,-1.09923,0,-0.558452,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [26]:
# Initializing the LabelEncoder
label_encoder = LabelEncoder()

# Encode from the untouched string labels in `teds_final`, not from the column
# that this cell overwrites. Encoding the already-encoded column and then
# applying `1 - x` would flip the labels back every time the cell is re-run.
encoded_status = label_encoder.fit_transform(teds_final['Completion_Status'])

# The `1 - x` swap below is only meaningful for exactly two classes
if len(label_encoder.classes_) != 2:
    raise ValueError(
        "Completion_Status must have exactly two classes, found: "
        f"{list(label_encoder.classes_)}"
    )

# Swapping the labels.
# LabelEncoder assigns codes in sorted class order ('Completed' -> 0,
# 'Incomplete' -> 1), so after the swap:
#   'Completed'  -> 1
#   'Incomplete' -> 0
# Because of the swap, `label_encoder.inverse_transform` must be given
# `1 - value` to recover the original strings.
teds_final_with_dummies['Completion_Status'] = 1 - encoded_status


In [29]:
# Separate the outcome (y) from the predictor matrix (X)
y = teds_final_with_dummies['Completion_Status']
X = teds_final_with_dummies.drop('Completion_Status', axis='columns')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
