In [68]:
# Import Dependencies
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [69]:
# Read the cleaned dataset from disk and preview the first rows.
file_path = "clean_data.csv"
df2 = pd.read_csv(filepath_or_buffer=file_path)
df2.head(5)

Unnamed: 0,id,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,1,Caucasian,Female,5,6,25,1,1,41,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Caucasian,Female,15,1,1,7,3,59,0,...,0,0,0,0,0,0,3,1,1,1
2,3,AfricanAmerican,Female,25,1,1,7,2,11,5,...,0,0,2,0,0,0,0,0,1,0
3,4,Caucasian,Male,35,1,1,7,2,44,1,...,0,0,0,0,0,0,3,1,1,0
4,5,Caucasian,Male,45,1,1,7,1,51,0,...,0,0,2,0,0,0,2,1,1,0


In [70]:
# Encoding the Data
# One-hot encode the `race` column and attach the indicator columns to df2.
from sklearn.preprocessing import OneHotEncoder

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so try the new
# keyword first and fall back to the old one for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Encode Column Race (the encoder expects a 2-D array, hence the reshape)
race_encoded = enc.fit_transform(df2.race.values.reshape(-1, 1))

# Name the encoded columns "race_<category>". Building the names from
# `categories_` avoids `get_feature_names`, which was removed in
# scikit-learn 1.2, while producing the same labels.
encode_df = pd.DataFrame(
    race_encoded,
    columns=[f"race_{category}" for category in enc.categories_[0]],
    index=df2.index,  # keep rows aligned with df2 for the index-based merge below
)

# Merge Dataframes. Indicator columns left over from an earlier run of this
# cell are dropped first so that re-running does not create "_x"/"_y" duplicates.
df2 = df2.drop(columns=encode_df.columns, errors="ignore").merge(
    encode_df, left_index=True, right_index=True
)



In [71]:
# Remove the raw categorical columns and the row identifier from the frame.
# NOTE(review): in the cells visible here only `race` is one-hot encoded;
# `gender` is dropped without an encoded replacement — confirm this is intended.
df2 = df2.drop(columns=['race', 'gender', 'id'])

In [72]:
# Separate the target column from the feature matrix.
X = df2.drop(columns=['readmitted'])  # every remaining column is used as a feature
y = df2.loc[:, 'readmitted']          # target labels

In [73]:
# Split into an 80% training partition and a held-out test partition.
# The split is stratified on y and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=6,
    stratify=y,
)

In [74]:
# Oversample the training partition with SMOTE; X_test / y_test are not touched here.
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [75]:
# Build a 128-tree random forest (seeded for reproducibility) and train it on
# the resampled training data. fit() returns the estimator itself, so
# construction and fitting are chained into one statement.
model = RandomForestClassifier(n_estimators=128, random_state=6).fit(
    X_resampled, y_resampled
)

In [None]:
# Predict a class label for every row of the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
# NOTE(review): the saved output of this cell (0.3769) disagrees with the accuracy
# implied by the saved confusion matrix and classification report (~0.59). It
# matches the macro-average recall instead, so the stored output looks stale —
# re-run the notebook to confirm.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.37689065279292855

In [None]:
# Calculating the confusion matrix.
# The class labels are taken from the data instead of being hard-coded, and are
# passed to confusion_matrix explicitly, so the row/column names always line up
# with the matrix no matter how many classes are present.
class_labels = sorted(set(y_test).union(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix
# (rows: true class, columns: predicted class).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,7231,1226,140
Actual 1,3227,1129,92
Actual 2,934,280,45


In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.63      0.84      0.72      8597
           1       0.43      0.25      0.32      4448
           2       0.16      0.04      0.06      1259

    accuracy                           0.59     14304
   macro avg       0.41      0.38      0.37     14304
weighted avg       0.53      0.59      0.54     14304

