<a href="https://colab.research.google.com/github/markaljm/Diabetes-130-US-hospitals-for-years-1999-2008-Data-Set/blob/main/Diabetes_Readmission_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Libraries**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# **Import Data**

In [None]:
# Load the raw encounter table from the current working directory.
data = pd.read_csv("./diabetic_data.csv")

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): the CSV in this notebook is read from "./", not from the
# mounted drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

In [None]:
# Preview the first rows of the loaded frame.
data.head()

# **Data Cleaning**

In [None]:
## Correlation

# Pairwise correlations of the numeric columns only. The frame also holds
# string columns (e.g. 'readmitted'), and DataFrame.corr() raises on those
# under pandas >= 2.0, so the numeric subset is selected explicitly.
corrmat = data.select_dtypes(include="number").corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the matrix computed above instead of recomputing the same correlations.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlBu")

In [None]:
# For every column, print its name and the number of rows holding the '?' placeholder.
for column in data.columns:
    placeholder_rows = data.loc[data[column] == '?']
    print(column, len(placeholder_rows))

In [None]:
# Count the encounters per readmission label.
data['readmitted'].value_counts()

In [None]:
# Bar chart of the number of encounters per original readmission label.
label_counts = data['readmitted'].value_counts()
ax = sns.barplot(x=label_counts.index, y=label_counts)
plt.title('Class Distribution \n', size=12)
plt.xlabel('labels', size=12)
plt.ylabel('# of Readmitted', size=12)
plt.show()

In [None]:
# Inspect the distinct readmission labels. '<30' and '>30' are merged into a
# single class further down to turn this into a two-class problem.
data['readmitted'].unique()

In [None]:
# Binary version of the target: '<30' and '>30' both become 'Yes', every
# other label becomes 'No'.

def check_label(text):
    """Return 'Yes' for a '<30' or '>30' readmission label, otherwise 'No'."""
    return 'Yes' if text in ('>30', '<30') else 'No'
    
# Derive the two-class target column from the original readmission label.
data['readmitted_2'] = data['readmitted'].map(check_label)

In [None]:
# Count plot of the two-class readmission target.
ax = sns.countplot(data=data, x='readmitted_2')
plt.title('Distribution of Readmission Class  \n\n', size=12)
plt.xlabel('Readmitted', size=12)
plt.ylabel('Count', size=12)
plt.xticks(rotation=90, size=12)
plt.show()

In [None]:
# Recode the '?' placeholder in `race` as 'Other'.
unknown_race = data['race'] == '?'
data['race'] = data['race'].mask(unknown_race, 'Other')

In [None]:
# Bar chart of the number of encounters per race category.
race_counts = data['race'].value_counts()
ax = sns.barplot(x=race_counts.index, y=race_counts)
plt.title('Distribution of Race of Patients \n', size=12)
plt.xlabel('Race', size=12)
plt.ylabel('Count', size=12)
plt.xticks(rotation=90, size=12)
plt.show()

In [None]:
# Count the encounters per gender value.
data['gender'].value_counts()

In [None]:
# Remove the encounters whose gender is recorded as "Unknown/Invalid".
invalid_gender_rows = data.index[data['gender'] == 'Unknown/Invalid']
data.drop(index=invalid_gender_rows, inplace=True)

In [None]:
# Renumber the rows from 0 after the row removal; drop=True discards the old
# index instead of keeping it as a column.
data.reset_index(inplace = True, drop = True)

In [None]:
# Re-check the gender counts after removing the "Unknown/Invalid" rows.
data['gender'].value_counts()

In [None]:
# Preview the frame after the cleaning steps so far.
data.head()

# **Imputed Weight**

In [None]:
# Turn the '?' placeholders into NaN on the frame that is already in memory.
# Re-reading the CSV at this point would silently discard the cleaning done
# earlier in this notebook: the race '?' -> 'Other' recode, the removal of the
# "Unknown/Invalid" gender rows and the `readmitted_2` column.
data = data.replace("?", np.nan)

In [None]:
# Target of the weight model is the raw `weight` column; the predictors are
# the two demographic columns.
y = data["weight"]
weight_predictors = ["race", "gender"]
X = data[weight_predictors]

In [None]:
# Map each weight-bin label to a representative numeric value.
# The original mapping stopped at "[75-100)", so every heavier bin was turned
# into NaN by Series.map and then treated as a missing weight. The additional
# bins follow the 25-unit pattern of the existing keys and the bin list in the
# UCI dataset description — TODO confirm against data["weight"].unique().
# NOTE(review): "[0-25)" maps to 10 while the other bins use their midpoint
# (which would be 12.5); kept unchanged here — confirm which is intended.
# ">200" is open-ended; 212.5 simply continues the midpoint spacing.
weights_map = {
    "[0-25)": 10,
    "[25-50)": 37.5,
    "[50-75)": 62.5,
    "[75-100)": 87.5,
    "[100-125)": 112.5,
    "[125-150)": 137.5,
    "[150-175)": 162.5,
    "[175-200)": 187.5,
    ">200": 212.5,
}

In [None]:
# Convert the weight-bin labels to numbers via weights_map.
# Any value without an entry in weights_map (including missing weights)
# becomes NaN.
y = y.map(weights_map)

In [None]:
# Fill the NaN weights with the mean of the available weights.
# SimpleImputer expects a 2-D input, hence the single-column reshape.
weight_column = y.values.reshape(-1, 1)
imputer = SimpleImputer(strategy="mean")
y_imputed = imputer.fit_transform(weight_column)

In [None]:
# One-hot encode both predictor columns; any other column would be passed
# through unchanged (X only holds these two).
one_hot_step = ("encoder", OneHotEncoder(), ["race", "gender"])
ct = ColumnTransformer(transformers=[one_hot_step], remainder="passthrough")
X_encoded = ct.fit_transform(X)

In [None]:
# Train a linear regression model to predict the weights based on race and gender
# NOTE(review): y_imputed already has its gaps filled with the column mean, so
# the model is fitted on those filled-in targets as well as on the observed
# ones — confirm this is intended.
regressor = LinearRegression()
regressor.fit(X_encoded, y_imputed)

In [None]:
# Predict a weight for every row of X_encoded (not only the rows whose weight
# is missing).
weights_imputed = regressor.predict(X_encoded)

In [None]:
# Add the weight column back to the DataFrame: keep every observed (mapped)
# weight and use the regression prediction only where the weight is missing.
# Writing the raw predictions to all rows would overwrite known weights with
# model output.
predicted_weights = weights_imputed.flatten()
data["weights_imputed"] = np.where(y.notna(), y, predicted_weights)

In [None]:
# Tag every row with the same constant description of how `weights_imputed`
# was produced.
data["imputation_method"] = "Weight Imputation - Linear Regression with One-Hot Encoding"


In [None]:
# Save the frame, including the new columns, to imputed_data.csv in the
# working directory; index=False leaves the row index out of the file.
data.to_csv("imputed_data.csv", index=False)

In [None]:
# Status message only; it is printed unconditionally and does not verify the
# file that was written.
print("Weight imputation successful. Imputed data saved to imputed_data.csv.")

In [None]:
# Reload the saved file into a separate frame, `df`, used for the rest of the
# notebook.
df = pd.read_csv("./imputed_data.csv")

In [None]:
# Dimensions of the reloaded frame.
df.shape

In [None]:
# Count the encounters per readmission label in the reloaded frame.
df['readmitted'].value_counts()

In [None]:
# Bar chart of the readmission label counts in the reloaded frame.
readmit_counts = df['readmitted'].value_counts()
ax = sns.barplot(x=readmit_counts.index, y=readmit_counts)
plt.title('Class Distribution \n', size=12)
plt.xlabel('labels', size=12)
plt.ylabel('# of Readmitted', size=12)
plt.show()

In [None]:
# List the distinct readmission labels in the reloaded frame.
df['readmitted'].unique()

In [None]:
# Two-class target: the labels '<30' and '>30' count as readmitted ('Yes');
# any other label counts as not readmitted ('No').

def check_label(text):
    """Map a readmission label to 'Yes' ('<30' or '>30') or 'No' (anything else)."""
    if text != '>30' and text != '<30':
        return 'No'
    return 'Yes'
    
# Derive the two-class target column on the reloaded frame.
df['readmitted_2'] = df['readmitted'].map(check_label)

In [None]:
# Count plot of the two-class readmission target in the reloaded frame.
ax = sns.countplot(data=df, x='readmitted_2')
plt.title('Distribution of Readmission Class  \n\n', size=12)
plt.xlabel('Readmitted', size=12)
plt.ylabel('Count', size=12)
plt.xticks(rotation=90, size=12)
plt.show()

In [None]:
# Remove the raw `weight` bin column; the numeric `weights_imputed` column
# stays in the frame.
df.drop(columns='weight', inplace=True)

In [None]:
# Frequency of each payer code value.
df['payer_code'].value_counts()

In [None]:
# Remove the `medical_specialty` column from the modelling frame.
df.drop(columns='medical_specialty', inplace=True)

In [None]:
# Remove the bookkeeping column, the payer code and a set of medication
# columns that are not used as model inputs.
unused_columns = [
    'acetohexamide', 'imputation_method', 'payer_code', 'tolbutamide',
    'troglitazone', 'tolazamide', 'examide', 'citoglipton',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Dimensions after dropping the unused columns.
df.shape

In [None]:
# Drop every encounter that lacks any of the three diagnosis codes.
# The raw file marks a missing code with '?', but by this point '?' has
# already been converted to NaN (and NaN survives the CSV round trip), so a
# comparison against "?" alone matches nothing. Both forms are handled here.
diag_columns = ['diag_1', 'diag_2', 'diag_3']
missing_diag = df[diag_columns].isna() | df[diag_columns].eq("?")
# .copy() gives an independent frame, so later column assignments do not act
# on a slice of the unfiltered data.
df = df[~missing_diag.any(axis=1)].copy()

In [None]:
# Dimensions after filtering on the diagnosis columns.
df.shape

# Transform Categorical Features

In [None]:
# One shared LabelEncoder instance; it is re-fitted each time it is applied
# to a column further down.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()

In [None]:
# Columns replaced by integer codes. LabelEncoder assigns the codes in sorted
# order of the distinct values of each column.
# NOTE(review): "weights_imputed" and "number_diagnoses" look numeric rather
# than categorical — confirm they are meant to be label-encoded.
categorical_features =['race', "weights_imputed",'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin',
       'glyburide-metformin', 'change', 'diabetesMed']

for i in categorical_features:
    column = df[i]
    # LabelEncoder rejects a column that mixes strings with NaN, so missing
    # values in text columns become an explicit "Missing" category first.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.fillna("Missing").astype(str)
    df[i] = le.fit_transform(column)

In [None]:
# Preview the frame after label encoding.
df.head()

Transform Label Columns

In [None]:
# Re-fit the shared encoder on the two-class target; `label` is bound to the
# object returned by le.fit().
label = le.fit(df['readmitted_2'])

In [None]:
# Encode the two-class target as integers. LabelEncoder sorts the class
# names, so 'No' -> 0 and 'Yes' -> 1.
df['readmitted_2_encoded'] = label.transform(df['readmitted_2'])

In [None]:
# Remove the two ID-like columns (presumably encounter and patient
# identifiers) and both text versions of the target; only the encoded target
# is kept.
# TODO(review): the original note here read "feature correlation to drop" —
# unclear whether further correlated features were meant to be removed.
df= df.drop(columns= ['encounter_id', 'patient_nbr', 'readmitted','readmitted_2'])

In [None]:
# Display the final modelling frame.
df

In [None]:
# List the remaining column names.
df.columns

In [None]:
# Separate the encoded target (Y) from the predictor columns (X).
target_column = 'readmitted_2_encoded'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Feature scaling: standardise every predictor column with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, including rows that
# end up in the test split.
from sklearn import preprocessing
scaled_X = preprocessing.StandardScaler().fit_transform(X)

In [None]:
# Split the dataset into training and testing sets (80 / 20, fixed seed).
# The split is done on the unscaled predictors and the scaler is fitted on the
# training rows only, so no statistics from the held-out rows leak into the
# features the model is trained on.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Confirm the sizes of the train/test feature and target splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit the model before predicting: `classifier` is not defined anywhere in the
# visible notebook, so predicting with it raised NameError. The settings match
# the `rf` random forest trained further down, so both evaluations describe
# the same model configuration.
classifier = RandomForestClassifier(n_estimators=450, max_depth=9, random_state=43)
classifier.fit(X_train, y_train)

# Predict the test set results
y_pred = classifier.predict(X_test)

In [None]:
# Evaluate the model performance on the test split: confusion matrix first,
# then overall accuracy.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# Random forest with 450 trees, each limited to depth 9; random_state is
# fixed so repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 450, max_depth=9, random_state=43)
rf.fit(X_train, y_train)

In [None]:
# Predict the encoded readmission class for the test rows.
rf_prediction =  rf.predict(X_test)

In [None]:
# classification_report is not among the notebook's imports, so calling it
# raised NameError; it is imported here, as other cells do for their own
# dependencies. target_names follow the encoded order (0 = 'No', 1 = 'Yes').
from sklearn.metrics import classification_report
print(classification_report(y_test, rf_prediction, target_names= ['Not Readmitted', 'Readmitted']))