In [846]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [847]:
# Problem Statement: To predict early re-admission <30 days when HbA1C is tested

dataset = 'diabetic_data_initial.csv'
df = pd.read_csv(dataset)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [848]:
# total rows in dataframe
total_rows = len(df)
total_rows

101766

In [849]:
# check if any columns with null values
df.isnull().sum()

encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
weight                      0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
payer_code                  0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazo

In [850]:
num_of_rows_with_question_mark = df.loc[(df["weight"] == "?")]

In [851]:
# percent of data in weight column with ? mark
(len(num_of_rows_with_question_mark)/total_rows)*100

96.85847925633315

In [852]:
# drop weight and payer_code columns
df = df.drop(columns=['weight', 'payer_code'])

In [853]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,?,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,?,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,?,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,?,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [854]:
# rows containing admission_type_id 5 or 6, which correspond to "Not Available" or "NULL" respectively
rows_to_be_dropped = len (df.loc[(df["admission_type_id"] == 5) | (df["admission_type_id"] == 6), :])

In [855]:
# percent of rows to be dropped
(rows_to_be_dropped/total_rows)*100

9.90114576577639

In [856]:
# drop rows with admission_type_id 5 or 6
df.drop(df[(df['admission_type_id'] == 5) | (df["admission_type_id"] == 6)].index, inplace = True) 
len(df)

91690

In [857]:
# drop the data where patient expired
df.drop(df[df["discharge_disposition_id"] == 11].index, inplace = True)

In [858]:
# replace medical_speciality with "?" to "missing"
df['medical_specialty'] = df['medical_specialty'].replace({'?': 'missing'})
df['race'] = df['race'].replace({'?': 'missing'})
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,missing,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [859]:
# drop diagnosis with "?"
df.drop(df[(df["diag_1"] == "?") | (df["diag_2"] == "?") | (df["diag_3"] == "?")].index, inplace = True)

In [860]:
# Retain only the data where patient was in the hospital for <= 14 days
df_filtered = df[df['time_in_hospital'] <= 14]
df_filtered.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,missing,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [861]:
len(df_filtered)

88844

In [862]:
# Generate our categorical variable list
cat_list = df_filtered.dtypes[df_filtered.dtypes == "object"].index.tolist()
cat_list

['race',
 'gender',
 'age',
 'medical_specialty',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

In [863]:
df_filtered['race'].value_counts()

Caucasian          65803
AfricanAmerican    17533
missing             2037
Hispanic            1611
Other               1310
Asian                550
Name: race, dtype: int64

In [864]:
df_filtered['race'] = df_filtered['race'].replace({'missing': 'Other'})

In [865]:
df_filtered['race'].value_counts()

Caucasian          65803
AfricanAmerican    17533
Other               3347
Hispanic            1611
Asian                550
Name: race, dtype: int64

In [866]:
df_filtered['gender'].value_counts()

Female             47897
Male               40944
Unknown/Invalid        3
Name: gender, dtype: int64

In [867]:
df_filtered.drop(df_filtered[(df_filtered['gender'] == "Unknown/Invalid")].index, inplace = True)
df_filtered['gender'].value_counts()

Female    47897
Male      40944
Name: gender, dtype: int64

In [868]:
df_filtered['readmitted'].value_counts()

NO     47395
>30    31271
<30    10175
Name: readmitted, dtype: int64

In [869]:
df_filtered['readmitted'] = df_filtered['readmitted'].replace({'>30': 'NO'})

In [870]:
df_filtered['readmitted'].value_counts()

NO     78666
<30    10175
Name: readmitted, dtype: int64

In [871]:
df_filtered['medical_specialty'].value_counts()

missing                   44391
InternalMedicine          12696
Emergency/Trauma           7366
Family/GeneralPractice     5736
Cardiology                 4330
                          ...  
Perinatology                  1
Proctology                    1
Psychiatry-Addictive          1
Dermatology                   1
Neurophysiology               1
Name: medical_specialty, Length: 71, dtype: int64

In [872]:
df_filtered['diag_1'].value_counts()

428    6098
414    6016
786    3507
410    3164
486    3013
       ... 
919       1
V67       1
98        1
698       1
V26       1
Name: diag_1, Length: 709, dtype: int64

In [873]:
df_filtered['change'].value_counts()

No    47482
Ch    41359
Name: change, dtype: int64

In [874]:
import re

# function to clean 'age' column
def parse_age_range(age_col):
    c=[]
    for values in age_col:
        s = re.sub('[[)]','', values)
        c.append(s)
    return c

In [875]:
# replace 'age' values with cleaned values
df_filtered['age'] = parse_age_range(df_filtered['age'].values)

In [876]:
df_filtered.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,Caucasian,Female,10-20,1,1,7,3,missing,...,No,Up,No,No,No,No,No,Ch,Yes,NO
2,64410,86047875,AfricanAmerican,Female,20-30,1,1,7,2,missing,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,30-40,1,1,7,2,missing,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,40-50,1,1,7,1,missing,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,50-60,2,1,2,3,missing,...,No,Steady,No,No,No,No,No,No,Yes,NO


In [877]:
df_filtered.to_csv("filtered_data.csv")

In [878]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [879]:
# function to apply label encoding
def my_encoder(cols):
    for c in cols:
        le.fit(df_filtered[c])
        df_filtered[c] = le.transform(df_filtered[c])

In [880]:
my_encoder(['race',
 'gender',
 'age',
 'medical_specialty',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted'])

In [881]:
encoded_df = df_filtered
encoded_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,2,0,1,1,1,7,3,70,...,0,3,1,0,0,0,0,0,1,1
2,64410,86047875,0,0,2,1,1,7,2,70,...,0,1,1,0,0,0,0,1,1,1
3,500364,82442376,2,1,3,1,1,7,2,70,...,0,3,1,0,0,0,0,0,1,1
4,16680,42519267,2,1,4,1,1,7,1,70,...,0,2,1,0,0,0,0,0,1,1
5,35754,82637451,2,1,5,2,1,2,3,70,...,0,2,1,0,0,0,0,1,1,1


In [882]:
encoded_df.to_csv("encoded_data.csv")

In [883]:
y = encoded_df["readmitted"]
X = encoded_df.drop(columns=['readmitted'])

In [884]:
from sklearn.model_selection import train_test_split

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [885]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)

In [886]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [887]:
y_pred = classifier.predict(X_test)

In [888]:
d = {'Predicted': y_pred, 'Actual': y_test}
check_df = pd.DataFrame(data=d)
check_df.head()

Unnamed: 0,Predicted,Actual
50234,1,1
90947,1,1
25376,1,1
66248,1,1
26206,1,1


In [889]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.885462158389987


In [890]:
# Preprocess numerical data for neural network

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [891]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.885


In [798]:
# # Define the basic neural network model
# nn_model = tf.keras.models.Sequential()
# nn_model.add(tf.keras.layers.Dense(units=90, activation="relu", input_dim=45))
# nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Compile the Sequential model together and customize metrics
# nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Train the model
# fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# # Evaluate the model using the test data
# model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")