In [76]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE,SMOTENC

# Importing the ML algorithms
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Importing accuracy metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [77]:
# Data Reading
Original_df=pd.read_csv("diabetic_data_uci.csv")

In [78]:
Copy_df = Original_df.copy()

In [79]:
Copy_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [80]:
# Code block to analyze the improtance of features
print(Copy_df.describe())

# Printing the data types and missing values
print(Copy_df.info())

# Getting the unique values for each column
print("\nUnique Values in Each Column:")
for col in Copy_df.columns:
    print(f"\nColumn: {col}")
    print(Copy_df[col].value_counts())

       encounter_id   patient_nbr  admission_type_id  \
count  1.017660e+05  1.017660e+05      101766.000000   
mean   1.652016e+08  5.433040e+07           2.024006   
std    1.026403e+08  3.869636e+07           1.445403   
min    1.252200e+04  1.350000e+02           1.000000   
25%    8.496119e+07  2.341322e+07           1.000000   
50%    1.523890e+08  4.550514e+07           1.000000   
75%    2.302709e+08  8.754595e+07           3.000000   
max    4.438672e+08  1.895026e+08           8.000000   

       discharge_disposition_id  admission_source_id  time_in_hospital  \
count             101766.000000        101766.000000     101766.000000   
mean                   3.715642             5.754437          4.395987   
std                    5.280166             4.064081          2.985108   
min                    1.000000             1.000000          1.000000   
25%                    1.000000             1.000000          2.000000   
50%                    1.000000             7.00000




1.   **Gender has Unknown/Invalid values as per our summary**
2.   **Missing values have been represented with ? which needs to be processed**


In [81]:
Copy_df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         



*   **The categorical values will have to be encoded for the model**
*   **Handling the null values by coverting them to NaN and then checking columns for more than required (30%) null values**



In [82]:
# replacing ? and invalid values with with Nan
Copy_df = Copy_df.replace(["?", "Unknown/Invalid"], np.nan)
# Checking for % of null values
(Copy_df.isna().sum()*100/len(Copy_df))[(Copy_df.isna().sum()*100/len(Copy_df))>30]

weight               96.858479
payer_code           39.557416
medical_specialty    49.082208
dtype: float64

**Dropping the columns Weight, Payer_Code and Medical_Speciality**



In [83]:
Copy_df=Copy_df.drop(['weight','payer_code','medical_specialty'],axis=1)

**Checking the percentage of null values**

In [84]:
null_percentage = (Copy_df.isna().sum() * 100 / len(Copy_df))
null_percentage = null_percentage[null_percentage > 0]
print(null_percentage)

race      2.233555
gender    0.002948
diag_1    0.020636
diag_2    0.351787
diag_3    1.398306
dtype: float64



*   All the columns do not have more than 3% missing values
*   There are only two options either imputing them or dropping them. Imputing however is not recommended in the healthcare setting. thus if the percentage of loss of dropping is less, we can decide to drop them
*   For, this we have to find the % loss for rows to see the impact of dropping them








In [85]:
(Copy_df.shape[0]-Copy_df.dropna(axis=0).shape[0])*100/Copy_df.dropna(axis=0).shape[0]

3.7877860726961203

**Since the loss is just 3% we can proceed with dropping these rows without losing too much information**

In [86]:
Copy_df.dropna(axis=0,inplace=True)

In [87]:
Copy_df.isna().sum()

encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol  

**Now there are no missing values in the dataset**

In [88]:
# Checking the distribution of the categorical variables 'examide' and 'citoglipton'
print(Copy_df['examide'].value_counts())
print(Copy_df['citoglipton'].value_counts())

# Since within the boundaries of our dataset both 'examide' and 'citoglipton' have only one category we will be dropping these columns as they don't add much value for prediction
Copy_df = Copy_df.drop(columns=['examide', 'citoglipton'])

No    98052
Name: examide, dtype: int64
No    98052
Name: citoglipton, dtype: int64


In [89]:
# Printing the df to verify
Copy_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,31,...,No,Steady,No,No,No,No,No,No,Yes,>30


Now we have the basic processed data to use for our model

In [90]:
Copy_df.to_csv('clean_data.csv', index=False)
files.download('clean_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Coming to the data dictionary,**
*   **The categories within 'admission type', 'discharge disposition', and 'admission source' are similar, so by merging these categories, we can decrease the number of columns created during the encoding process.**
*   **For the 'admission_type' column, similar categories are remapped as follows: 'Urgent' (2) and 'Trauma Center' (7) are combined into the 'Emergency' (1) category. Additionally, 'NULL' (6) and 'Not Mapped' (8) are consolidated into the 'Not Available' (5) category.**
*   **For the 'discharge_disposition' column, similar categories are remapped: e.g. 'Discharged/transferred to home with home health service' (6), 'Discharged/transferred to home under care of Home IV provider' (8) and 'Hospice / home' (13) are combined into the 'Discharged to home' (1) category.**
*   **For the 'admission_source' column, similar categories are remapped: e.g. 'Clinic Referral' (2) and 'HMO Referral' (3) are combined into the ' Physician Referral' (1) category.**

In [91]:
Copy_df = pd.read_csv('clean_data.csv')

In [92]:
def remap_admission_type(admission_type_id):
    if admission_type_id in [2, 7]:
        return 1  # Urgent and Trauma Center is mapped to Emergency
    elif admission_type_id in [6, 8]:
        return 5  # NULL and Not Mapped are mapped to Not Available
    else:
        return admission_type_id

Copy_df['admission_type_id'] = Copy_df['admission_type_id'].apply(remap_admission_type)


In [93]:
def remap_discharge_disposition(discharge_disposition_id):
    if discharge_disposition_id in [6, 8, 13]:
        return 1  # Merging categories into 1
    elif discharge_disposition_id in [9, 12, 15, 16, 17]:
        return 5  # Merging categories into 5
    elif discharge_disposition_id in [3, 4, 5, 14, 22, 23, 24]:
        return 2  # Merging categories into 2
    elif discharge_disposition_id in [25, 26]:
        return 18  # Merging categories into 18
    else:
        return discharge_disposition_id

Copy_df['discharge_disposition_id'] = Copy_df['discharge_disposition_id'].apply(remap_discharge_disposition)


In [94]:
def remap_admission_source(admission_source_id):
    if admission_source_id in [2, 3]:
        return 1  # Merging categories into 1
    elif admission_source_id in [5, 6, 10, 22, 25]:
        return 4  # Merging categories into 4
    elif admission_source_id in [15, 17, 20, 21]:
        return 9  # Merging categories into 9
    elif admission_source_id in [13, 14]:
        return 11  # Merging categories into 11
    else:
        return admission_source_id

Copy_df['admission_source_id'] = Copy_df['admission_source_id'].apply(remap_admission_source)


**Encoding of variables**

In [95]:
# Using replace function to combine categories and encode them to numerical values
Copy_df['change'] = Copy_df['change'].replace({'Ch': 1, 'No': 0})
Copy_df['gender'] = Copy_df['gender'].replace({'Male': 1, 'Female': 0})
Copy_df['diabetesMed'] = Copy_df['diabetesMed'].replace({'Yes': 1, 'No': 0})

Ecoding to combine categories

In [96]:
medicine_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
Copy_df[medicine_columns] = Copy_df[medicine_columns].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

**Revising the variables in test result columns such as 'A1cresult' and 'max_glu_serum', where the 'None' value is assigned a negative numerical representation for differentiation purposes.**

In [97]:
Copy_df['A1Cresult'] = Copy_df['A1Cresult'].replace({'>7': 1, '>8': 1, 'Norm': 0, 'None': -100})
Copy_df['max_glu_serum'] = Copy_df['max_glu_serum'].replace({'>200': 1, '>300': 1, 'Norm': 0, 'None': -100})

**Age is given as a range. Thus taking midpoints to replace it as a single value to see patterns**



In [98]:
Copy_df.age.value_counts()

[70-80)     25305
[60-70)     21809
[80-90)     16702
[50-60)     16697
[40-50)      9265
[30-40)      3548
[90-100)     2717
[20-30)      1478
[10-20)       466
[0-10)         65
Name: age, dtype: int64

In [99]:
age_midpoints = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95
}
Copy_df['age'] = Copy_df['age'].replace(age_midpoints)


**Finally, we encode the target variable as we are looking to deal with a binary classification problem. thus we will consider just two categories i.e., <= 30 days is readmitted while >30 and NO are not readmitted**

In [100]:
# Encoding the Target Variable using a lambda function
Copy_df['readmitted'] = Copy_df['readmitted'].apply(lambda x: 0 if x in ['>30', 'NO'] else 1)

**Processing the diagnosis variable by grouping them into specific diagnostic categories, thereby streamlining the number of variables produced for each diagnostic field.**

In [101]:
# Use the next block as another way
'''
def categorize_diagnosis(x):
    x = str(x)
    if "V" in x or "E" in x:
        return "Other"
    elif "250" in x:
        return "Diabetes"
    try:
        x = int(float(x))
        if 390 <= x <= 459 or x == 785:
            return "Circulatory"
        elif 460 <= x <= 519 or x == 786:
            return "Respiratory"
        elif 520 <= x <= 579 or x == 787:
            return "Digestive"
        elif 580 <= x <= 629 or x == 788:
            return "Genitourinary"
        elif 140 <= x <= 239:
            return "Neoplasms"
        elif 710 <= x <= 739:
            return "Musculoskeletal"
        elif 800 <= x <= 999:
            return "Injury"
    except ValueError:
        pass
    return "Other"

# Apply the categorize_diagnosis function to the 'diag_1' column
Copy_df['diag_cat'] = Copy_df['diag_1'].apply(categorize_diagnosis)

# Drop the 'diag_2' and 'diag_3' columns
Copy_df = Copy_df.drop(['diag_2', 'diag_3'], axis=1)
'''

'\ndef categorize_diagnosis(x):\n    x = str(x)\n    if "V" in x or "E" in x:\n        return "Other"\n    elif "250" in x:\n        return "Diabetes"\n    try:\n        x = int(float(x))\n        if 390 <= x <= 459 or x == 785:\n            return "Circulatory"\n        elif 460 <= x <= 519 or x == 786:\n            return "Respiratory"\n        elif 520 <= x <= 579 or x == 787:\n            return "Digestive"\n        elif 580 <= x <= 629 or x == 788:\n            return "Genitourinary"\n        elif 140 <= x <= 239:\n            return "Neoplasms"\n        elif 710 <= x <= 739:\n            return "Musculoskeletal"\n        elif 800 <= x <= 999:\n            return "Injury"\n    except ValueError:\n        pass\n    return "Other"\n\n# Apply the categorize_diagnosis function to the \'diag_1\' column\nCopy_df[\'diag_cat\'] = Copy_df[\'diag_1\'].apply(categorize_diagnosis)\n\n# Drop the \'diag_2\' and \'diag_3\' columns\nCopy_df = Copy_df.drop([\'diag_2\', \'diag_3\'], axis=1)\n'

In [102]:
# Define the function to check for diabetes in three columns of a DataFrame
def combined_diagnosis(row):
    # Define the inner function to check a single diagnosis code
    def is_diabetes(code):
        code = str(code)
        return '250' in code  # Assuming '250' is the code for diabetes

    # Check each of the three columns for diabetes
    for col in ['diag_1', 'diag_2', 'diag_3']:
        if is_diabetes(row[col]):
            return 'Diabetes'

    # If none of the columns contain '250', return 'Other'
    return 'Other'

# Apply the function to each row of the DataFrame
Copy_df['diag_cat'] = Copy_df.apply(combined_diagnosis, axis=1)
Copy_df = Copy_df.drop(['diag_1', 'diag_2', 'diag_3'], axis=1)

**For the final preprocessed data, we will do One Hot Encoding**

In [103]:
# Using get_dummies to generate encoded columns for specified categories in Copy_df
Copy_df = pd.get_dummies(Copy_df, columns=['gender', 'admission_type_id', 'discharge_disposition_id',
                                           'admission_source_id', 'max_glu_serum', 'A1Cresult', 'diag_cat'], drop_first=True)

# Creating dummy variables for 'race' and concatenate them with Copy_df
race_dummies = pd.get_dummies(Copy_df['race'])
Copy_df = pd.concat([Copy_df, race_dummies], axis=1)
Copy_df.drop(['race'], inplace=True, axis=1)

# Displaying the first few rows
print(Copy_df.head())

# Dropping duplicates based on 'patient_nbr', keeping the first occurrence
Copy_df = Copy_df.drop_duplicates(subset='patient_nbr', keep='first')


   encounter_id  patient_nbr  age  time_in_hospital  num_lab_procedures  \
0        149190     55629189   15                 3                  59   
1         64410     86047875   25                 2                  11   
2        500364     82442376   35                 2                  44   
3         16680     42519267   45                 1                  51   
4         35754     82637451   55                 3                  31   

   num_procedures  num_medications  number_outpatient  number_emergency  \
0               0               18                  0                 0   
1               5               13                  2                 0   
2               1               16                  0                 0   
3               0                8                  0                 0   
4               6               16                  0                 0   

   number_inpatient  ...  max_glu_serum_0  max_glu_serum_1  A1Cresult_0  \
0                 0  ..

In [104]:
Copy_df.to_csv('final_data.csv', index=False)
files.download('final_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>