<a href="https://colab.research.google.com/github/omarehab2110/Hospital-Readmission-Prediction/blob/main/Project_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import joblib
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import subprocess
import re
from termcolor import colored
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve


#Reading and Describing Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df_Of_Diabetic_Data=pd.read_csv('/content/drive/MyDrive/diabetic_data.csv')
# Read specific sections of the CSV file
admission_type_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', nrows=9, names=['id', 'description'])

# Skip to discharge_disposition_id section (lines 11-40 in the file)
discharge_disposition_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', skiprows=10, nrows=30, names=['id', 'description'])

# Skip to admission_source_id section (lines 42-67 in the file)
admission_source_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', skiprows=41, nrows=26, names=['id', 'description'])


###Mapping

In [4]:
# --- Step 1: Read mapping files and prepare 'id' column ---
# Read admission_type_mapping, treat 'id' as string and strip whitespace
admission_type_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', nrows=9, names=['id', 'description'], dtype={'id': str})
admission_type_mapping['id'] = admission_type_mapping['id'].str.strip()

# Read discharge_disposition_mapping, treat 'id' as string and strip whitespace
discharge_disposition_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', skiprows=10, nrows=30, names=['id', 'description'], dtype={'id': str})
discharge_disposition_mapping['id'] = discharge_disposition_mapping['id'].str.strip()

# Read admission_source_mapping, treat 'id' as string and strip whitespace
admission_source_mapping = pd.read_csv('/content/drive/MyDrive/IDs_mapping.csv', skiprows=41, nrows=26, names=['id', 'description'], dtype={'id': str})
admission_source_mapping['id'] = admission_source_mapping['id'].str.strip()

# --- Step 2: Prepare ID columns in the main DataFrame ---
# Treat ID columns in df_Of_Diabetic_Data as string and strip whitespace
df_Of_Diabetic_Data['admission_type_id_str'] = df_Of_Diabetic_Data['admission_type_id'].astype(str).str.strip()
df_Of_Diabetic_Data['discharge_disposition_id_str'] = df_Of_Diabetic_Data['discharge_disposition_id'].astype(str).str.strip()
df_Of_Diabetic_Data['admission_source_id_str'] = df_Of_Diabetic_Data['admission_source_id'].astype(str).str.strip()

# --- Step 3: Attempt to convert stripped string IDs to integers for mapping ---
# Use a try-except block for robust conversion
try:
    admission_type_mapping['id_int'] = admission_type_mapping['id'].astype(int)
    discharge_disposition_mapping['id_int'] = discharge_disposition_mapping['id'].astype(int)
    admission_source_mapping['id_int'] = admission_source_mapping['id'].astype(int)

    df_Of_Diabetic_Data['admission_type_id_int'] = df_Of_Diabetic_Data['admission_type_id_str'].astype(int)
    df_Of_Diabetic_Data['discharge_disposition_id_int'] = df_Of_Diabetic_Data['discharge_disposition_id_str'].astype(int)
    df_Of_Diabetic_Data['admission_source_id_int'] = df_Of_Diabetic_Data['admission_source_id_str'].astype(int)

    # Use the integer columns for mapping
    map_admission_type = admission_type_mapping.set_index('id_int')['description']
    map_discharge_disposition = discharge_disposition_mapping.set_index('id_int')['description']
    map_admission_source = admission_source_mapping.set_index('id_int')['description']

    # --- Step 4: Perform the mapping ---
    df_Of_Diabetic_Data['admission_type'] = df_Of_Diabetic_Data['admission_type_id_int'].map(map_admission_type)
    df_Of_Diabetic_Data['discharge_disposition'] = df_Of_Diabetic_Data['discharge_disposition_id_int'].map(map_discharge_disposition)
    df_Of_Diabetic_Data['admission_source'] = df_Of_Diabetic_Data['admission_source_id_int'].map(map_admission_source)

    print("Mapping performed using integer IDs.")

except ValueError as e:
    print(f"Could not convert stripped string IDs to integer. Falling back to string mapping. Error: {e}")

    # Fallback to mapping using stripped string IDs if integer conversion fails
    map_admission_type = admission_type_mapping.set_index('id')['description']
    map_discharge_disposition = discharge_disposition_mapping.set_index('id')['description']
    map_admission_source = admission_source_mapping.set_index('id')['description']

    df_Of_Diabetic_Data['admission_type'] = df_Of_Diabetic_Data['admission_type_id_str'].map(map_admission_type)
    df_Of_Diabetic_Data['discharge_disposition'] = df_Of_Diabetic_Data['discharge_disposition_id_str'].map(map_discharge_disposition)
    df_Of_Diabetic_Data['admission_source'] = df_Of_Diabetic_Data['admission_source_id_str'].map(map_admission_source)

    print("Mapping performed using stripped string IDs.")

# --- Step 5: Check for any remaining unmatched IDs after mapping ---
# Check for NaN values in the newly mapped columns
print("\nNumber of NaN values in mapped columns after mapping:")
print("admission_type:", df_Of_Diabetic_Data['admission_type'].isnull().sum())
print("discharge_disposition:", df_Of_Diabetic_Data['discharge_disposition'].isnull().sum())
print("admission_source:", df_Of_Diabetic_Data['admission_source'].isnull().sum())

# Display the head to see the mapped columns
print("\nHead of mapped columns:")
print(df_Of_Diabetic_Data[['admission_type', 'discharge_disposition', 'admission_source']].head(20))

#  drop the temporary string and integer ID columns
df_Of_Diabetic_Data = df_Of_Diabetic_Data.drop(['admission_type_id_str', 'discharge_disposition_id_str', 'admission_source_id_str', 'admission_type_id_int', 'discharge_disposition_id_int', 'admission_source_id_int','admission_type_id',	'discharge_disposition_id',	'admission_source_id'], axis=1, errors='ignore')

Could not convert stripped string IDs to integer. Falling back to string mapping. Error: invalid literal for int() with base 10: 'admission_type_id'
Mapping performed using stripped string IDs.

Number of NaN values in mapped columns after mapping:
admission_type: 5291
discharge_disposition: 3691
admission_source: 6781

Head of mapped columns:
   admission_type                              discharge_disposition  \
0             NaN                                         Not Mapped   
1       Emergency                                 Discharged to home   
2       Emergency                                 Discharged to home   
3       Emergency                                 Discharged to home   
4       Emergency                                 Discharged to home   
5          Urgent                                 Discharged to home   
6        Elective                                 Discharged to home   
7       Emergency                                 Discharged to home   
8     

In [5]:
print(df_Of_Diabetic_Data.shape)
print(admission_type_mapping.shape)
print(discharge_disposition_mapping.shape)
print(admission_source_mapping.shape)

(101766, 50)
(9, 2)
(30, 2)
(26, 2)


In [6]:
df_Of_Diabetic_Data.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source
0,2278392,8222157,Caucasian,Female,[0-10),?,1,?,Pediatrics-Endocrinology,41,...,No,No,No,No,No,No,NO,,Not Mapped,Physician Referral
1,149190,55629189,Caucasian,Female,[10-20),?,3,?,?,59,...,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),?,2,?,?,11,...,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),?,2,?,?,44,...,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),?,1,?,?,51,...,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
5,35754,82637451,Caucasian,Male,[50-60),?,3,?,?,31,...,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral
6,55842,84259809,Caucasian,Male,[60-70),?,4,?,?,70,...,No,No,No,No,Ch,Yes,NO,Elective,Discharged to home,Clinic Referral
7,63768,114882984,Caucasian,Male,[70-80),?,5,?,?,73,...,No,No,No,No,No,Yes,>30,Emergency,Discharged to home,Emergency Room
8,12522,48330783,Caucasian,Female,[80-90),?,13,?,?,68,...,No,No,No,No,Ch,Yes,NO,Urgent,Discharged to home,Transfer from a hospital
9,15738,63555939,Caucasian,Female,[90-100),?,12,?,InternalMedicine,33,...,No,No,No,No,Ch,Yes,NO,Elective,Discharged/transferred to SNF,Transfer from a hospital


In [7]:
df_Of_Diabetic_Data.isnull().sum()

Unnamed: 0,0
encounter_id,0
patient_nbr,0
race,0
gender,0
age,0
weight,0
time_in_hospital,0
payer_code,0
medical_specialty,0
num_lab_procedures,0


In [8]:
df_Of_Diabetic_Data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_type', 'discharge_disposition', 'admission_source'],
      dtype='object')

In [9]:
df_Of_Diabetic_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   time_in_hospital          101766 non-null  int64 
 7   payer_code                101766 non-null  object
 8   medical_specialty         101766 non-null  object
 9   num_lab_procedures        101766 non-null  int64 
 10  num_procedures            101766 non-null  int64 
 11  num_medications           101766 non-null  int64 
 12  number_outpatient         101766 non-null  int64 
 13  number_emergency          101766 non-null  int64 
 14  numb

In [10]:
df_Of_Diabetic_Data.describe()

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [11]:
df_Of_Diabetic_Data.duplicated().sum()

np.int64(0)

In [12]:
# Apply the transformation and assign it back to the 'readmitted' column
df_Of_Diabetic_Data['readmitted'] = df_Of_Diabetic_Data['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

# Now the 'readmitted' column contains the binary values (0 or 1)
print(df_Of_Diabetic_Data['readmitted'].value_counts())

readmitted
0    90409
1    11357
Name: count, dtype: int64


In [13]:
for col in df_Of_Diabetic_Data.columns:
    # Define the list of values to count
    values_to_count = ['?', 'NAN', "Unknown/Invalid", 'Not Mapped', 'NULL', 'Not Available']

    # Count the number of occurrences of any of the values in the list

    count = df_Of_Diabetic_Data[col].isin(values_to_count).sum()
    df_Of_Diabetic_Data[col].replace(values_to_count, np.nan, inplace=True)

    # Print the count for each column
    if count > 0:
     print(f"Column '{col}': {count} problematic values")

Column 'race': 2273 problematic values
Column 'gender': 3 problematic values
Column 'weight': 98569 problematic values
Column 'payer_code': 40256 problematic values
Column 'medical_specialty': 49949 problematic values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_Of_Diabetic_Data[col].replace(values_to_count, np.nan, inplace=True)


Column 'diag_1': 21 problematic values
Column 'diag_2': 358 problematic values
Column 'diag_3': 1423 problematic values
Column 'admission_type': 5105 problematic values
Column 'discharge_disposition': 989 problematic values


#Data preprocessing

###Dropping and Handling Nulls

In [14]:
# removing columns that have a lot of nulls as having >50% of the data
df_Of_Diabetic_Data.drop(['weight', 'payer_code', 'medical_specialty','max_glu_serum' , 'A1Cresult'], axis=1, inplace=True)

# deleting ncounter_id	patient_nbr	as no relation between them and the readmitted
df_Of_Diabetic_Data.drop(['encounter_id','patient_nbr'], axis=1, inplace=True)

df_Of_Diabetic_Data


Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type,discharge_disposition,admission_source
0,Caucasian,Female,[0-10),1,41,0,1,0,0,0,...,No,No,No,No,No,No,0,,,Physician Referral
1,Caucasian,Female,[10-20),3,59,0,18,0,0,0,...,No,No,No,No,Ch,Yes,0,Emergency,Discharged to home,Emergency Room
2,AfricanAmerican,Female,[20-30),2,11,5,13,2,0,1,...,No,No,No,No,No,Yes,0,Emergency,Discharged to home,Emergency Room
3,Caucasian,Male,[30-40),2,44,1,16,0,0,0,...,No,No,No,No,Ch,Yes,0,Emergency,Discharged to home,Emergency Room
4,Caucasian,Male,[40-50),1,51,0,8,0,0,0,...,No,No,No,No,Ch,Yes,0,Emergency,Discharged to home,Emergency Room
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),3,51,0,16,0,0,0,...,No,No,No,No,Ch,Yes,0,Emergency,Discharged/transferred to SNF,Emergency Room
101762,AfricanAmerican,Female,[80-90),5,33,3,18,0,0,1,...,No,No,No,No,No,Yes,0,Emergency,Discharged/transferred to ICF,Transfer from a Skilled Nursing Facility (SNF)
101763,Caucasian,Male,[70-80),1,53,0,9,1,0,0,...,No,No,No,No,Ch,Yes,0,Emergency,Discharged to home,Emergency Room
101764,Caucasian,Female,[80-90),10,45,2,21,0,0,1,...,No,No,No,No,Ch,Yes,0,Urgent,Discharged/transferred to SNF,Emergency Room


In [15]:
for col in df_Of_Diabetic_Data.columns:
    if df_Of_Diabetic_Data[col].isnull().any():
        if df_Of_Diabetic_Data[col].dtype == 'object':
            df_Of_Diabetic_Data[col] = df_Of_Diabetic_Data[col].fillna(df_Of_Diabetic_Data[col].mode()[0])
        else:
            df_Of_Diabetic_Data[col] = df_Of_Diabetic_Data[col].fillna(df_Of_Diabetic_Data[col].median())


In [16]:
df_Of_Diabetic_Data.isnull().sum()

Unnamed: 0,0
race,0
gender,0
age,0
time_in_hospital,0
num_lab_procedures,0
num_procedures,0
num_medications,0
number_outpatient,0
number_emergency,0
number_inpatient,0


###Encoding

In [17]:

label_enc = LabelEncoder()

# Select only categorical columns
categorical_cols = df_Of_Diabetic_Data.select_dtypes(include='object').columns.tolist()

# Split into binary, safe multi-category, and high-cardinality
binary_cols = [col for col in categorical_cols if df_Of_Diabetic_Data[col].nunique() == 2]
safe_multiclass_cols = [col for col in categorical_cols if 3 <= df_Of_Diabetic_Data[col].nunique() <= 20]

# 1. Label encode binary columns
for col in binary_cols:
    df_Of_Diabetic_Data[col] = label_enc.fit_transform(df_Of_Diabetic_Data[col])

# 2. One-hot encode only safe multiclass columns
df_Of_Diabetic_Data = pd.get_dummies(df_Of_Diabetic_Data, columns=safe_multiclass_cols, drop_first=True)
## Very Simple Convert Boolean Values to Integers

# Identify columns that are of boolean type
boolean_cols = df_Of_Diabetic_Data.select_dtypes(include=bool).columns

# Convert boolean columns to integers (True=1, False=0)
for col in boolean_cols:
    df_Of_Diabetic_Data[col] = df_Of_Diabetic_Data[col].astype(int)
    print(f"Converted boolean column '{col}' to integers (0/1).")

# Display data types to verify
print("\nData types after converting boolean columns:")
print(df_Of_Diabetic_Data.info())


Converted boolean column 'race_Asian' to integers (0/1).
Converted boolean column 'race_Caucasian' to integers (0/1).
Converted boolean column 'race_Hispanic' to integers (0/1).
Converted boolean column 'race_Other' to integers (0/1).
Converted boolean column 'age_[10-20)' to integers (0/1).
Converted boolean column 'age_[20-30)' to integers (0/1).
Converted boolean column 'age_[30-40)' to integers (0/1).
Converted boolean column 'age_[40-50)' to integers (0/1).
Converted boolean column 'age_[50-60)' to integers (0/1).
Converted boolean column 'age_[60-70)' to integers (0/1).
Converted boolean column 'age_[70-80)' to integers (0/1).
Converted boolean column 'age_[80-90)' to integers (0/1).
Converted boolean column 'age_[90-100)' to integers (0/1).
Converted boolean column 'metformin_No' to integers (0/1).
Converted boolean column 'metformin_Steady' to integers (0/1).
Converted boolean column 'metformin_Up' to integers (0/1).
Converted boolean column 'repaglinide_No' to integers (0/1).


In [18]:
df_Of_Diabetic_Data

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_ Sick Baby,admission_source_ Transfer from Ambulatory Surgery Center,admission_source_ Transfer from a Skilled Nursing Facility (SNF),admission_source_ Transfer from another health care facility,admission_source_ Transfer from critial access hospital,admission_source_ Transfer from hospital inpt/same fac reslt in a sep claim,admission_source_Clinic Referral,admission_source_HMO Referral,admission_source_Normal Delivery,admission_source_Transfer from a hospital
0,0,1,41,0,1,0,0,0,250.83,276,...,0,0,0,0,0,0,0,0,0,0
1,0,3,59,0,18,0,0,0,276,250.01,...,0,0,0,0,0,0,0,0,0,0
2,0,2,11,5,13,2,0,1,648,250,...,0,0,0,0,0,0,0,0,0,0
3,1,2,44,1,16,0,0,0,8,250.43,...,0,0,0,0,0,0,0,0,0,0
4,1,1,51,0,8,0,0,0,197,157,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,3,51,0,16,0,0,0,250.13,291,...,0,0,0,0,0,0,0,0,0,0
101762,0,5,33,3,18,0,0,1,560,276,...,0,0,1,0,0,0,0,0,0,0
101763,1,1,53,0,9,1,0,0,38,590,...,0,0,0,0,0,0,0,0,0,0
101764,0,10,45,2,21,0,0,1,996,285,...,0,0,0,0,0,0,0,0,0,0


###Normalize using MinMax

In [19]:
scaler = MinMaxScaler()
diag_cols = ['diag_1', 'diag_2', 'diag_3']

for col in diag_cols:
    # Attempt to convert the column to numeric.
    # 'errors='coerce'' will turn any values that cannot be converted into NaN.
    df_Of_Diabetic_Data[col] = pd.to_numeric(df_Of_Diabetic_Data[col], errors='coerce')

# Select only numerical columns
numerical_cols = df_Of_Diabetic_Data.select_dtypes(include=['int64', 'float64']).drop('readmitted', axis=1).columns
# Exclude the target variable 'readmitted' if it's in the list
if 'readmitted' in numerical_cols:
    numerical_cols.remove('readmitted')

# Apply Min-Max scaling to the numerical columns
df_Of_Diabetic_Data[numerical_cols] = scaler.fit_transform(df_Of_Diabetic_Data[numerical_cols])

# Display the head to see the normalized col
df_Of_Diabetic_Data.head(10)

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_ Sick Baby,admission_source_ Transfer from Ambulatory Surgery Center,admission_source_ Transfer from a Skilled Nursing Facility (SNF),admission_source_ Transfer from another health care facility,admission_source_ Transfer from critial access hospital,admission_source_ Transfer from hospital inpt/same fac reslt in a sep claim,admission_source_Clinic Referral,admission_source_HMO Referral,admission_source_Normal Delivery,admission_source_Transfer from a hospital
0,0.0,0.0,0.305344,0.0,0.0,0.0,0.0,0.0,0.248825,0.272636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.153846,0.442748,0.0,0.2125,0.0,0.0,0.0,0.274096,0.246489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.047619,0.64759,0.246479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.00502,0.246911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.194779,0.152918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.153846,0.229008,1.0,0.1875,0.0,0.0,0.0,0.412651,0.408451,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,1.0,0.230769,0.526718,0.166667,0.25,0.0,0.0,0.0,0.412651,0.408451,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.307692,0.549618,0.0,0.1375,0.0,0.0,0.0,0.426707,0.48994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.923077,0.51145,0.333333,0.3375,0.0,0.0,0.0,0.396586,0.424547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.0,0.846154,0.244275,0.5,0.2125,0.0,0.0,0.0,0.432731,0.194165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
df_Of_Diabetic_Data.to_csv('cleaned_diabetic_data.csv', index=False)


### Handling outliers

In [21]:
## More Nuanced Outlier Handling (Example: Removing outliers in a specific column)

numerical_cols_for_outliers = df_Of_Diabetic_Data.select_dtypes(include=np.number).columns.tolist()
if 'readmitted' in numerical_cols_for_outliers:
    numerical_cols_for_outliers.remove('readmitted')

# Define columns where you might want to remove outliers (example)
cols_to_remove_outliers_from = ['time_in_hospital'] # Example: Assuming extreme hospital stays might be errors

print("Handling Outliers using IQR:")

for col in numerical_cols_for_outliers:
    Q1 = df_Of_Diabetic_Data[col].quantile(0.25)
    Q3 = df_Of_Diabetic_Data[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df_Of_Diabetic_Data[(df_Of_Diabetic_Data[col] < lower_bound) | (df_Of_Diabetic_Data[col] > upper_bound)]
    outlier_count = outliers.shape[0]

    if outlier_count > 0:
        print(f"Column '{col}': Found {outlier_count} outliers.")

        if col in cols_to_remove_outliers_from:
            # Remove rows if the column is in the 'remove' list
            initial_shape = df_Of_Diabetic_Data.shape
            df_Of_Diabetic_Data = df_Of_Diabetic_Data.drop(outliers.index)
            print(f"  - Removed {outlier_count} rows containing outliers in '{col}'. New shape: {df_Of_Diabetic_Data.shape}")
        else:
            # Otherwise, cap the outliers
            df_Of_Diabetic_Data[col] = np.where(df_Of_Diabetic_Data[col] < lower_bound, lower_bound, df_Of_Diabetic_Data[col])
            df_Of_Diabetic_Data[col] = np.where(df_Of_Diabetic_Data[col] > upper_bound, upper_bound, df_Of_Diabetic_Data[col])
            print(f"  - Capped outliers in '{col}'.")

print(f"\nFinal DataFrame shape after outlier handling: {df_Of_Diabetic_Data.shape}")

Handling Outliers using IQR:
Column 'time_in_hospital': Found 2252 outliers.
  - Removed 2252 rows containing outliers in 'time_in_hospital'. New shape: (99514, 98)
Column 'num_lab_procedures': Found 118 outliers.
  - Capped outliers in 'num_lab_procedures'.
Column 'num_procedures': Found 4665 outliers.
  - Capped outliers in 'num_procedures'.
Column 'num_medications': Found 2561 outliers.
  - Capped outliers in 'num_medications'.
Column 'number_outpatient': Found 16417 outliers.
  - Capped outliers in 'number_outpatient'.
Column 'number_emergency': Found 11191 outliers.
  - Capped outliers in 'number_emergency'.
Column 'number_inpatient': Found 6821 outliers.
  - Capped outliers in 'number_inpatient'.
Column 'diag_1': Found 6426 outliers.
  - Capped outliers in 'diag_1'.
Column 'diag_2': Found 1716 outliers.
  - Capped outliers in 'diag_2'.
Column 'diag_3': Found 1626 outliers.
  - Capped outliers in 'diag_3'.
Column 'number_diagnoses': Found 277 outliers.
  - Capped outliers in 'numb