In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn import preprocessing
import scipy 

In [2]:
# Read the raw records from the CSV file and preview the first rows.
raw_csv_path = 'diabetic_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


## Exploring the Data

### Part1 : Building up a basic predictive Model

#### Data cleaning and Transformation

In [3]:
df.shape

(101766, 50)

In [4]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
# Remove the encounter_id column from the working frame.
df = df.drop(columns=['encounter_id'])

In [6]:
# display data types of each columns
df.dtypes

patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide       

### identifying missing values

In [7]:
# Per-column count of cells that hold the '?' placeholder used for missing data.
counts = df.eq('?').sum()
counts

patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [8]:
len(df)

101766

In [9]:
# Number of distinct patients (patient_nbr values) among the rows.
df['patient_nbr'].nunique()

71518

In [10]:
# Keep only the first row encountered for each patient_nbr, then report how
# many rows remain.
df = df.drop_duplicates(subset=['patient_nbr'], keep='first')
len(df)

71518

In [11]:
# Replace the '?' placeholder with NaN so pandas treats those cells as missing.
# An exact-match replacement is used on purpose: a regex such as r'\?' would
# also null out any string that merely *contains* a question mark.
df = df.replace('?', np.nan)

In [12]:
df.isnull().sum()

patient_nbr                     0
race                         1948
gender                          0
age                             0
weight                      68665
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  31043
medical_specialty           34477
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         11
diag_2                        294
diag_3                       1225
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [13]:
# Number of null cells in every column.
missing_values = df.isnull().sum()

# Names of the columns that contain at least one null cell.
columns_with_missing_values = list(missing_values.index[missing_values > 0])

# Summary table restricted to those columns: absolute count and share of rows.
missing_counts = missing_values[columns_with_missing_values]
missing_data_table = pd.DataFrame({
    'Column Name': columns_with_missing_values,
    'Missing Values': missing_counts,
    'Missing Percentage': (missing_counts / len(df)) * 100,
})

# Plain-text rendering without the index column.
print(missing_data_table.to_string(index=False))

      Column Name  Missing Values  Missing Percentage
             race            1948            2.723790
           weight           68665           96.010794
       payer_code           31043           43.405856
medical_specialty           34477           48.207444
           diag_1              11            0.015381
           diag_2             294            0.411085
           diag_3            1225            1.712856


In [14]:
# Drop columns with a large number of missing values.
df = df.drop(columns=['weight', 'payer_code'])

In [15]:
# Some columns show no variation (e.g. 'examide' and 'citoglipton' have only
# one value). Such near-zero-variance columns are not useful for prediction,
# so the ones listed below are deleted.
cols_to_delete = [
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'tolbutamide',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Remove all of them in one call and show the columns that are left.
df = df.drop(columns=cols_to_delete)
df.columns

Index(['patient_nbr', 'race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed',
       'readmitted'],
      dtype='object')

In [16]:
df.isnull().sum()

patient_nbr                     0
race                         1948
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
medical_specialty           34477
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         11
diag_2                        294
diag_3                       1225
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
glipizide                       0
glyburide                       0
pioglitazone                    0
rosiglitazone                   0
insulin                         0
change                          0
diabetesMed                     0
readmitted    

#After research we found that medical_specialty is an important feature; however, it has too many distinct values, so applying one-hot encoding directly would unnecessarily create a large number of features. To group the values into a smaller number of categories we used a frequency-based approach together with domain knowledge, e.g. all kinds of surgeons are put under the 'surgery' category.

#We could not assign the medical specialties below to any of the groups, so we placed them in the 'ungrouped' category.
#Endocrinology -- glands
#Gastroenterology --stomach
#Gynecology -- women reproduction system
#Hematology -- Blood
#Hematology/Oncology -- Blood
#Hospitalist -- one who takes care of admitted patients
#Oncology -- cancer
#Ophthalmology -- eye
#otolaryngology -- ears, nose, and throat
#Pulmonology -- respiratory
#radiology -- diagnosing and treating injuries and diseases using medical imaging (radiology) procedures (exams/tests) such as X-rays

In [17]:
# Raw medical_specialty labels that make up each coarse group. A label may be
# listed in more than one group; the priority order further down decides which
# group it ends up in (e.g. 'Psychiatry' is listed under high_frequency but is
# claimed by psychic first).
# NOTE(review): 'Pulmonology ' below carries a trailing space, so it only
# matches a value with that exact trailing space; plain 'Pulmonology' is
# labelled 'ungrouped'.
high_frequency = [
    'InternalMedicine', 'Family/GeneralPractice', 'Cardiology',
    'Surgery-General', 'Orthopedics', 'Orthopedics-Reconstructive',
    'Emergency/Trauma', 'Urology', 'ObstetricsandGynecology', 'Psychiatry',
    'Pulmonology ', 'Nephrology', 'Radiologist',
]

low_frequency = [
    'Surgery-PlasticwithinHeadandNeck', 'Psychiatry-Addictive', 'Proctology',
    'Dermatology', 'SportsMedicine', 'Speech', 'Perinatology',
    'Neurophysiology', 'Resident', 'Pediatrics-Hematology-Oncology',
    'Pediatrics-EmergencyMedicine', 'Dentistry', 'DCPTEAM',
    'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology',
    'Surgery-Pediatric', 'AllergyandImmunology', 'Pediatrics-Neurology',
    'Anesthesiology', 'Pathology', 'Cardiology-Pediatric',
    'Endocrinology-Metabolism', 'PhysicianNotFound', 'Surgery-Colon&Rectal',
    'OutreachServices', 'Surgery-Maxillofacial', 'Rheumatology',
    'Anesthesiology-Pediatric', 'Obstetrics',
    'Obsterics&Gynecology-GynecologicOnco',
]

pediatrics = [
    'Pediatrics', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine',
    'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology',
    'Pediatrics-Neurology', 'Pediatrics-Pulmonology',
    'Anesthesiology-Pediatric', 'Cardiology-Pediatric', 'Surgery-Pediatric',
]

psychic = [
    'Psychiatry-Addictive', 'Psychology', 'Psychiatry',
    'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation',
    'Osteopath',
]

neurology = ['Neurology', 'Surgery-Neuro', 'Pediatrics-Neurology', 'Neurophysiology']

surgery = [
    'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic',
    'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial',
    'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic',
    'Surgery-Vascular', 'SurgicalSpecialty', 'Podiatry',
]

ungrouped = [
    'Endocrinology', 'Gastroenterology', 'Gynecology', 'Hematology',
    'Hematology/Oncology', 'Hospitalist', 'InfectiousDiseases', 'Oncology',
    'Ophthalmology', 'Otolaryngology', 'Pulmonology', 'Radiology',
]

# Groups ordered from highest to lowest priority: when a label is listed in
# several groups, the first group in this sequence wins.
group_priority = (
    ('pediatrics', pediatrics),
    ('psychic', psychic),
    ('neurology', neurology),
    ('surgery', surgery),
    ('high_freq', high_frequency),
    ('low_freq', low_frequency),
    ('ungrouped', ungrouped),
)

# Fill the lookup from lowest to highest priority so that a higher-priority
# group overwrites any entry a lower-priority group wrote for the same label.
specialty_to_group = {}
for group_name, group_labels in reversed(group_priority):
    for specialty_label in group_labels:
        specialty_to_group[specialty_label] = group_name

# Every value that is not listed in any group (NaN included) becomes 'missing'.
colMedical = []
for val in df['medical_specialty']:
    colMedical.append(specialty_to_group.get(val, 'missing'))

df['medical_specialty'] = colMedical

##### removing rows with unknown/invalid gender value

In [18]:
# Locate the rows whose gender is recorded as 'Unknown/Invalid' and remove
# them from the frame in place.
is_invalid_gender = df['gender'] == 'Unknown/Invalid'
invalid_gender_rows = df.loc[is_invalid_gender].index
df.drop(index=invalid_gender_rows, inplace=True)

###### Let's deal with diag_1, diag_2 and diag_3 one by one

As the features diag_1, diag_2 and diag_3 have mixed-type values such as 11, 'V25', 'V26' and 'V43', we can't apply the class mean or median here, so let's replace the missing values with the most common value instead.

In [19]:
def _most_common_code(codes):
    """Return the most frequent non-null value of ``codes``.

    Null entries are excluded from the tally so that NaN can never be picked
    as the fill value. If the column holds no observed value at all, NaN is
    returned, which makes the subsequent fill a no-op.
    """
    observed = Counter(codes.dropna())
    return observed.most_common(1)[0][0] if observed else np.nan


# Most frequent observed code of each diagnosis column.
diag_1 = _most_common_code(df['diag_1'])
diag_2 = _most_common_code(df['diag_2'])
diag_3 = _most_common_code(df['diag_3'])

# Replace NaN values in each 'diag' column with the corresponding most common value
for diag_col, fill_code in (('diag_1', diag_1), ('diag_2', diag_2), ('diag_3', diag_3)):
    df[diag_col] = df[diag_col].fillna(fill_code)

In [20]:
# Binary response variable 'readmitted': '<30' -> 1; '>30' and 'NO' -> 0.
readmitted_codes = {'<30': 1, '>30': 0, 'NO': 0}
df['readmitted'] = df['readmitted'].replace(readmitted_codes)

#### removing rows with null values for race 

In [21]:
# Percentage of rows whose race value is null.
race_missing_fraction = df['race'].isnull().sum() / len(df)
race_missing_fraction * 100

2.721107459973432

In [22]:
# Discard every row whose race value is null.
df = df.dropna(axis='index', subset=['race'])

In [23]:
df.isnull().sum()

patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
glipizide                   0
glyburide                   0
pioglitazone                0
rosiglitazone               0
insulin                     0
change                      0
diabetesMed                 0
readmitted                  0
dtype: int64

In [24]:
df.shape

(69569, 30)

#### transforming Age column 

In [25]:
np.unique(df['age'])

array(['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)',
       '[60-70)', '[70-80)', '[80-90)', '[90-100)'], dtype=object)

In [26]:
# Map every ten-year age bracket label to the midpoint of the bracket,
# e.g. '[0-10)' -> 5 and '[90-100)' -> 95.
replace_dict = {
    '[{}-{})'.format(lower, lower + 10): lower + 5
    for lower in range(0, 100, 10)
}

# A label that is not one of the ten brackets raises KeyError.
df['age'] = df['age'].apply(lambda bracket: replace_dict[bracket])
print(df['age'].head())

0     5
1    15
2    25
3    35
4    45
Name: age, dtype: int64


In [27]:
df.isnull().sum()

patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
glipizide                   0
glyburide                   0
pioglitazone                0
rosiglitazone               0
insulin                     0
change                      0
diabetesMed                 0
readmitted                  0
dtype: int64

In [28]:
df.shape

(69569, 30)

In [29]:
# clean_df is another name for the fully processed frame (no copy is made).
clean_df = df

# Destination of the exported CSV file.
file_path = 'clean_data.csv'

# Write the frame to disk without the index column.
clean_df.to_csv(file_path, index=False)