In [109]:
from collections import Counter
import pandas as pd
import numpy as np
import warnings
# Silences every warning category for the rest of the session, which includes
# pandas FutureWarning / deprecation messages raised by later cells.
warnings.filterwarnings("ignore")

In [152]:
# Read the raw CSV from the parent directory and preview its first rows.
raw_path = '../diabetic_data.csv'
df = pd.read_csv(raw_path)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [154]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(101766, 50)

In [155]:
# Per-column dtypes: shows which columns were parsed as integers and which as generic objects.
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [156]:
# Remove the encounter_id column; it is not used by any later cell.
df = df.drop(columns=['encounter_id'])

We found that some patients have several entries in the dataset. These repeated entries are not helpful for solving the task, so we remove them and keep only the first entry for each patient.

In [157]:
# Report the row count and the number of distinct patients, then keep only
# the first row encountered for each patient_nbr.
n_rows = len(df)
n_patients = len(np.unique(df['patient_nbr']))
print('Total data = ', n_rows)
print('Unique entries = ', n_patients)

df = df.drop_duplicates(subset=['patient_nbr'], keep='first')
print('Length after removing Duplicates:', len(df))

Total data =  101766
Unique entries =  71518
Length after removing Duplicates: 71518


In [158]:
# Replace the '?' placeholder with NaN so pandas counts those cells as missing.
# Match the exact value '?' rather than the regex r'\?': the unanchored pattern
# would also turn any string that merely contains a question mark into NaN.
df = df.replace('?', np.nan)

In [159]:
# Number of NaN cells in each column after the '?' placeholders were converted.
df.isna().sum()

patient_nbr                     0
race                         1948
gender                          0
age                             0
weight                      68665
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  31043
medical_specialty           34477
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         11
diag_2                        294
diag_3                       1225
number_diagnoses                0
max_glu_serum               68062
A1Cresult                   58532
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [160]:
# Count of NaN cells per column and the same figure as a percentage of all rows.
missing_values = df.isnull().sum()
missing_pct = missing_values / len(df) * 100

# One row per column of df (columns without any missing value are listed too, with 0).
missing_data_table = pd.DataFrame({
    'Column Name': df.columns,
    'Missing Values': missing_values,
    'Missing Percentage': missing_pct,
})

# Plain-text rendering without the index.
print(missing_data_table.to_string(index=False))

             Column Name  Missing Values  Missing Percentage
             patient_nbr               0            0.000000
                    race            1948            2.723790
                  gender               0            0.000000
                     age               0            0.000000
                  weight           68665           96.010794
       admission_type_id               0            0.000000
discharge_disposition_id               0            0.000000
     admission_source_id               0            0.000000
        time_in_hospital               0            0.000000
              payer_code           31043           43.405856
       medical_specialty           34477           48.207444
      num_lab_procedures               0            0.000000
          num_procedures               0            0.000000
         num_medications               0            0.000000
       number_outpatient               0            0.000000
        number_emergency

In [161]:
# Drop columns dominated by missing values. Per the counts above (out of 71518 rows):
# weight ~96%, max_glu_serum ~95%, A1Cresult ~82% missing.
# NOTE(review): payer_code is only ~43% missing, so it does not meet an ">80%" rule;
# it is dropped here as well -- confirm that this is intentional.
df = df.drop(columns=['weight', 'max_glu_serum', 'A1Cresult', 'payer_code'])

In [162]:
# (rows, columns) after removing the sparse columns.
df.shape

(71518, 45)

In [163]:
# these are categorical variables so we can't use mean of median.
diag_1 = df['diag_1'].mode()[0]
diag_2 = df['diag_2'].mode()[0]
diag_3 = df['diag_3'].mode()[0]

df['diag_1'] = df['diag_1'].fillna(diag_1)
df['diag_2'] = df['diag_2'].fillna(diag_2)
df['diag_3'] = df['diag_3'].fillna(diag_3)

In [164]:
# Re-check NaN counts per column after imputing the diagnosis codes.
df.isna().sum()

patient_nbr                     0
race                         1948
gender                          0
age                             0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
medical_specialty           34477
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide                     0
pioglitazone                    0
rosiglitazone 

In [165]:
# (rows, columns); imputation fills values only, so the size should match the previous check.
df.shape

(71518, 45)

After some research we found that medical_specialty is an important feature; however, it has too many distinct values, so one-hot encoding it would unnecessarily create a large number of features. To group the values into a smaller number of categories we used a frequency-based approach together with domain knowledge, e.g. all kinds of surgeons are put under the 'surgery' category. <br>

<p>
We could not assign the medical specialties below to any other group, so we placed them in the 'ungrouped' category. <br>
Endocrinology -- glands <br>
Gastroenterology --stomach <br>
Gynecology -- women reproduction system <br>
Hematology -- Blood <br>
Hematology/Oncology -- Blood <br>
Hospitalist -- one who takes care of admitted patients <br>
Oncology -- cancer <br>
Ophthalmology -- eye <br>
otolaryngology -- ears, nose, and throat <br>
Pulmonology -- respiratory <br>
radiology -- diagnosing and treating injuries and diseases using medical imaging (radiology) procedures (exams/tests) such as X-rays <br>
</p>

In [166]:
# Collapse the many distinct medical_specialty values into a handful of buckets.
# A specialty may appear in several lists (e.g. 'Pediatrics-Neurology'); it then
# receives the label of the first list, in the precedence order given below.

# NOTE(review): 'Pulmonology ' carries a trailing space, so it only matches values
# that also end in a space; a plain 'Pulmonology' falls through to `ungrouped`.
# Confirm which bucket is intended.
high_frequency = [
    'InternalMedicine', 'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
    'Orthopedics', 'Orthopedics-Reconstructive', 'Emergency/Trauma', 'Urology',
    'ObstetricsandGynecology', 'Psychiatry', 'Pulmonology ', 'Nephrology', 'Radiologist',
]

low_frequency = [
    'Surgery-PlasticwithinHeadandNeck', 'Psychiatry-Addictive', 'Proctology', 'Dermatology',
    'SportsMedicine', 'Speech', 'Perinatology', 'Neurophysiology', 'Resident',
    'Pediatrics-Hematology-Oncology', 'Pediatrics-EmergencyMedicine', 'Dentistry', 'DCPTEAM',
    'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology', 'Surgery-Pediatric',
    'AllergyandImmunology', 'Pediatrics-Neurology', 'Anesthesiology', 'Pathology',
    'Cardiology-Pediatric', 'Endocrinology-Metabolism', 'PhysicianNotFound',
    'Surgery-Colon&Rectal', 'OutreachServices', 'Surgery-Maxillofacial', 'Rheumatology',
    'Anesthesiology-Pediatric', 'Obstetrics', 'Obsterics&Gynecology-GynecologicOnco',
]

pediatrics = [
    'Pediatrics', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine',
    'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-Neurology',
    'Pediatrics-Pulmonology', 'Anesthesiology-Pediatric', 'Cardiology-Pediatric',
    'Surgery-Pediatric',
]

psychic = [
    'Psychiatry-Addictive', 'Psychology', 'Psychiatry', 'Psychiatry-Child/Adolescent',
    'PhysicalMedicineandRehabilitation', 'Osteopath',
]

neurology = ['Neurology', 'Surgery-Neuro', 'Pediatrics-Neurology', 'Neurophysiology']

surgery = [
    'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic',
    'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Plastic',
    'Surgery-PlasticwithinHeadandNeck', 'Surgery-Thoracic', 'Surgery-Vascular',
    'SurgicalSpecialty', 'Podiatry',
]

ungrouped = [
    'Endocrinology', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology',
    'Hospitalist', 'InfectiousDiseases', 'Oncology', 'Ophthalmology', 'Otolaryngology',
    'Pulmonology', 'Radiology',
]

# (label, members) pairs from highest to lowest precedence.
specialty_groups = [
    ('pediatrics', pediatrics),
    ('psychic', psychic),
    ('neurology', neurology),
    ('surgery', surgery),
    ('high_freq', high_frequency),
    ('low_freq', low_frequency),
    ('ungrouped', ungrouped),
]

# Flatten into one specialty -> label lookup. Walking the groups in reverse lets
# a higher-precedence group overwrite an entry written by a lower one.
specialty_to_group = {}
for label, members in reversed(specialty_groups):
    specialty_to_group.update(dict.fromkeys(members, label))

# Vectorised lookup instead of a per-row membership scan over seven lists.
# NaN and any specialty that is absent from every list both become 'missing'.
# NOTE(review): unlisted specialties are therefore indistinguishable from truly
# missing ones -- confirm this is acceptable.
# Not idempotent: re-running this cell on the already-bucketed column maps every
# row to 'missing', because the bucket labels are not members of any list.
colMedical = df['medical_specialty'].map(specialty_to_group).fillna('missing').tolist()

df['medical_specialty'] = colMedical

In [167]:
# NaN counts per column after bucketing medical_specialty.
df.isna().sum()

patient_nbr                    0
race                        1948
gender                         0
age                            0
admission_type_id              0
discharge_disposition_id       0
admission_source_id            0
time_in_hospital               0
medical_specialty              0
num_lab_procedures             0
num_procedures                 0
num_medications                0
number_outpatient              0
number_emergency               0
number_inpatient               0
diag_1                         0
diag_2                         0
diag_3                         0
number_diagnoses               0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazone                   0
rosiglitazone                  0
acarbose  

In [168]:
# How many rows carry the 'Unknown/Invalid' gender value.
df.loc[df['gender'] == 'Unknown/Invalid', 'gender'].count()

3

In [169]:
# Remove the rows whose gender is 'Unknown/Invalid', selected by their index labels.
invalid_gender_idx = df.index[df['gender'] == 'Unknown/Invalid']
df = df.drop(index=invalid_gender_idx)

In [170]:
# Distinct values present in the race column (NaN included).
df['race'].unique()

array(['Caucasian', 'AfricanAmerican', nan, 'Other', 'Asian', 'Hispanic'],
      dtype=object)

In [171]:
# Share of each race category in percent; value_counts leaves NaN out by default.
race_share = df['race'].value_counts(normalize=True) * 100
print("Proportion of Race")
print(race_share)

Proportion of Race
race
Caucasian          76.889132
AfricanAmerican    18.524055
Hispanic            2.180569
Other               1.691846
Asian               0.714399
Name: proportion, dtype: float64


In [172]:
# Fold the two smallest race categories into the existing "Other" category.
mapped_race = dict.fromkeys(("Asian", "Hispanic"), "Other")
df['race'] = df['race'].replace(mapped_race)

# Recompute the percentage breakdown to confirm the merge.
race_share_mapped = df['race'].value_counts(normalize=True) * 100
print("Proportion of Race After the Mapping")
print(race_share_mapped)

Proportion of Race After the Mapping
race
Caucasian          76.889132
AfricanAmerican    18.524055
Other               4.586813
Name: proportion, dtype: float64


In [173]:
# Impute missing race values with the most frequent category (the first mode).
# The result is assigned back rather than calling fillna(inplace=True) on
# df["race"]: that chained in-place form works on an intermediate Series and,
# with pandas Copy-on-Write enabled, leaves df itself unchanged.
df["race"] = df["race"].fillna(df["race"].mode()[0])

In [174]:
# (rows, columns) after dropping the invalid-gender rows and imputing race.
df.shape

(71515, 45)

In [177]:
# Distinct values of citoglipton; a single value means the column carries no information.
df['citoglipton'].unique()

array(['No'], dtype=object)

In [178]:
# Distinct values of examide; a single value means the column carries no information.
df['examide'].unique()

array(['No'], dtype=object)

In [179]:
# Both columns hold only the value 'No' (see the unique() checks), so remove them.
df = df.drop(columns=['citoglipton', 'examide'])

In [85]:
# Column labels that remain after cleaning.
# NOTE(review): the saved output of this cell does not list 'medical_specialty' and its
# execution count (85) is out of sequence with the surrounding cells, so the output
# looks like it comes from an earlier run -- re-run the notebook top to bottom to refresh it.
df.columns

Index(['patient_nbr', 'race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [184]:
# Destination of the cleaned dataset, relative to the notebook's working directory.
file_path = 'cleaned_data.csv'

# cleaned_df is another name for df (no copy is made); write it out without the index column.
cleaned_df = df
cleaned_df.to_csv(file_path, index=False)