## Data Cleaner

In [44]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import sklearn as sk
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [30]:
# Load data
data = pd.read_csv(os.path.join("dataset", "diabetic_data.csv"))
data.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,?,?,31,6,16,0,0,0,414.0,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,?,?,70,1,21,0,0,0,414.0,411,V45,7,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),?,1,1,7,5,?,?,73,0,12,0,0,0,428.0,492,250,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,?,?,68,2,28,0,0,0,398.0,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,?,InternalMedicine,33,3,18,0,0,0,434.0,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [31]:
data.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [32]:
# Convert age frm categorical to numeric
# due to age contain more information
replaceDict = {'[0-10)' : 5,
'[10-20)' : 15,
'[20-30)' : 25, 
'[30-40)' : 35, 
'[40-50)' : 45, 
'[50-60)' : 55,
'[60-70)' : 65, 
'[70-80)' : 75,
'[80-90)' : 85,
'[90-100)' : 95}

data['age'] = data['age'].apply(lambda x : replaceDict[x])





In [33]:
print('Total data = ', len(data))
# data.drop_duplicates(['patient_nbr'], keep = 'first', inplace = True)

Total data =  101766


In [60]:
# Remove nonuseful features
data = data.drop(columns=["encounter_id", "patient_nbr", "payer_code", "weight"])

#remove features with low variance
data = data.drop(columns=["max_glu_serum","repaglinide","nateglinide","chlorpropamide","glimepiride","acetohexamide","tolbutamide",              "acarbose",                
"miglitol" ,                "troglitazone"   ,         
"tolazamide"           ,    "examide"     ,            
"citoglipton"         ,     "glyburide-metformin"    , 
"glipizide-metformin"    ,  "glimepiride-pioglitazone",
"metformin-rosiglitazone" , "metformin-pioglitazone"])

    

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Caucasian,Female,5,5,18,1,1,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,0,-20,-20,-20,-20,-20,-20,-1,-1,NO
1,Caucasian,Female,15,1,1,7,3,?,59,0,18,0,0,0,276.0,250.01,255,9,0,-20,-20,-20,-20,-20,10,1,1,>30
2,AfricanAmerican,Female,25,1,1,7,2,?,11,5,13,2,0,1,648.0,250,V27,6,0,-20,0,-20,-20,-20,-20,-1,1,NO
3,Caucasian,Male,35,1,1,7,2,?,44,1,16,0,0,0,8.0,250.43,403,7,0,-20,-20,-20,-20,-20,10,1,1,NO
4,Caucasian,Male,45,1,1,7,1,?,51,0,8,0,0,0,197.0,157,250,5,0,-20,0,-20,-20,-20,0,1,1,NO


In [35]:
# Group some ids mapping to 1
# [6, 8, 9, 13] discharge to home
# [3, 4, 5, 14, 22, 23, 24] discharge to medical facility
# [12, 15, 16, 17] discharge, related to institution
# [19, 20, 21] expired
# [25, 26] unknow or invalid

data['discharge_disposition_id'] = data['discharge_disposition_id'].apply(lambda x : 1 if int(x) in [6, 8, 9, 13] 
                                                                           else ( 2 if int(x) in [3, 4, 5, 14, 22, 23, 24]
                                                                           else ( 10 if int(x) in [12, 15, 16, 17]
                                                                           else ( 11 if int(x) in [19, 20, 21]
                                                                           else ( 18 if int(x) in [25, 26] 
                                                                           else int(x) )))))

data = data[~data.discharge_disposition_id.isin([11,13,14,19,20,21])]

data['admission_type_id'] = data['admission_type_id'].apply(lambda x : 1 if int(x) in [2, 7]
                                                            else ( 5 if int(x) in [6, 8]
                                                            else int(x) ))

data['admission_source_id'] = data['admission_source_id'].apply(lambda x : 1 if int(x) in [2, 3]
                                                            else ( 4 if int(x) in [5, 6, 10, 22, 25]
                                                            else ( 9 if int(x) in [15, 17, 20, 21]
                                                            else ( 11 if int(x) in [13, 14]
                                                            else int(x) ))))

# Process chemical test data
for col in ["metformin", "repaglinide", "nateglinide", "chlorpropamide", "glimepiride", "acetohexamide", "glipizide", "glyburide", "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose", "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton", "insulin", "glyburide-metformin", "glipizide-metformin", "glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]:
    data[col] = data[col].apply(lambda x : 10 if x == 'Up' 
                                              else ( -10 if x == 'Down'                                                          
                                              else ( 0 if x == 'Steady'
                                              else  -20)))

# Possible label
data['change'] = data['change'].apply(lambda x : 1 if x == 'Ch'
                                                 else -1)

# Possible label
data['diabetesMed'] = data['diabetesMed'].apply(lambda x : -1 if x == 'No'
                                                else 1)


data['max_glu_serum'] = data['max_glu_serum'].apply(lambda x : 200 if x == '>200' 
                                                            else ( 300 if x == '>300'                                                          
                                                            else ( 100 if x == 'Norm'
                                                            else  0)))

data['A1Cresult'] = data['A1Cresult'].apply(lambda x : 7 if x == '>7' 
                                                         else (8 if  x == '>8'                                                        
                                                         else ( 5 if x == 'Norm'
                                                         else  0)))

In [36]:
# Clean missing values on diag_1, diag_2, diag_3
common_diag_1 = Counter(list(data["diag_1"])).most_common(1)[0][0]
common_diag_2 = Counter(list(data["diag_2"])).most_common(1)[0][0]
common_diag_3 = Counter(list(data["diag_3"])).most_common(1)[0][0]

data["diag_1"] = data["diag_1"].fillna(common_diag_1)
data["diag_2"] = data["diag_2"].fillna(common_diag_2)
data["diag_3"] = data["diag_3"].fillna(common_diag_3)

In [61]:
#getting subset of data for race
missing_values = data[data['race'] == '?']
complete_values = data[data['race'] != '?']
training_y = complete_values['race']
training_X = complete_values.drop('race',axis=1)

training_y 
training_X

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Female,5,5,18,1,1,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,0,-20,-20,-20,-20,-20,-20,-1,-1,NO
1,Female,15,1,1,7,3,?,59,0,18,0,0,0,276,250.01,255,9,0,-20,-20,-20,-20,-20,10,1,1,>30
2,Female,25,1,1,7,2,?,11,5,13,2,0,1,648,250,V27,6,0,-20,0,-20,-20,-20,-20,-1,1,NO
3,Male,35,1,1,7,2,?,44,1,16,0,0,0,8,250.43,403,7,0,-20,-20,-20,-20,-20,10,1,1,NO
4,Male,45,1,1,7,1,?,51,0,8,0,0,0,197,157,250,5,0,-20,0,-20,-20,-20,0,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,Male,75,1,2,7,3,?,51,0,16,0,0,0,250.13,291,458,9,8,0,-20,-20,-20,-20,-10,1,1,>30
101762,Female,85,1,2,4,5,?,33,3,18,0,0,1,560,276,787,9,0,-20,-20,-20,-20,-20,0,-1,1,NO
101763,Male,75,1,1,7,1,?,53,0,9,1,0,0,38,590,296,13,0,0,-20,-20,-20,-20,-10,1,1,NO
101764,Female,85,1,2,7,10,Surgery-General,45,2,21,0,0,1,996,285,998,9,0,-20,0,-20,0,-20,10,1,1,NO


In [52]:
# More preprocess data operations

# TODO: race, random forest to process 
rf = RandomForestClassifier(n_estimators = 100, max_depth=25, criterion = "gini", random_state = 23)
rf.fit(training_X, training_y)
predicted = rf.predict(data_testt)
data_testt['race'] = predicted
data_trainn['race'] = y
data2 = pd.concat([data_testt, data_trainn], axis = 0);
race_df = pd.DataFrame(data2.race.values, index = data2.index)

race_df.columns = ['race']
data_new = pd.merge(data, race_df, right_index = True, left_index = True)
# TODO: process mdeical specialty

# TODO: may or may not create a new feature from orginal dataset

# TODO: may or may not remove least significant features

ValueError: could not convert string to float: 'Female'

In [53]:
# high_frequency = ['InternalMedicine', 'Family/GeneralPractice', 'Cardiology', 'Surgery-General', 'Orthopedics', 'Orthopedics-Reconstructive', 
#                  'Emergency/Trauma', 'Urology','ObstetricsandGynecology','Psychiatry','Pulmonology ','Nephrology','Radiologist']

# low_frequency = ['Surgery-PlasticwithinHeadandNeck','Psychiatry-Addictive','Proctology','Dermatology','SportsMedicine','Speech','Perinatology',\
#                 'Neurophysiology','Resident','Pediatrics-Hematology-Oncology','Pediatrics-EmergencyMedicine','Dentistry','DCPTEAM','Psychiatry-Child/Adolescent',\
#                 'Pediatrics-Pulmonology','Surgery-Pediatric','AllergyandImmunology','Pediatrics-Neurology','Anesthesiology','Pathology','Cardiology-Pediatric',\
#                 'Endocrinology-Metabolism','PhysicianNotFound','Surgery-Colon&Rectal','OutreachServices',\
#                 'Surgery-Maxillofacial','Rheumatology','Anesthesiology-Pediatric','Obstetrics','Obsterics&Gynecology-GynecologicOnco']

# pediatrics = ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine','Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology',\
#                'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric', 'Cardiology-Pediatric', 'Surgery-Pediatric']

# psychic = ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',  'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation', 'Osteopath']


# neurology = ['Neurology', 'Surgery-Neuro',  'Pediatrics-Neurology', 'Neurophysiology']


# surgery = ['Surgeon', 'Surgery-Cardiovascular', \
#           'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', \
#              'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck',  'Surgery-Thoracic',\
#              'Surgery-Vascular', 'SurgicalSpecialty', 'Podiatry']
             
# ungrouped = ['Endocrinology','Gastroenterology','Gynecology','Hematology','Hematology/Oncology','Hospitalist','InfectiousDiseases',\
#            'Oncology','Ophthalmology','Otolaryngology','Pulmonology','Radiology']


# missing = ['?']

In [54]:
# data['diag_1'].apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1)  
#                                         else ('circulatory' if int(float(x)) in range(390, 460) or int(float(x)) == 785
#                                         else     ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786
#                                         else     ('digestive'   if int(float(x)) in range(520, 580) or int(float(x)) == 787
#                                         else     ('diabetes'    if int(float(x)) == 250
#                                         else     ('injury'      if int(float(x)) in range(800, 1000)
#                                         else ('musculoskeletal' if int(float(x)) in range(710, 740)
#                                         else ('genitourinary'   if int(float(x)) in range(580, 630) or int(float(x)) == 788
#                                         else ('neoplasms'       if int(float(x)) in range(140, 240)
#                                         else ('pregnecy'        if int(float(x)) in range(630, 680)
#                                         else 'other'))))))))))

ValueError: could not convert string to float: '?'

In [16]:
# Output clean dataset
data.to_csv(path_or_buf=os.path.join("dataset", "clean_diabetic_dataset.csv"), index=False)

# Notes: Splitting data must be performed in main.py

0            Caucasian
1            Caucasian
2      AfricanAmerican
3            Caucasian
4            Caucasian
5            Caucasian
6            Caucasian
7            Caucasian
8            Caucasian
9            Caucasian
10     AfricanAmerican
11     AfricanAmerican
12           Caucasian
13           Caucasian
14     AfricanAmerican
15     AfricanAmerican
16     AfricanAmerican
17           Caucasian
18     AfricanAmerican
19                   ?
20                   ?
21                   ?
22     AfricanAmerican
23           Caucasian
24     AfricanAmerican
25               Other
26           Caucasian
27           Caucasian
28           Caucasian
29           Caucasian
30     AfricanAmerican
31     AfricanAmerican
32           Caucasian
33           Caucasian
35           Caucasian
36           Caucasian
37           Caucasian
38           Caucasian
39           Caucasian
40           Caucasian
41           Caucasian
42           Caucasian
43           Caucasian
45         