In [92]:
import pandas as pd
import numpy as np
import re

In [93]:
# Load the raw encounter table.
# The literal string 'None' is a real category in this file (it appears in
# max_glu_serum and A1Cresult).  pandas >= 2.0 added 'None' to read_csv's
# default NA markers, which would silently turn that category into NaN, so
# the pre-2.0 default marker list is pinned explicitly here.
LEGACY_NA_VALUES = [
    '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
    'null',
]
diabetic_data = pd.read_csv(
    'data/diabetic_data.csv',
    keep_default_na=False,       # do not use the version-dependent defaults
    na_values=LEGACY_NA_VALUES,  # same markers as before, minus 'None'
)
diabetic_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [94]:
# Eyeball 25 randomly chosen raw rows (unseeded, so the rows differ per run).
diabetic_data.sample(n=25)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
75541,226748574,102079377,Caucasian,Male,[60-70),[100-125),3,1,1,1,...,No,No,No,No,No,No,No,No,Yes,<30
65835,183770118,44127414,Caucasian,Female,[50-60),?,1,1,7,3,...,No,No,No,No,No,No,No,Ch,Yes,>30
28336,92864526,6866964,?,Male,[60-70),?,2,1,1,3,...,No,Steady,No,No,No,No,No,No,Yes,NO
83134,259861338,58474134,Caucasian,Female,[80-90),?,1,3,7,7,...,No,Steady,No,No,No,No,No,No,Yes,NO
47583,146111946,82666917,Caucasian,Male,[0-10),?,2,1,1,6,...,No,Up,No,No,No,No,No,Ch,Yes,>30
52315,154994880,3447360,Caucasian,Male,[70-80),?,6,1,17,1,...,No,No,No,No,No,No,No,No,Yes,NO
62869,174988638,38989323,AfricanAmerican,Male,[80-90),?,1,4,2,9,...,No,No,No,No,No,No,No,No,No,NO
12843,51834210,75627180,AfricanAmerican,Female,[60-70),?,1,1,7,4,...,No,No,No,No,No,No,No,No,Yes,>30
5613,28961514,108341001,Caucasian,Male,[70-80),?,1,1,7,3,...,No,Steady,No,No,No,No,No,No,Yes,<30
26681,87752796,24105465,Caucasian,Female,[80-90),?,1,14,17,8,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [95]:
# Columns excluded from the analysis.
drop_columns = [
    'weight',      # too many missing values (recorded as '?')
    'payer_code',  # not related to the diabetic data
]

In [96]:
# Build the working frame without the excluded columns; the raw frame is
# left untouched because drop() returns a new DataFrame.
dropped_diabetic_data = diabetic_data.drop(columns=drop_columns)
dropped_diabetic_data.sample(n=25)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
32481,103517430,78510465,Caucasian,Female,[50-60),1,1,7,1,InternalMedicine,...,No,Down,No,No,No,No,No,Ch,Yes,>30
75391,225883482,28770885,Caucasian,Male,[60-70),2,6,1,4,?,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
78129,238248402,65029725,Caucasian,Female,[20-30),2,1,1,3,?,...,No,Steady,No,No,No,No,No,No,Yes,NO
90978,303152948,42405741,Caucasian,Female,[60-70),3,3,1,5,Nephrology,...,No,Down,No,No,No,No,No,Ch,Yes,>30
37328,115613730,68268114,Caucasian,Male,[70-80),6,1,1,6,?,...,No,No,No,No,No,No,No,No,Yes,>30
97744,392538788,138719228,Caucasian,Male,[70-80),1,6,7,4,?,...,No,Down,No,No,No,No,No,Ch,Yes,NO
74163,220619442,96421806,Caucasian,Male,[80-90),2,3,7,1,Family/GeneralPractice,...,No,No,No,No,No,No,No,No,No,>30
2523,16416120,1123560,Caucasian,Female,[40-50),1,1,7,6,InternalMedicine,...,No,Up,No,No,No,No,No,Ch,Yes,NO
69164,196213362,59583888,Other,Male,[30-40),1,1,7,1,?,...,No,Down,No,No,No,No,No,Ch,Yes,>30
61467,171586554,102482199,Caucasian,Male,[70-80),2,1,7,1,Emergency/Trauma,...,No,Down,No,No,No,No,No,Ch,Yes,NO


In [97]:
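# Some values are recorded as the placeholder string '?'.
# Replacing them with NaN would make missing data consistent across the dataset.
# NOTE: the replacement below is currently disabled (commented out).
# dropped_diabetic_data.replace(to_replace='?', value=np.nan, inplace=True)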

In [98]:
dropped_diabetic_data['metformin'].value_counts()
# Down: -1
# No: 0
# Up: 1
# Steady: 2

No        81778
Steady    18346
Up         1067
Down        575
Name: metformin, dtype: int64

In [99]:
# Category counts for 'repaglinide'.
dropped_diabetic_data.loc[:, 'repaglinide'].value_counts()

No        100227
Steady      1384
Up           110
Down          45
Name: repaglinide, dtype: int64

In [100]:
# Category counts for 'nateglinide'.
dropped_diabetic_data.loc[:, 'nateglinide'].value_counts()

No        101063
Steady       668
Up            24
Down          11
Name: nateglinide, dtype: int64

In [101]:
# Encode the dose-change categories of the three drug columns as integers,
# using a single shared mapping so the columns cannot drift apart.
# NOTE(review): 'Steady' (2) is coded above 'Up' (1), so these codes are
# labels rather than an ordered scale - confirm before treating them as ordinal.
dose_change_codes = {'Down': -1, 'No': 0, 'Up': 1, 'Steady': 2}
encoded_drug_columns = ['metformin', 'repaglinide', 'nateglinide']

for drug_column in encoded_drug_columns:
    # replace() (not map()) keeps the cell safe to re-run: already-encoded
    # integers are left alone.  infer_objects() makes the object -> int64
    # conversion explicit instead of relying on replace()'s silent
    # downcasting, which pandas 2.2 deprecated.
    dropped_diabetic_data[drug_column] = (
        dropped_diabetic_data[drug_column]
        .replace(dose_change_codes)
        .infer_objects()
    )

In [102]:
# Spot-check 25 random rows of the three encoded drug columns.
dropped_diabetic_data.loc[:, ['metformin', 'repaglinide', 'nateglinide']].sample(n=25)

Unnamed: 0,metformin,repaglinide,nateglinide
60683,0,0,0
75006,0,0,0
37035,2,0,0
80171,0,0,0
18978,0,0,0
29803,0,0,0
24783,0,0,0
60225,0,0,0
77257,0,0,0
100775,0,0,0


In [103]:
# Check the dtypes of the three encoded drug columns.
dropped_diabetic_data.loc[:, ['metformin', 'repaglinide', 'nateglinide']].dtypes

metformin      int64
repaglinide    int64
nateglinide    int64
dtype: object

In [104]:
dropped_diabetic_data['change'].value_counts()
# No = 0
# Ch = 1

No    54755
Ch    47011
Name: change, dtype: int64

In [105]:
# Encode 'change' as an integer flag: No -> 0, Ch -> 1.
# infer_objects() makes the object -> int64 conversion explicit instead of
# relying on replace()'s silent downcasting, which pandas 2.2 deprecated.
dropped_diabetic_data['change'] = (
    dropped_diabetic_data['change']
    .replace({'No': 0, 'Ch': 1})
    .infer_objects()
)

In [106]:
dropped_diabetic_data['diabetesMed'].value_counts()
# No = 0
# Yes = 1

Yes    78363
No     23403
Name: diabetesMed, dtype: int64

In [107]:
# Encode 'diabetesMed' as an integer flag: No -> 0, Yes -> 1.
# infer_objects() makes the object -> int64 conversion explicit instead of
# relying on replace()'s silent downcasting, which pandas 2.2 deprecated.
dropped_diabetic_data['diabetesMed'] = (
    dropped_diabetic_data['diabetesMed']
    .replace({'No': 0, 'Yes': 1})
    .infer_objects()
)

In [108]:
# Category counts for 'max_glu_serum'.
# Planned integer encoding: 'None' -> 0, 'Norm' -> 1, '>200' -> 2, '>300' -> 3
dropped_diabetic_data.loc[:, 'max_glu_serum'].value_counts()

None    96420
Norm     2597
>200     1485
>300     1264
Name: max_glu_serum, dtype: int64

In [109]:
# Encode 'max_glu_serum': 'None' -> 0, 'Norm' -> 1, '>200' -> 2, '>300' -> 3.
# infer_objects() makes the object -> int64 conversion explicit instead of
# relying on replace()'s silent downcasting, which pandas 2.2 deprecated.
dropped_diabetic_data['max_glu_serum'] = (
    dropped_diabetic_data['max_glu_serum']
    .replace({'None': 0, 'Norm': 1, '>200': 2, '>300': 3})
    .infer_objects()
)

In [110]:
# Category counts for 'A1Cresult'.
# Planned integer encoding: 'None' -> 0, 'Norm' -> 1, '>7' -> 2, '>8' -> 3
dropped_diabetic_data.loc[:, 'A1Cresult'].value_counts()

None    84748
>8       8216
Norm     4990
>7       3812
Name: A1Cresult, dtype: int64

In [111]:
# Encode 'A1Cresult': 'None' -> 0, 'Norm' -> 1, '>7' -> 2, '>8' -> 3.
# infer_objects() makes the object -> int64 conversion explicit instead of
# relying on replace()'s silent downcasting, which pandas 2.2 deprecated.
dropped_diabetic_data['A1Cresult'] = (
    dropped_diabetic_data['A1Cresult']
    .replace({'None': 0, 'Norm': 1, '>7': 2, '>8': 3})
    .infer_objects()
)

In [112]:
dropped_diabetic_data['readmitted'].value_counts()
# NO = 0
# <30 = 1
# >30 = 2

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [113]:
# Encode 'readmitted': NO -> 0, <30 -> 1, >30 -> 2.
# infer_objects() makes the object -> int64 conversion explicit instead of
# relying on replace()'s silent downcasting, which pandas 2.2 deprecated.
dropped_diabetic_data['readmitted'] = (
    dropped_diabetic_data['readmitted']
    .replace({'NO': 0, '<30': 1, '>30': 2})
    .infer_objects()
)

In [114]:
# Names of the three diagnosis-code columns: diag_1, diag_2, diag_3.
diag = [f'diag_{position}' for position in (1, 2, 3)]
# Preview 25 random rows of those columns.
dropped_diabetic_data.loc[:, diag].sample(n=25)

Unnamed: 0,diag_1,diag_2,diag_3
1870,414.0,411,428
45494,428.0,427,496
58401,491.0,438,438
412,820.0,285,250.01
13296,414.0,V45,250
2576,410.0,427,250
17555,786.0,780,414
57103,507.0,401,414
46403,153.0,196,280
83013,584.0,250.22,204
