In [6]:
import pandas as pd

In [7]:
# Load the raw records into the working frame `df`.
# NOTE(review): later outputs show '?' in columns such as `weight` and `race`
# while isnull() reports 0 for them, so '?' looks like a missing-value
# placeholder that read_csv does not turn into NaN by default — confirm.
RAW_CSV = 'diabetic_data.csv'
df = pd.read_csv(RAW_CSV)

In [8]:
# First look at the frame: (rows, columns) followed by the column labels.
for overview in (df.shape, df.columns):
    print(overview)

(101766, 50)
Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


In [9]:
# Per-column count of values that pandas recognises as null (NaN/None).
# NOTE(review): placeholder strings such as '?' are not counted by this check.
null_counts = df.isna().sum()
print(null_counts)


encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [12]:
# Preview the first five rows to sanity-check the parsed values.
print(df.head(n=5))

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [14]:
# Frequency of each distinct value in the `readmitted` column.
readmitted_counts = df['readmitted'].value_counts()
print(readmitted_counts)

readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64


In [20]:
# Confirm time_in_hospital was parsed as a numeric column (expected: int64).
df['time_in_hospital'].dtype

dtype('int64')

In [27]:
# Summarise missing data per column: absolute null count and share of all rows (in %).
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Put both measures side by side in one table, indexed by column name.
missing_data = pd.DataFrame({'Missing_Count': missing, 'Percentage': missing_pct})

# Keep only the columns that actually contain nulls and list the worst offenders first.
has_missing = missing_data['Missing_Count'] > 0
print(missing_data.loc[has_missing].sort_values(by='Percentage', ascending=False))


               Missing_Count  Percentage
max_glu_serum          96420   94.746772
A1Cresult              84748   83.277322


In [28]:
# Cleaning rule: drop every column whose share of missing values exceeds 70%.
#
# The columns are derived from the data instead of being hard-coded, so the
# cell implements the rule it states and can be re-run safely: once the sparse
# columns are gone nothing exceeds the threshold and drop() receives an empty
# list (the hard-coded list raised KeyError on a second run).
# With the missing-value table printed earlier in this notebook, this selects
# exactly 'max_glu_serum' and 'A1Cresult'.
# NOTE(review): only values pandas treats as null are counted here; '?'
# placeholder strings are not.
MISSING_DROP_THRESHOLD = 0.70  # maximum tolerated fraction of null rows per column

missing_share = df.isnull().mean()  # fraction of null rows, per column
sparse_columns = missing_share[missing_share > MISSING_DROP_THRESHOLD].index.tolist()

# drop(columns=[...]) removes whole columns; drop(index=[...]) would remove rows.
df = df.drop(columns=sparse_columns)
print(f"New shape: {df.shape}")

New shape: (101766, 48)


In [30]:
# Look for rows that are exact copies of an earlier row (every column equal).
dup_mask = df.duplicated()
duplicates = dup_mask.sum()
print(f"Num of duplicate rows: {duplicates}")

# When duplicates exist, show a sample of them:
#   df.duplicated()  -> True/False for each row
#   keep=False       -> flags every copy, so all members of a duplicate group show up
#   df[mask]         -> keeps only the flagged rows
if duplicates:
    every_copy = df.duplicated(keep=False)
    print(df[every_copy].head())

Num of duplicate rows: 0


In [36]:
# Sanity-check a few key columns for outliers or impossible values.

# age: list every age bracket in index order so an out-of-range bracket would stand out.
age_counts = df['age'].value_counts()
print(age_counts.sort_index())

# time_in_hospital: summary statistics; values should be positive and plausible.
print(df['time_in_hospital'].describe())

# gender and race: frequency of each category, to spot unexpected labels.
for column in ('gender', 'race'):
    print(df[column].value_counts())




age
[0-10)        161
[10-20)       691
[20-30)      1657
[30-40)      3775
[40-50)      9685
[50-60)     17256
[60-70)     22483
[70-80)     26068
[80-90)     17197
[90-100)     2793
Name: count, dtype: int64
count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64
gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64
race
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64


In [37]:
# Only 3 rows carry gender 'Unknown/Invalid' (see the value counts printed above),
# which is negligible relative to the size of the dataset, so those rows are dropped.
#
# .copy() makes the filtered result an independent frame. Later cells assign
# columns on `df`; without the explicit copy pandas may treat that as writing
# into a slice of the unfiltered frame (SettingWithCopyWarning) whenever the
# unfiltered frame is still referenced somewhere.
valid_gender = df['gender'] != 'Unknown/Invalid'
df = df[valid_gender].copy()

In [38]:
# race: 2273 '?' entries is too many rows to discard, so they are kept under
# an explicit 'Unknown' label instead.
df['race'] = df['race'].replace(to_replace='?', value='Unknown')

# Show the race distribution after cleaning.
race_counts = df['race'].value_counts()
print(race_counts)


race
Caucasian          76099
AfricanAmerican    19210
Unknown             2271
Hispanic            2037
Other               1505
Asian                641
Name: count, dtype: int64


In [39]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
CLEANED_CSV = 'diabetic_data_cleaned.csv'
df.to_csv(CLEANED_CSV, index=False)
print("Cleaned data done!")

Cleaned data done!
