In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [5]:
df = pd.read_csv('../diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
df.shape

(101766, 50)

In [7]:
df.drop('encounter_id', axis=1, inplace=True)

In [8]:
df.isnull().sum()

patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [9]:
# replacing '?' wiht NaN
df.replace(regex=r'\?', value=np.nan, inplace=True)

In [10]:
df.isnull().sum()

patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide     

In [11]:
# response variable 'readmitted', <30 -> 0, >30 1
df['readmitted'] = df['readmitted'].replace({'<30': 1, '>30': 0, 'NO': 0})

  df['readmitted'] = df['readmitted'].replace({'<30': 1, '>30': 0, 'NO': 0})


In [12]:
df.dtypes

patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide       

In [13]:
missing_values = df.isnull().sum()

# Create a DataFrame with only the columns containing missing values
missing_data_table = pd.DataFrame({'Column Name': df.columns,
                                   'Missing Values': missing_values[df.columns],
                                   'Missing Percentage': (missing_values[df.columns] / len(df)) * 100})

# Display the missing data table
print(missing_data_table.to_string(index=False))

             Column Name  Missing Values  Missing Percentage
             patient_nbr               0            0.000000
                    race            2273            2.233555
                  gender               0            0.000000
                     age               0            0.000000
                  weight           98569           96.858479
       admission_type_id               0            0.000000
discharge_disposition_id               0            0.000000
     admission_source_id               0            0.000000
        time_in_hospital               0            0.000000
              payer_code           40256           39.557416
       medical_specialty           49949           49.082208
      num_lab_procedures               0            0.000000
          num_procedures               0            0.000000
         num_medications               0            0.000000
       number_outpatient               0            0.000000
        number_emergency

In [14]:
# dropping columns with large number of missing values
df = df.drop(['weight'], axis=1)

In [15]:
# Some columns have no variations. The variables'examide'and'citoglipton'have only one value.
# These columns are not useful in prediction and can be deleted. Delete the following near zero-variance colums:
# no variations in cols. these cols are not useful in prediction and can be deleted
cols_to_delete = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'tolbutamide',
                  'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'glyburide-metformin',
                  'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone',
                  'metformin-pioglitazone']
# Drop multiple columns
df.drop(columns=cols_to_delete, inplace=True)
df.columns

Index(['patient_nbr', 'race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin',
       'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [16]:
df.dropna(inplace=True)

In [17]:
df.shape

(0, 31)

In [18]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_nbr,0.0,,,,,,,
admission_type_id,0.0,,,,,,,
discharge_disposition_id,0.0,,,,,,,
admission_source_id,0.0,,,,,,,
time_in_hospital,0.0,,,,,,,
num_lab_procedures,0.0,,,,,,,
num_procedures,0.0,,,,,,,
num_medications,0.0,,,,,,,
number_outpatient,0.0,,,,,,,
number_emergency,0.0,,,,,,,


In [19]:
# identify outliers using IQR
def identify_outliers(column):
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = column[(column < lower_bound) | (column > upper_bound)]
    return outliers


cols_to_remove_outliers = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
                           'number_outpatient', 'number_inpatient', 'number_emergency', 'number_diagnoses']

# Iterate over each numerical column and identify outliers
for column in cols_to_remove_outliers:
    outliers = identify_outliers(df[column])
    print(f"Outliers in {column}:")
    print(outliers)

Outliers in time_in_hospital:
Series([], Name: time_in_hospital, dtype: int64)
Outliers in num_lab_procedures:
Series([], Name: num_lab_procedures, dtype: int64)
Outliers in num_procedures:
Series([], Name: num_procedures, dtype: int64)
Outliers in num_medications:
Series([], Name: num_medications, dtype: int64)
Outliers in number_outpatient:
Series([], Name: number_outpatient, dtype: int64)
Outliers in number_inpatient:
Series([], Name: number_inpatient, dtype: int64)
Outliers in number_emergency:
Series([], Name: number_emergency, dtype: int64)
Outliers in number_diagnoses:
Series([], Name: number_diagnoses, dtype: int64)


In [20]:
# identify and remove outliers using IQR
def remove_outliers(df):
    # Iterate over each numerical column and identify outliers
    for column in cols_to_remove_outliers:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Filter the DataFrame to exclude outliers
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


# Remove outliers from the DataFrame
df = remove_outliers(df)

In [21]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_nbr,0.0,,,,,,,
admission_type_id,0.0,,,,,,,
discharge_disposition_id,0.0,,,,,,,
admission_source_id,0.0,,,,,,,
time_in_hospital,0.0,,,,,,,
num_lab_procedures,0.0,,,,,,,
num_procedures,0.0,,,,,,,
num_medications,0.0,,,,,,,
number_outpatient,0.0,,,,,,,
number_emergency,0.0,,,,,,,


In [22]:
df.shape

(0, 31)

In [23]:
# unique patients
len(df.patient_nbr.unique())

0

In [24]:
# remove duplicates
df.drop_duplicates(['patient_nbr'], keep='first', inplace=True)
len(df)

0

In [25]:
# convert cat age to num
replace_dict = {'[0-10)': 5,
                '[10-20)': 15,
                '[20-30)': 25,
                '[30-40)': 35,
                '[40-50)': 45,
                '[50-60)': 55,
                '[60-70)': 65,
                '[70-80)': 75,
                '[80-90)': 85,
                '[90-100)': 95}

df['age'] = df['age'].apply(lambda x: replace_dict[x])

In [26]:
cleaned_df = df

# Define the file path where you want to save the CSV file
file_path = 'cleaned_data.csv'

# Save the DataFrame as a CSV file
cleaned_df.to_csv(file_path, index=False)