# Mašinsko učenje - projekat

Dataset: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
 
Klasifikacija - Predviđanje ponovne hospitalizacije pacijenta sa dijabetesom na osnovu podataka sa inicijalne hospitalizacije

1.   Priprema podataka

In [1]:
import pandas as pd
import sklearn as scikit
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [2]:
# Load the raw encounter table (path is relative to the notebook location).
path = "../dataset/diabetic_data.csv"
# "?" and "Unknown/Invalid" are the missing-value markers used for this file.
# pandas' built-in NA list is switched off on purpose: pandas >= 2.0 adds the
# literal string "None" to it, which would turn the "None" category of
# `max_glu_serum` / `A1Cresult` (mapped to numbers in a later cell) into NaN.
# "" is listed explicitly so that empty fields are still read as missing.
dataframe = pd.read_csv(
    path,
    low_memory=False,
    na_values=["?", "Unknown/Invalid", ""],
    keep_default_na=False,
)
# Show every column when a frame is displayed instead of eliding the middle.
pd.set_option("display.max_columns", None)
dataframe.head(3)


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,,,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO


In [3]:
# Row count of the freshly loaded frame; reused below as the denominator for
# the per-column missing-value ratio.
total_rows = len(dataframe)
dataframe.shape


(101766, 50)

In [4]:
# Count how often each value of the `readmitted` column occurs.
dataframe["readmitted"].value_counts()


NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [5]:
# Drop every column whose share of missing values exceeds the threshold.
missing = dataframe.isna()
missing_count = missing.sum()  # missing values per column, indexed by column name
# NOTE: the misspelled name is kept because it is a notebook-level global that
# later cells may read.
treshold = 0.2
# Integer positions of the columns to remove. Iterating with enumerate avoids
# `missing_count[i]`, i.e. positional lookup on a label-indexed Series, which
# pandas has deprecated.
to_drop = [
    position
    for position, count in enumerate(missing_count)
    if count / total_rows > treshold
]
for position in to_drop:
    print(position, dataframe.columns[position])
dataframe = dataframe.drop(dataframe.columns[to_drop], axis=1)
dataframe.shape


5 weight
10 payer_code
11 medical_specialty


(101766, 47)

In [6]:
# Remove every row that still contains a missing value in any column.
dataframe.dropna(inplace=True)
# Remove the listed columns by label. The labels are passed directly instead
# of a DataFrame slice: the slice only worked because iterating a DataFrame
# yields its column names, and it copied five columns just to name them.
# NOTE(review): the first two look like identifiers; why the three medication
# columns are removed is not shown here (presumably near-constant) - confirm.
dataframe.drop(
    columns=[
        "encounter_id",
        "patient_nbr",
        "examide",
        "citoglipton",
        "metformin-rosiglitazone",
    ],
    inplace=True,
)


In [7]:
# Compare the row count with and without exact duplicate rows.
rows_before = dataframe.shape[0]
rows_after = dataframe.drop_duplicates().shape[0]
print("Number Of Rows In The Original DataFrame:", rows_before)
print("Number Of Rows After Deduping:", rows_after)


Number Of Rows In The Original DataFrame: 98052
Number Of Rows After Deduping: 98052


In [8]:
# Print the per-column non-null counts and dtypes of the cleaned frame.
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 98052 entries, 1 to 101765
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      98052 non-null  object
 1   gender                    98052 non-null  object
 2   age                       98052 non-null  object
 3   admission_type_id         98052 non-null  int64 
 4   discharge_disposition_id  98052 non-null  int64 
 5   admission_source_id       98052 non-null  int64 
 6   time_in_hospital          98052 non-null  int64 
 7   num_lab_procedures        98052 non-null  int64 
 8   num_procedures            98052 non-null  int64 
 9   num_medications           98052 non-null  int64 
 10  number_outpatient         98052 non-null  int64 
 11  number_emergency          98052 non-null  int64 
 12  number_inpatient          98052 non-null  int64 
 13  diag_1                    98052 non-null  object
 14  diag_2               

Diagnosis mapping (ICD-9-CM code ranges used to group the diagnosis codes below): http://icd9cm.chrisendres.com/index.php?action=contents

In [9]:
def diag_mapper(value: str) -> int:
    """Map a raw diagnosis code to a coarse group index.

    Numeric codes are truncated to their integer part (``"250.83"`` -> 250)
    and assigned to one of 17 consecutive ranges, numbered 0 to 16. Codes
    that cannot be parsed as a number are split by their first character.

    Args:
        value: Diagnosis code as read from the dataset, e.g. ``"276.0"``,
            ``"V27"`` or an ``E``-prefixed code.

    Returns:
        0-16 for numeric codes (16 covers everything above 799), 17 for codes
        starting with ``"V"``, and 18 for any other non-numeric code.
    """
    # Inclusive upper bound of each numeric range; the position in the tuple
    # is the group index returned for codes up to that bound.
    range_upper_bounds = (
        139, 239, 279, 289, 319, 389, 459, 519,
        579, 629, 679, 709, 739, 759, 779, 799,
    )
    try:
        code = int(float(value))
    except ValueError:
        # Non-numeric code: "V..." has its own group; everything else is
        # treated as an "E..." code.
        return 17 if value[0] == 'V' else 18
    for group, upper_bound in enumerate(range_upper_bounds):
        if code <= upper_bound:
            return group
    # Above the last bound.
    return 16


# Replace the raw code in each of the three diagnosis columns with its group
# index from diag_mapper.
for diag_column in ('diag_1', 'diag_2', 'diag_3'):
    dataframe[diag_column] = dataframe[diag_column].apply(diag_mapper)


In [10]:
# --- Lookup tables: raw label -> numeric code --------------------------------

# Ten-year age brackets mapped to their ordinal position.
age_scale_mapper = {
    "[0-10)": 0,
    "[10-20)": 1,
    "[20-30)": 2,
    "[30-40)": 3,
    "[40-50)": 4,
    "[50-60)": 5,
    "[60-70)": 6,
    "[70-80)": 7,
    "[80-90)": 8,
    "[90-100)": 9,
}
# "None" and "Norm" share one value.
# NOTE(review): the numbers look like representative readings per bucket -
# confirm the intended scale.
glu_scale_mapper = {"None": 70, "Norm": 70, ">200": 250, ">300": 350}
a1_scale_mapper = {"None": 4, "Norm": 4, ">7": 7, ">8": 9}
# Medication status codes; note that "Down" is coded higher than "Up".
level_scale_mapper = {"No": 0, "Steady": 1, "Up": 2, "Down": 3}
# Codes for the `readmitted` column.
class_mapper = {"NO": 0, "<30": 1, ">30": 2}
race_mapper = {
    "Other": 0,
    "Caucasian": 1,
    "AfricanAmerican": 2,
    "Hispanic": 3,
    "Asian": 4,
}

# Shared binarizer; it is re-fitted for every column it is applied to below.
one_hot = LabelBinarizer()

# --- Integer id columns are marked as categorical ----------------------------
for id_column in (
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
):
    dataframe[id_column] = dataframe[id_column].astype("category")

# --- Dictionary-encoded columns ----------------------------------------------
# Every medication column uses the same status codes.
medication_columns = (
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-pioglitazone",
)
column_mappers = {
    "race": race_mapper,
    "age": age_scale_mapper,
    "max_glu_serum": glu_scale_mapper,
    "A1Cresult": a1_scale_mapper,
    "readmitted": class_mapper,
}
column_mappers.update(dict.fromkeys(medication_columns, level_scale_mapper))
for mapped_column, mapper in column_mappers.items():
    dataframe[mapped_column] = dataframe[mapped_column].replace(mapper)

# --- Two-valued columns -> 0/1 -----------------------------------------------
# `diabetesMed` stays last so `one_hot` ends up fitted on it, as before.
for binary_column in ("gender", "change", "diabetesMed"):
    dataframe[binary_column] = one_hot.fit_transform(dataframe[binary_column])


In [11]:
# Print the per-column non-null counts and dtypes again, now that the columns
# have been encoded.
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 98052 entries, 1 to 101765
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   race                      98052 non-null  int64   
 1   gender                    98052 non-null  int64   
 2   age                       98052 non-null  int64   
 3   admission_type_id         98052 non-null  category
 4   discharge_disposition_id  98052 non-null  category
 5   admission_source_id       98052 non-null  category
 6   time_in_hospital          98052 non-null  int64   
 7   num_lab_procedures        98052 non-null  int64   
 8   num_procedures            98052 non-null  int64   
 9   num_medications           98052 non-null  int64   
 10  number_outpatient         98052 non-null  int64   
 11  number_emergency          98052 non-null  int64   
 12  number_inpatient          98052 non-null  int64   
 13  diag_1                    98052 non-null  int

In [12]:
# Print the frame's dimensions, then show its first ten rows.
print(dataframe.shape)
dataframe.head(10)


(98052, 42)


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,1,0,1,1,1,7,3,59,0,18,0,0,0,2,2,2,9,70,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,2
2,2,0,2,1,1,7,2,11,5,13,2,0,1,10,2,17,6,70,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,1,1,3,1,1,7,2,44,1,16,0,0,0,0,2,6,7,70,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0
4,1,1,4,1,1,7,1,51,0,8,0,0,0,1,1,2,5,70,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
5,1,1,5,2,1,2,3,31,6,16,0,0,0,6,6,2,9,70,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,2
6,1,1,6,3,1,2,4,70,1,21,0,0,0,6,6,17,7,70,4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
7,1,1,7,1,1,7,5,73,0,12,0,0,0,6,7,2,8,70,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2
8,1,0,8,2,1,4,13,68,2,28,0,0,0,6,6,0,8,70,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
9,1,0,9,3,3,4,12,33,3,18,0,0,0,6,1,7,8,70,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
10,2,0,4,1,1,7,9,47,2,17,0,0,0,2,6,16,9,70,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,2


In [13]:
# Scratch cell: try LabelBinarizer on a small toy array, then on a real column.
lb = LabelBinarizer()

test = np.array([1, 1, 2, 45, 6, 3, 2, 5, 2, 1, 2])
test  # bare expression; its value is not used
lb.fit_transform(test)  # fits `lb` on the toy array; the returned matrix is discarded
dataframe["admission_type_id"].value_counts()  # result is discarded
# NOTE(review): this re-fits the shared `one_hot` binarizer on
# `admission_type_id`, and the encoded matrix is only evaluated here, never
# assigned back to `dataframe`, so the column itself is left unchanged.
one_hot.fit_transform(dataframe["admission_type_id"])


array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

2.   Deskriptivna analiza