# Mašinsko učenje - projekat

Dataset: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
 
Klasifikacija - Predviđanje ponovne hospitalizacije pacijenta sa dijabetesom na osnovu podataka sa inicijalne hospitalizacije

1.   Priprema podataka

In [70]:
import pandas as pd
import sklearn as scikit
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Location of the raw CSV, relative to the notebook. Forward slashes resolve on
# Windows as well as POSIX systems; the previous '..\dataset\...' literal relied
# on the invalid escape sequence '\d' and only worked on Windows.
path = '../dataset/diabetic_data.csv'

# '?' and 'Unknown/Invalid' are read as missing values, on top of the strings
# pandas treats as NA by default.
# NOTE(review): a later cell maps the literal string 'None' in max_glu_serum and
# A1Cresult; recent pandas releases appear to include 'None' among the default
# NA strings - confirm against the installed pandas version.
dataframe = pd.read_csv(path, low_memory=False, na_values=['?', 'Unknown/Invalid'])

# Show every column when a frame is rendered (the raw file has 50 columns).
pd.set_option('display.max_columns', None)
dataframe.head(3)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,,,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO


In [71]:
# Row count of the freshly loaded frame; a later cell divides the per-column
# missing-value counts by it.
total_rows = len(dataframe)
dataframe.shape

(101766, 50)

In [72]:
# Class distribution of the target column 'readmitted'.
dataframe['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [73]:
# Share of missing values above which a column is discarded.
treshold = 0.2

# Boolean mask of missing cells and the number of missing values per column.
missing = dataframe.isna()
missing_count = missing.sum()

# Positions of the columns whose missing-value ratio exceeds the threshold.
# The ratios are enumerated by position, so no integer key is ever used on the
# label-indexed Series (integer keys on such a Series are deprecated in pandas).
missing_ratio = missing_count / total_rows
to_drop = [position for position, ratio in enumerate(missing_ratio) if ratio > treshold]

# Report which columns are about to be removed.
for position in to_drop:
    print(position, dataframe.columns[position])

dataframe = dataframe.drop(columns=dataframe.columns[to_drop])
dataframe.shape

5 weight
10 payer_code
11 medical_specialty


(101766, 47)

In [74]:
# Two identifier columns and three medication columns that are removed from the
# feature set.
# NOTE(review): presumably the medication columns carry no useful signal -
# confirm against the data.
columns_to_remove = [
    'encounter_id',
    'patient_nbr',
    'examide',
    'citoglipton',
    'metformin-rosiglitazone',
]

# Drop every row that still has a missing value, then drop the columns above.
# The column names are passed explicitly via `columns=` rather than passing a
# DataFrame slice as labels; a missing name still raises KeyError.
dataframe = dataframe.dropna().drop(columns=columns_to_remove)

In [75]:
# Compare the current row count with the count after removing exact duplicate rows.
rows_before_dedup = len(dataframe)
rows_after_dedup = len(dataframe.drop_duplicates())

print("Number Of Rows In The Original DataFrame:", rows_before_dedup)
print("Number Of Rows After Deduping:", rows_after_dedup)

Number Of Rows In The Original DataFrame: 98052
Number Of Rows After Deduping: 98052


In [76]:
# Column dtypes and non-null counts after the row/column cleanup above.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98052 entries, 1 to 101765
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      98052 non-null  object
 1   gender                    98052 non-null  object
 2   age                       98052 non-null  object
 3   admission_type_id         98052 non-null  int64 
 4   discharge_disposition_id  98052 non-null  int64 
 5   admission_source_id       98052 non-null  int64 
 6   time_in_hospital          98052 non-null  int64 
 7   num_lab_procedures        98052 non-null  int64 
 8   num_procedures            98052 non-null  int64 
 9   num_medications           98052 non-null  int64 
 10  number_outpatient         98052 non-null  int64 
 11  number_emergency          98052 non-null  int64 
 12  number_inpatient          98052 non-null  int64 
 13  diag_1                    98052 non-null  object
 14  diag_2               

In [77]:
# Ordinal code for each ten-year age bracket.
age_scale_mapper = {
    '[0-10)': 0,
    '[10-20)': 1,
    '[20-30)': 2,
    '[30-40)': 3,
    '[40-50)': 4,
    '[50-60)': 5,
    '[60-70)': 6,
    '[70-80)': 7,
    '[80-90)': 8,
    '[90-100)': 9,
}

# Two-element encodings for the lab-result columns.
# NOTE(review): looks like [result available, representative value] - confirm.
glu_scale_mapper = {
    'None': [0, 0],
    'Norm': [1, 70],
    '>200': [1, 250],
    '>300': [1, 350],
}
a1_scale_mapper = {
    'None': [0, 0],
    'Norm': [1, 4],
    '>7': [1, 7],
    '>8': [1, 9],
}

# Four-element encoding of a medication status: the first element is 1 for any
# status other than 'No'; the remaining three flag 'Steady', 'Up' and 'Down'.
level_scale_mapper = {
    'No': [0, 0, 0, 0],
    'Steady': [1, 1, 0, 0],
    'Up': [1, 0, 1, 0],
    'Down': [1, 0, 0, 1],
}

# Two-element target encoding: the first element is 1 for both readmission
# categories, the second is 1 only for '>30'.
class_mapper = {
    'NO': [0, 0],
    '<30': [1, 0],
    '>30': [1, 1],
}

one_hot = LabelBinarizer()

# The admission/discharge identifiers are codes, not quantities.
for id_column in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    dataframe[id_column] = dataframe[id_column].astype('category')

# Map the age brackets to their ordinal code. `map` is used instead of `replace`
# because replacing strings with integers relies on implicit downcasting of the
# object column, which pandas has deprecated.
dataframe['age'] = dataframe['age'].map(age_scale_mapper)

# Binary columns become a single 0/1 column each. LabelBinarizer returns an
# (n, 1) array for two classes, so it is flattened before assignment.
# `one_hot` is refitted per column and ends up fitted on 'diabetesMed'.
for binary_column in ('gender', 'change', 'diabetesMed'):
    dataframe[binary_column] = one_hot.fit_transform(dataframe[binary_column]).ravel()

# Medication columns that all share the four-element status encoding.
medication_columns = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-pioglitazone',
]

# Column -> lookup table for every column whose cells are replaced by a list.
list_encoded_columns = {
    'max_glu_serum': glu_scale_mapper,
    'A1Cresult': a1_scale_mapper,
    **dict.fromkeys(medication_columns, level_scale_mapper),
    'readmitted': class_mapper,
}

# Direct dictionary lookup: a value missing from its table raises KeyError
# instead of silently producing NaN.
for column, mapper in list_encoded_columns.items():
    dataframe[column] = dataframe[column].apply(mapper.__getitem__)




In [78]:
# Re-inspect column dtypes and non-null counts after the encoding step.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98052 entries, 1 to 101765
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      98052 non-null  object
 1   gender                    98052 non-null  int32 
 2   age                       98052 non-null  int64 
 3   admission_type_id         98052 non-null  int64 
 4   discharge_disposition_id  98052 non-null  int32 
 5   admission_source_id       98052 non-null  int32 
 6   time_in_hospital          98052 non-null  int64 
 7   num_lab_procedures        98052 non-null  int64 
 8   num_procedures            98052 non-null  int64 
 9   num_medications           98052 non-null  int64 
 10  number_outpatient         98052 non-null  int64 
 11  number_emergency          98052 non-null  int64 
 12  number_inpatient          98052 non-null  int64 
 13  diag_1                    98052 non-null  object
 14  diag_2               

In [79]:
# Print the (rows, columns) shape, then render the first five rows of the
# encoded frame.
row_count, column_count = dataframe.shape
print((row_count, column_count))
dataframe.head(5)

(98052, 42)


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,0,1,1,1,0,3,59,0,18,0,0,0,276,250.01,255,9,"[0, 0]","[0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]",0,1,"[1, 1]"
2,AfricanAmerican,0,2,1,1,0,2,11,5,13,2,0,1,648,250.0,V27,6,"[0, 0]","[0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]",1,1,"[0, 0]"
3,Caucasian,1,3,1,1,0,2,44,1,16,0,0,0,8,250.43,403,7,"[0, 0]","[0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 0, 1, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]",0,1,"[0, 0]"
4,Caucasian,1,4,1,1,0,1,51,0,8,0,0,0,197,157.0,250,5,"[0, 0]","[0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]",0,1,"[0, 0]"
5,Caucasian,1,5,2,1,0,3,31,6,16,0,0,0,414,411.0,250,9,"[0, 0]","[0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[1, 1, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0, 0]",1,1,"[1, 1]"


In [80]:
# Scratch check of LabelBinarizer on a small integer array; `lb` stays fitted
# on `test`. Intermediate results were never displayed, so only the fitting
# side effect is kept.
lb = LabelBinarizer()
test = np.array([1, 1, 2, 45, 6, 3, 2, 5, 2, 1, 2])
lb.fit(test)

# Refit the shared `one_hot` binarizer on 'admission_type_id' and display the
# resulting indicator matrix (the frame itself is not modified).
one_hot.fit_transform(dataframe['admission_type_id'])

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

2.   Deskriptivna analiza