##### imports

In [23]:
%matplotlib inline
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Model Training
## Classification

##### read data

In [2]:
# Load the raw encounter table from disk; the bare name on the last line makes
# the notebook render a (pandas-truncated) preview of the frame.
# NOTE(review): the preview shows '?' in the weight column — presumably a
# missing-value placeholder; confirm before modelling.
diabetes_data = pd.read_csv(filepath_or_buffer='data/diabetic_data.csv')
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


##### check all dtypes

In [3]:
# List the dtype pandas inferred for every column (int64 = numeric, object = string-like).
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

##### separate attributes and targets from the dataset

In [4]:
# Split the frame into the prediction target ('readmitted') and the remaining
# attribute columns; `drop` returns a new frame, so `diabetes_data` is untouched.
# NOTE(review): encounter_id and patient_nbr stay in the attribute set and are
# later treated as numeric features — confirm that is intended.
diabetes_data_trg = diabetes_data['readmitted']
diabetes_data_attr = diabetes_data.drop(columns=['readmitted'])

##### one-hot encode the categorical attributes (pandas.get_dummies)

In [5]:
# One-hot encode the attribute frame. The result is bound back to the same
# name, so from here on `diabetes_data_attr` refers to the encoded frame.
diabetes_data_attr = pd.get_dummies(data=diabetes_data_attr)

##### check the new shape of the data

In [6]:
# (rows, columns) of the encoded attribute frame — shows how many columns the encoding produced.
diabetes_data_attr.shape

(101766, 2472)

##### scaling the attributes

In [7]:
# Rescale every attribute column with a MinMaxScaler (default feature range).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the test rows influence the scaling parameters.
scalar = MinMaxScaler()
scalar.fit(diabetes_data_attr)
diabetes_data_attr_scaled = scalar.transform(diabetes_data_attr)

##### check the scaling is ok -> min must be 0 and max must be 1

In [8]:
# Per-column minimum and maximum of the scaled matrix (axis=0 reduces over rows).
# NOTE(review): the notebook repr elides most entries with '...', so this is a
# spot check rather than proof that every column spans [0, 1].
diabetes_data_attr_scaled.min(axis=0), diabetes_data_attr_scaled.max(axis=0)

(array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., ..., 1., 1., 1.]))

##### create logistic regression model with C=10e9 (= 1e10, i.e. almost no regularization)

In [9]:
# Baseline fit on the whole scaled matrix. C=10e9 equals 1e10; in scikit-learn C
# is the inverse regularization strength, so the penalty is practically disabled.
# `fit` is left as the last expression so the fitted estimator's repr is shown.
logr_model = LogisticRegression(
    C=10e9,
    solver="liblinear",
    max_iter=10000,
)
logr_model.fit(diabetes_data_attr_scaled, diabetes_data_trg)

LogisticRegression(C=10000000000.0, max_iter=10000, solver='liblinear')

##### check the score

In [10]:
# Mean accuracy on the same rows the model was fitted on (no held-out data at this point).
logr_model.score(diabetes_data_attr_scaled, diabetes_data_trg)

0.6026669025018179

##### create logistic regression model with C=1

In [11]:
# Refit with C=1 and rebind `logr_model`, replacing the previous estimator.
# `fit` is left as the last expression so the fitted estimator's repr is shown.
logr_model = LogisticRegression(
    C=1,
    solver="liblinear",
    max_iter=10000,
)
logr_model.fit(diabetes_data_attr_scaled, diabetes_data_trg)

LogisticRegression(C=1, max_iter=10000, solver='liblinear')

##### check the score

In [12]:
# Mean accuracy of the C=1 model on the rows it was fitted on.
logr_model.score(diabetes_data_attr_scaled, diabetes_data_trg)

0.6000039305858538

##### create logistic regression model with C=0.001

In [13]:
# Refit with a much smaller C (0.001), i.e. a much stronger penalty, and rebind `logr_model`.
# NOTE(review): the saved output under this cell reports C=0.1, so it predates
# this code — rerun the cell to refresh the repr and the score that follows.
logr_model = LogisticRegression(
    C=0.001,
    solver="liblinear",
    max_iter=10000,
)
logr_model.fit(diabetes_data_attr_scaled, diabetes_data_trg)

LogisticRegression(C=0.1, max_iter=10000, solver='liblinear')

##### check the score

In [14]:
# Mean accuracy of the most recently fitted `logr_model` on the rows it was fitted on.
logr_model.score(diabetes_data_attr_scaled, diabetes_data_trg)

0.594540416249042

##### split the data into train and test sets with test_size=0.1 (90% of the data for training, 10% for testing)

In [15]:
# Stratified hold-out split: test_size=0.1 reserves 10% of the rows for testing,
# random_state pins the shuffle, and stratify keeps the target's class mix in
# both parts.
(diab_attr_train, diab_attr_test,
 diab_trg_train, diab_trg_test) = train_test_split(
    diabetes_data_attr_scaled,
    diabetes_data_trg,
    test_size=0.1,
    random_state=30,
    stratify=diabetes_data_trg,
)

##### check the split data shape

In [16]:
# Shapes of the train and test feature matrices (rows, columns).
diab_attr_train.shape, diab_attr_test.shape

((91589, 2472), (10177, 2472))

In [17]:
# Shapes of the matching train and test target vectors; row counts should agree with the features.
diab_trg_train.shape, diab_trg_test.shape

((91589,), (10177,))

##### train the model on the training split (the data the model is allowed to see)

In [18]:
# Fit a separate estimator (C=5) on the training split only, so the test split
# stays unseen for the evaluation cells that follow.
logr_model_tt = LogisticRegression(
    C=5,
    solver="liblinear",
    max_iter=10000,
)
logr_model_tt.fit(diab_attr_train, diab_trg_train)

LogisticRegression(C=5, max_iter=10000, solver='liblinear')

##### check the score on the training split

In [19]:
# Mean accuracy of the split-trained model on its own training rows.
logr_model_tt.score(diab_attr_train, diab_trg_train)

0.6040245007588247

##### check the score on the test split (data the model has not seen)

In [20]:
# Mean accuracy of the split-trained model on the held-out test rows.
logr_model_tt.score(diab_attr_test, diab_trg_test)

0.5785594969047853

##### Let's compare the predicted values with the original values (classification report)

In [21]:
# Predicted 'readmitted' labels for the held-out test rows.
diabetes_predicted = logr_model_tt.predict(diab_attr_test)

In [22]:
# Per-class precision / recall / F1 on the test split. The report is a
# pre-formatted multi-line string, so it is printed rather than left as a
# bare expression.
print(
    classification_report(
        y_true=diab_trg_test,
        y_pred=diabetes_predicted,
    )
)

              precision    recall  f1-score   support

         <30       0.33      0.02      0.03      1136
         >30       0.49      0.36      0.42      3555
          NO       0.61      0.84      0.71      5486

    accuracy                           0.58     10177
   macro avg       0.48      0.40      0.38     10177
weighted avg       0.54      0.58      0.53     10177



In [24]:
# Search space handed to GridSearchCV as `param_grid`.
# Every value must be a sequence of candidate settings — even a setting that is
# held fixed — so max_iter is wrapped in a one-element list. A bare int is
# rejected by scikit-learn's parameter-grid validation.
params = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'fit_intercept': [True, False],
    'max_iter': [10000],
}

In [None]:
# Exhaustive search over every combination in `params`, fitted on the training
# split only and using GridSearchCV's default cross-validation and scoring.
# NOTE(review): the estimator here uses LogisticRegression's default solver,
# whereas every other model in this notebook passes solver="liblinear" —
# confirm which one the search is meant to tune.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
)
grid_search.fit(diab_attr_train, diab_trg_train)