In [1]:
%matplotlib inline

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, roc_curve

In [3]:
eps_c = 1e-10

In [4]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [5]:
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [6]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [7]:
diabetes_data.columns[diabetes_data.columns.str.contains("id")]

Index(['encounter_id', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'tolazamide', 'examide', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone'],
      dtype='object')

In [8]:
diabetes_data.pioglitazone.unique()

array(['No', 'Steady', 'Up', 'Down'], dtype=object)

In [9]:
pd.get_dummies(diabetes_data)

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
0,2278392,8222157,6,25,1,1,41,0,1,0,...,False,True,False,False,True,True,False,False,False,True
1,149190,55629189,1,1,7,3,59,0,18,0,...,False,True,False,True,False,False,True,False,True,False
2,64410,86047875,1,1,7,2,11,5,13,2,...,False,True,False,False,True,False,True,False,False,True
3,500364,82442376,1,1,7,2,44,1,16,0,...,False,True,False,True,False,False,True,False,False,True
4,16680,42519267,1,1,7,1,51,0,8,0,...,False,True,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,1,3,7,3,51,0,16,0,...,False,True,False,True,False,False,True,False,True,False
101762,443847782,74694222,1,4,5,5,33,3,18,0,...,False,True,False,False,True,False,True,False,False,True
101763,443854148,41088789,1,1,7,1,53,0,9,1,...,False,True,False,True,False,False,True,False,False,True
101764,443857166,31693671,2,3,7,10,45,2,21,0,...,False,True,False,True,False,False,True,False,False,True


In [10]:
diabetes_attributes = diabetes_data.drop(columns="readmitted")
diabetes_target = diabetes_data.readmitted
diabetes_attributes_dummies = pd.get_dummies(diabetes_attributes)

In [11]:
logistinc_reg = LogisticRegression()

In [12]:
logistinc_reg.fit(diabetes_attributes_dummies, diabetes_target)

In [13]:
logistinc_reg.coef_

array([[-3.31701395e-09, -3.86905107e-09, -1.15814534e-15, ...,
        -3.26369841e-16, -1.63318872e-16, -3.02903976e-16],
       [-1.50329991e-10,  4.51003945e-09, -1.47130836e-16, ...,
        -1.14848644e-16, -1.11876233e-16,  7.41720589e-17],
       [ 3.46734394e-09, -6.40988385e-10,  1.30527618e-15, ...,
         4.41218484e-16,  2.75195106e-16,  2.28731917e-16]])

In [14]:
scaler = MinMaxScaler()

In [15]:
diabetes_attributes_scale = scaler.fit_transform(diabetes_attributes_dummies)

In [16]:
logistinc_reg.fit(diabetes_attributes_scale, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
logistinc_reg.score(diabetes_attributes_scale, diabetes_target)

0.5951889629149225

In [18]:
logistinc_reg.score(diabetes_attributes_dummies, diabetes_target)



0.5387260971247765

In [19]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", LogisticRegression())
])

In [20]:
pipeline

In [21]:
sample_data = diabetes_data.sample(5000, random_state=42)

In [22]:
sample_attributes = pd.get_dummies(sample_data.drop(columns = "readmitted"))

In [23]:
pipeline.fit(pd.get_dummies(sample_data.drop(columns = "readmitted")),sample_data.readmitted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
ohe = OneHotEncoder()

In [25]:
ohe.fit(sample_attributes)

In [26]:
ohe.categories_

[array([   325848,   1139226,   1212006, ..., 443730002, 443775086,
        443824292], dtype=int64),
 array([    10827,     15849,     27315, ..., 186774602, 187042703,
        189502619], dtype=int64),
 array([1, 2, 3, 5, 6, 7, 8], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 13, 14, 15, 17, 18, 22, 23,
        24, 25, 28], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  9, 17, 20], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       dtype=int64),
 array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  

In [27]:
sample_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [28]:
columns = sample_data.dtypes[sample_data.dtypes == object].index.values

In [29]:
num_columns =  sample_data.dtypes[sample_data.dtypes != object].index.values

In [30]:
categorical_columns = columns[:-1]

In [31]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder() , categorical_columns),
    ("numerical", MinMaxScaler(), num_columns)    
], remainder = "drop")

In [32]:
preprocessor

In [33]:
log_trasnform = FunctionTransformer(lambda x: np.log10(x + eps_c))

In [34]:
log_processor = Pipeline([
    ("log_transformer", log_trasnform),
    ("minmax", MinMaxScaler())
])

In [35]:
log_processor

In [36]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder() , categorical_columns),
    ("numerical", log_processor, num_columns)    
], remainder = "drop")

In [37]:
preprocessor

In [38]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

In [39]:
pipeline

In [40]:
pipeline.fit(sample_data.drop(columns="readmitted"), sample_data.readmitted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
pipeline.score(sample_data.drop(columns="readmitted"), sample_data.readmitted)

0.659

In [None]:
pickle.dump(pipeline,open("prediction_pipeline.pkl","wb")) # primer

In [43]:
pipeline.steps[0][1]

In [44]:
sample_new_data = diabetes_data.sample(5000, random_state = 4233)
attributes = sample_new_data.drop(columns = ["readmitted"])
target = sample_new_data.readmitted

In [45]:
pipeline.fit(attributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
pipeline["classifier"].coef_

array([[-0.23725867,  0.10249175,  0.6170486 , ...,  0.25531203,
         0.42348301, -0.04422832],
       [-0.02531376,  0.01378539, -0.35990732, ...,  0.17543351,
         0.05730988,  0.06199109],
       [ 0.26257243, -0.11627715, -0.25714128, ..., -0.43074554,
        -0.48079288, -0.01776277]])

In [50]:
pipeline_reg = Pipeline([
     ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(C=0.000001))
])

In [51]:
pipeline_reg.fit(attributes, target)

In [53]:
pipeline_reg["classifier"].coef_

array([[-5.53805291e-06,  6.97906333e-06,  1.99516732e-06, ...,
         2.57504394e-05,  8.35543540e-05,  1.08311104e-05],
       [-1.15552621e-05, -7.96148043e-06, -4.13085988e-06, ...,
         5.84117762e-05,  1.16988376e-04,  2.11123870e-05],
       [ 1.70933150e-05,  9.82417102e-07,  2.13569256e-06, ...,
        -8.41622155e-05, -2.00542730e-04, -3.19434974e-05]])

In [63]:
attribute_train, attribute_test, target_train, target_test = train_test_split(attributes, target, test_size= 0.15, stratify=target)

In [64]:
attribute_train.shape, attribute_test.shape, target_train.shape, target_test.shape

((4250, 49), (750, 49), (4250,), (750,))

In [65]:
target.value_counts(normalize=True)

readmitted
NO     0.5490
>30    0.3456
<30    0.1054
Name: proportion, dtype: float64

In [66]:
target_train.value_counts(normalize=True)

readmitted
NO     0.548941
>30    0.345647
<30    0.105412
Name: proportion, dtype: float64

In [67]:
target_test.value_counts(normalize=True)

readmitted
NO     0.549333
>30    0.345333
<30    0.105333
Name: proportion, dtype: float64

In [68]:
pipeline.fit(attribute_train, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
pipeline.score(attribute_train, target_train)

0.6931764705882353

In [72]:
pipeline.score(attribute_test, target_test)

ValueError: Found unknown categories ['327', '456', '727', '250.31', '991', '945', '526'] in column 6 during transform

In [75]:
print(classification_report(target_train, pipeline.predict(attribute_train)))

              precision    recall  f1-score   support

         <30       0.82      0.15      0.26       448
         >30       0.66      0.57      0.61      1469
          NO       0.71      0.87      0.78      2333

    accuracy                           0.69      4250
   macro avg       0.73      0.53      0.55      4250
weighted avg       0.70      0.69      0.67      4250



In [77]:
roc_curve(pipeline.decision_function)

TypeError: roc_curve() missing 1 required positional argument: 'y_score'

In [82]:
pipeline.steps[0]

('preprocessor',
 ColumnTransformer(transformers=[('categorical', OneHotEncoder(),
                                  array(['race', 'gender', 'age', 'weight', 'payer_code',
        'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
        'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
        'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
        'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
        'acar...
                                                   FunctionTransformer(func=<function <lambda> at 0x000001CFD91BE700>)),
                                                  ('minmax', MinMaxScaler())]),
                                  array(['encounter_id', 'patient_nbr', 'admission_type_id',
        'discharge_disposition_id', 'admission_source_id',
        'time_in_hospital', 'num_lab_procedures', 'num_procedures',
        'num_medications', 'number_outpatient', 'number_emergency',
        'number_inpatient', 'number_diagn

In [83]:
GridSearchCV(pipeline, param_grid={
    "classifier__C": [1,10,1090],
    "preprocessor__numerical_minmax__feature_range": [(-5, 5), (-1,1)]
}, cv=10)

In [85]:
cv = GridSearchCV(pipeline, param_grid={
    "classifier__C": [1,10,1090],
    "preprocessor__numerical__minmax__feature_range": [(-5, 5), (-1,1)]
}, cv=10).fit(attributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\nushi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nushi\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nushi\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 718, in score
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nushi\an