In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, roc_curve

## Model Training and Improvement

### Live Demos

In [2]:
EPS = 1e-10

In [3]:
diabetes_data = pd.read_csv("diabetic_data.csv")

In [4]:
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_attributes = diabetes_data.drop(columns="readmitted")
diabetes_target = diabetes_data.readmitted

In [6]:
diabetes_attributes_dummies = pd.get_dummies(diabetes_attributes)

In [7]:
logistic_regression = LogisticRegression()

In [8]:
logistic_regression.fit(diabetes_attributes_dummies, diabetes_target)

In [9]:
logistic_regression.coef_

array([[-3.31701395e-09, -3.86905107e-09, -1.15814534e-15, ...,
        -3.26369841e-16, -1.63318872e-16, -3.02903976e-16],
       [-1.50329991e-10,  4.51003945e-09, -1.47130836e-16, ...,
        -1.14848644e-16, -1.11876233e-16,  7.41720589e-17],
       [ 3.46734394e-09, -6.40988385e-10,  1.30527618e-15, ...,
         4.41218484e-16,  2.75195106e-16,  2.28731917e-16]])

In [10]:
scaler = MinMaxScaler()

In [11]:
diabetes_attributes_scaled = scaler.fit_transform(diabetes_attributes_dummies)

In [12]:
logistic_regression.fit(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
logistic_regression.score(diabetes_attributes_scaled, diabetes_target)

0.5951987893795571

In [14]:
logistic_regression.score(diabetes_attributes_dummies, diabetes_target)



0.5387162706601419

In [15]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()), 
    ("model", LogisticRegression())
])

In [16]:
pipeline

In [17]:
sample_data = diabetes_data.sample(5000, random_state=42)

In [18]:
sample_atributes = sample_data.drop(columns="readmitted")

In [19]:
sample_target = sample_data.readmitted

In [20]:
sample_atributes = pd.get_dummies(sample_atributes)

In [21]:
pipeline.fit(sample_atributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
ohe = OneHotEncoder()

In [23]:
ohe.fit(sample_atributes)

In [24]:
ohe.feature_names_in_

array(['encounter_id', 'patient_nbr', 'admission_type_id', ...,
       'change_No', 'diabetesMed_No', 'diabetesMed_Yes'], dtype=object)

In [25]:
ohe.categories_

[array([   325848,   1139226,   1212006, ..., 443730002, 443775086,
        443824292], dtype=int64),
 array([    10827,     15849,     27315, ..., 186774602, 187042703,
        189502619], dtype=int64),
 array([1, 2, 3, 5, 6, 7, 8], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 13, 14, 15, 17, 18, 22, 23,
        24, 25, 28], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  9, 17, 20], dtype=int64),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       dtype=int64),
 array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  

In [26]:
sample_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [27]:
columns = sample_data.dtypes[sample_data.dtypes == np.object_].index.values

In [28]:
categorial_columns = columns[: -1]

In [29]:
numerical_columns = ['admission_type_id', 'discharge_disposition_id', 'time_in_hospital', 
                     'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses']

In [30]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorial_columns),
    ("numerical", MinMaxScaler(), numerical_columns )
], remainder="passthrough")

In [31]:
preprocessor

In [32]:
log_transformer = FunctionTransformer(np.log10)

In [33]:
number_processor = Pipeline([
    ('log_transformer', FunctionTransformer(lambda x: np.log10(x + EPS))), 
    ('minmax', MinMaxScaler(feature_range=(-5, 5)))
])

In [34]:
number_processor

In [35]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorial_columns),
    ("numerical", number_processor, numerical_columns )
])

In [36]:
preprocessor

In [37]:
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression())
]
)

In [38]:
sample_atributes = sample_data.drop(columns='readmitted')
sample_target = sample_data.readmitted

In [39]:
pipeline.fit(sample_atributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
pipeline.score(sample_atributes, sample_target)

0.631

In [41]:
pickle.dump(logistic_regression, open('prediction_pipline.pkl', 'wb'))

In [42]:
pickle.load(open("prediction_pipline.pkl", "rb"))

In [43]:
pipeline.steps[0][1].feature_names_in_

array(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'payer_code',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
       'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed'], dtype=object)

In [44]:
diabetes_data = pd.read_csv('diabetic_data.csv')

In [45]:
sample_data = diabetes_data.sample(5000, random_state=12341234)
atributes = sample_data.drop(columns=["readmitted"])
target = sample_data.readmitted

In [46]:
pipeline.fit(atributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
pipeline["classifier"].coef_

array([[-0.5010724 ,  0.16844334, -0.21929032, ...,  0.02630075,
         0.0365515 ,  0.07877146],
       [ 0.23919416, -0.1004881 , -0.14741907, ...,  0.01196352,
         0.02201129, -0.00159013],
       [ 0.26187825, -0.06795524,  0.36670939, ..., -0.03826427,
        -0.05856279, -0.07718133]])

In [48]:
pipeline_reg = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(C = 10000))
]
)

In [49]:
pipeline_reg.fit(atributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
pipeline_reg["classifier"].coef_

array([[-0.64316637,  0.22337978, -0.28483914, ...,  0.00947924,
         0.03498402,  0.04648804],
       [ 0.32045079, -0.12696078, -0.18719613, ...,  0.0175543 ,
         0.01826456,  0.01393126],
       [ 0.32271558, -0.096419  ,  0.47203527, ..., -0.02703354,
        -0.05324858, -0.06041931]])

In [51]:
atributes_train, atributes_test, target_train, target_test = train_test_split(
    atributes,
    target, 
    test_size=0.18, 
    random_state=121212, 
    stratify=target
)

In [52]:
atributes_train.shape, atributes_test.shape, target_train.shape, target_test.shape

((4100, 49), (900, 49), (4100,), (900,))

In [53]:
target.value_counts(normalize=True)

readmitted
NO     0.5576
>30    0.3360
<30    0.1064
Name: proportion, dtype: float64

In [54]:
target_train.value_counts(normalize=True)

readmitted
NO     0.557561
>30    0.336098
<30    0.106341
Name: proportion, dtype: float64

In [55]:
target_test.value_counts(normalize=True)

readmitted
NO     0.557778
>30    0.335556
<30    0.106667
Name: proportion, dtype: float64

In [56]:
pipeline.fit(atributes_train, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
pipeline.score(atributes_train, target_train)

0.6587804878048781

In [58]:
pipeline.score(atributes_test, target_test)

ValueError: Found unknown categories ['Rheumatology', 'Dermatology', 'InfectiousDiseases'] in column 5 during transform

In [None]:
print(classification_report(target_train, pipeline.predict(atributes_train)))

In [None]:
roc_curve(pipeline.decision_function())

In [59]:
pipeline["classifier"].C = 1000

In [60]:
pipeline

In [61]:
pipeline.steps

[('preprocess',
  ColumnTransformer(transformers=[('categorical', OneHotEncoder(),
                                   array(['race', 'gender', 'age', 'weight', 'payer_code',
         'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
         'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
         'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
         'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
         'acar...
                                   Pipeline(steps=[('log_transformer',
                                                    FunctionTransformer(func=<function <lambda> at 0x00000180FA82A980>)),
                                                   ('minmax',
                                                    MinMaxScaler(feature_range=(-5,
                                                                                5)))]),
                                   ['admission_type_id',
                                    'dis

In [62]:
GridSearchCV(pipeline, param_grid={
    "classifier__C": [1, 10, 100],
    "preprocess__numerical__minmax_feature_range": [(-1, 1), (0,1), (-5, 5)]
}, cv = 10)

In [64]:
CV = GridSearchCV(pipeline, param_grid = {
    "classifier__C": [1, 10, 100],
    "preprocess__numerical__minmax__feature_range": [(-1, 1), (0, 1), (-5, 5)]
}, cv=2).fit(atributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\VioletaAtanasova\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\VioletaAtanasova\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\VioletaAtanasova\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 718, in score
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^

In [65]:
CV

In [66]:
CV.best_estimator_

In [67]:
CV.best_params_

{'classifier__C': 1, 'preprocess__numerical__minmax__feature_range': (-1, 1)}

In [68]:
CV.cv_results_

{'mean_fit_time': array([0.40936339, 0.29344821, 0.28714097, 0.3100245 , 0.29804611,
        0.31027305, 0.31403255, 0.29901218, 0.31764412]),
 'std_fit_time': array([0.03967369, 0.00948048, 0.00205553, 0.00802398, 0.00695848,
        0.01327574, 0.00611305, 0.00804377, 0.00164604]),
 'mean_score_time': array([0.01251161, 0.01103151, 0.01143765, 0.01099741, 0.0114994 ,
        0.01098871, 0.01301408, 0.01403511, 0.01348472]),
 'std_score_time': array([1.52218342e-03, 7.21216202e-05, 4.38213348e-04, 9.41753387e-06,
        4.98771667e-04, 2.16960907e-05, 1.01733208e-03, 3.03733349e-03,
        1.48224831e-03]),
 'param_classifier__C': masked_array(data=[1, 1, 1, 10, 10, 10, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_preprocess__numerical__minmax__feature_range': masked_array(data=[(-1, 1), (0, 1), (-5, 5), (-1, 1), (0, 1), (-5, 5),
                    