In [1]:
%config Completer.use_jedi = False

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import sklearn.externals
import joblib

In [4]:
df = pd.read_csv('feature_store/feature_data_m1.csv')
print(f"shape={df.shape}")
df.head()

shape=(686, 10)


Unnamed: 0,PatientID,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,PregnanciesCount,Age,Outcome
0,1017,101,58,17,265,24.2,0.614,2.0,23,0
1,1031,108,70,0,0,30.5,0.955,8.0,33,1
2,1033,148,60,27,318,30.9,0.15,4.0,29,1
3,1035,113,76,0,0,33.3,0.278,0.0,23,1
4,1048,83,86,19,0,29.3,0.317,4.0,34,0


In [8]:
X = df.drop(columns=["Outcome"])
y = df.Outcome

In [9]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.20)

In [11]:
drop_columns = ['PatientID']
select_columns = ['Glucose', 'BMI', 'PregnanciesCount', 'Age']

In [12]:
column_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', drop_columns),
                                                    ('scale_features', StandardScaler(), select_columns)])

In [13]:
lr_model = LogisticRegression()

In [14]:
pipeline = Pipeline(steps=[
    ('pre', column_transformer),
    ('model', lr_model)
])

In [15]:
pipeline.fit(train_X, train_y)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('drop_columns', 'drop',
                                                  ['PatientID']),
                                                 ('scale_features',
                                                  StandardScaler(),
                                                  ['Glucose', 'BMI',
                                                   'PregnanciesCount',
                                                   'Age'])])),
                ('model', LogisticRegression())])

In [16]:
prediction = pipeline.predict(train_X)

In [18]:
accuracy_score(prediction, train_y)

0.7682481751824818

In [19]:
confusion_matrix(prediction, train_y)

array([[308,  81],
       [ 46, 113]])

In [21]:
print(classification_report(prediction, train_y))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83       389
           1       0.58      0.71      0.64       159

    accuracy                           0.77       548
   macro avg       0.73      0.75      0.73       548
weighted avg       0.79      0.77      0.77       548



In [22]:
joblib.dump(pipeline, 'models/pipeline_fs_v1.pkl')

['models/pipeline_fs_v1.pkl']

In [23]:
test_pipeline = joblib.load('models/pipeline_fs_v1.pkl')

In [24]:
test_prediction = test_pipeline.predict(test_X)

In [25]:
accuracy_score(test_prediction, test_y)

0.7753623188405797

In [26]:
#############################
# Retrain the model with a new feature set
#############################
select_columns_v2 = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
                    'DiabetesPedigreeFunction', 'PregnanciesCount', 'Age']

In [27]:
column_transformer_v2 = ColumnTransformer(transformers=[
    ('drop_columns', 'drop', drop_columns),
    ('scale_features', StandardScaler(), select_columns_v2)
])

In [28]:
pipeline_v2 = Pipeline(steps=[
    ('pre', column_transformer_v2),
    ('model', lr_model)
])

In [29]:
pipeline_v2.fit(train_X, train_y)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('drop_columns', 'drop',
                                                  ['PatientID']),
                                                 ('scale_features',
                                                  StandardScaler(),
                                                  ['Glucose', 'BloodPressure',
                                                   'SkinThickness', 'Insulin',
                                                   'BMI',
                                                   'DiabetesPedigreeFunction',
                                                   'PregnanciesCount',
                                                   'Age'])])),
                ('model', LogisticRegression())])

In [30]:
prediction_v2 = pipeline_v2.predict(train_X)

In [31]:
accuracy_score(prediction_v2, train_y)

0.7828467153284672

In [32]:
confusion_matrix(prediction_v2, train_y)

array([[312,  77],
       [ 42, 117]])

In [33]:
print(classification_report(prediction_v2, train_y))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84       389
           1       0.60      0.74      0.66       159

    accuracy                           0.78       548
   macro avg       0.74      0.77      0.75       548
weighted avg       0.80      0.78      0.79       548



In [35]:
joblib.dump(pipeline_v2, 'models/pipeline_fs_v2.pkl')

['models/pipeline_fs_v2.pkl']

In [36]:
test_pipeline_v2 = joblib.load('models/pipeline_fs_v2.pkl')

In [38]:
test_prediction_v2 = test_pipeline_v2.predict(test_X)

In [39]:
accuracy_score(test_prediction_v2, test_y)

0.7681159420289855

In [40]:
confusion_matrix(test_prediction_v2, test_y)

array([[84, 23],
       [ 9, 22]])

In [41]:
print(classification_report(test_prediction_v2, test_y))

              precision    recall  f1-score   support

           0       0.90      0.79      0.84       107
           1       0.49      0.71      0.58        31

    accuracy                           0.77       138
   macro avg       0.70      0.75      0.71       138
weighted avg       0.81      0.77      0.78       138

