In [1]:
%load_ext autoreload
%autoreload 2

! pip install -i https://test.pypi.org/simple/ my-krml-24925125==0.1.9

Looking in indexes: https://test.pypi.org/simple/


In [5]:
import pandas as pd
import numpy as np

In [6]:
# Read the raw CSV into a DataFrame; later cells copy it rather than mutate it.
df = pd.read_csv(filepath_or_buffer="../data/raw/CVD_cleaned.csv")

In [7]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [8]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(308854, 19)

In [6]:
# Print the index range plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


## Build pipeline

In [33]:
# Work on a copy so the raw frame `df` is left untouched.
df_cleaned = df.copy()

# Remove the label column from the feature frame and encode it as
# 1 for "Yes" and 0 for "No".
target = df_cleaned.pop("Heart_Disease").map({"Yes": 1, "No": 0})

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import SGDClassifier

In [52]:
# Numeric features are scaled with StandardScaler; the remaining categorical
# features are one-hot encoded.
num_transformer = Pipeline([("num_scaler", StandardScaler())])
cat_transformer = Pipeline([("cat_encoder", OneHotEncoder())])

# One ordinal encoder per column that is treated as ordered.
# NOTE(review): with no `categories` argument OrdinalEncoder assigns codes in
# sorted order of the observed levels, e.g. "Good" < "Poor" < "Very Good" for
# General_Health, which is not the natural ranking. Consider passing explicit
# `categories` lists for General_Health and Checkup once the full level sets
# have been confirmed from the data.
age_ord_transformer = Pipeline([("age_encoder", OrdinalEncoder())])
health_ord_transformer = Pipeline([("health_encoder", OrdinalEncoder())])
checkup_ord_transformer = Pipeline([("checkup_encoder", OrdinalEncoder())])

In [53]:
# Columns that get an ordinal encoding, each through its own transformer.
ord_cols = ["Age_Category", "General_Health", "Checkup"]

# Numeric columns, selected by dtype.
num_cols = df_cleaned.select_dtypes(include=np.number).columns.to_list()

# Every remaining column is one-hot encoded. Filter in DataFrame column order
# instead of using a set difference: iteration order of a set of strings can
# change between interpreter runs, which would reorder the one-hot output
# columns from one run to the next.
cat_cols = [
    col
    for col in df_cleaned.columns
    if col not in num_cols and col not in ord_cols
]

In [54]:
from sklearn.compose import ColumnTransformer

In [55]:
# Route each group of columns to the transformer built for it.
preprocessor = ColumnTransformer(
    [
        ("num_transformer", num_transformer, num_cols),
        ("cat_transformer", cat_transformer, cat_cols),
        ("age_ord_transformer", age_ord_transformer, ["Age_Category"]),
        ("health_ord_transformer", health_ord_transformer, ["General_Health"]),
        ("checkup_ord_transformer", checkup_ord_transformer, ["Checkup"]),
    ]
)

In [56]:
# Full model: the preprocessing step followed by a linear classifier trained
# with stochastic gradient descent. random_state is fixed because the
# classifier shuffles the training data, so an unseeded fit gives a different
# model on every run.
sgd_pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("sgd", SGDClassifier(random_state=42))])

In [57]:
# IPython introspection: show Pipeline's signature and docstring.
# Development aid only; nothing below depends on this cell.
Pipeline?

[0;31mInit signature:[0m [0mPipeline[0m[0;34m([0m[0msteps[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mmemory[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Pipeline of transforms with a final estimator.

Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be 'transforms', that is, they
must implement `fit` and `transform` methods.
The final estimator only needs to implement `fit`.
The transformers in the pipeline can be cached using ``memory`` argument.

The purpose of the pipeline is to assemble several steps that can be
cross-validated together while setting different parameters. For this, it
enables setting parameters of the various steps using their names and the
parameter name separated by a `'__'`, as in the example below. A step's
estimator may be replaced entirely by setting the parameter with its name
to another est

In [61]:
# Fit preprocessing and classifier on the full feature frame; no rows are held out.
sgd_pipe.fit(X=df_cleaned, y=target)

In [62]:
# Predict on the same frame the pipeline was fitted on (in-sample predictions).
sgd_pipe.predict(X=df_cleaned)

array([0, 0, 0, ..., 0, 0, 0])

In [80]:
# Take the first row as a one-row DataFrame. iloc[[0]] keeps each column's
# dtype, whereas iloc[0, :].to_frame().transpose() goes through a Series and
# turns every column into object dtype.
obs = df_cleaned.iloc[[0]]
sgd_pipe.predict(obs)

array([0])

In [82]:
from pathlib import Path

import joblib

# Create the output directory first so the dump does not fail when ../models
# does not exist yet (for example on a fresh checkout).
Path("../models").mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (preprocessing + classifier) as a single artefact.
joblib.dump(sgd_pipe, "../models/sgd_pipe.joblib")

['../models/sgd_pipe.joblib']