In [54]:
import pathlib
import random

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC

In [55]:
SEED = 0
np.random.seed(0)
random.seed(0)

In [56]:
%load_ext autoreload
%autoreload 2

from heat_diss import clean_data, feature_target_split, BinaryEncoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
data_path = pathlib.Path("data", "data.zip")

In [58]:
data = pd.read_csv(data_path)
data = data.infer_objects()

In [59]:
profile = ProfileReport(data, title='Pandas Profiling Report', explorative=True)

In [60]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [61]:
cleaned_data = clean_data(data, 4)
features, target = feature_target_split(cleaned_data, "target")

In [62]:
np.unique(target)

array([0, 1], dtype=int64)

In [63]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 302
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       302 non-null    int64   
 1   sex       302 non-null    category
 2   cp        302 non-null    category
 3   trestbps  302 non-null    int64   
 4   chol      302 non-null    int64   
 5   fbs       302 non-null    category
 6   restecg   302 non-null    category
 7   thalach   302 non-null    int64   
 8   exang     302 non-null    category
 9   oldpeak   302 non-null    float64 
 10  slope     302 non-null    category
 11  ca        302 non-null    int64   
 12  thal      302 non-null    category
dtypes: category(7), float64(1), int64(5)
memory usage: 19.6 KB


In [64]:
cat_columns = [col for col in features.select_dtypes("category").columns if len(features[col].unique()) > 2]
num_columns = [col for col in features.select_dtypes("number").columns]
bin_variables = [col for col in features.select_dtypes("category").columns if col not in cat_columns]

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [66]:
bin_variables

['sex', 'fbs', 'exang']

In [67]:
cat_columns

['cp', 'restecg', 'slope', 'thal']

In [90]:
mapper = ColumnTransformer(
    [ ('categorical', OneHotEncoder(handle_unknown="ignore"), cat_columns),
#       ('title_bow', BinaryEncoder(), bin_variables),
         ("numeric", StandardScaler(), num_columns)],
     remainder='drop')

In [91]:
# mapper = compose.ColumnTransformer([
#     (cat_columns, OneHotEncoder()),
#     (bin_variables, BinaryEncoder()),
#     (num_columns, StandardScaler()),
# ])

In [92]:
mapper.fit_transform(features)

array([[ 0.        ,  0.        ,  0.        , ...,  0.01882584,
         1.08402203, -0.71491124],
       [ 0.        ,  0.        ,  1.        , ...,  1.63697881,
         2.11892611, -0.71491124],
       [ 0.        ,  1.        ,  0.        , ...,  0.98097085,
         0.30784398, -0.71491124],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.37477894,
         2.0326841 ,  1.27497996],
       [ 1.        ,  0.        ,  0.        , ..., -1.51185941,
         0.13535997,  0.28003436],
       [ 0.        ,  1.        ,  0.        , ...,  1.06843858,
        -0.8995441 ,  0.28003436]])

In [113]:
pipeline = Pipeline([("feature_transform", mapper), ('cls', LogisticRegression())])

In [114]:
# pipeline.fit(features, target)

In [119]:
scoring = {"Accuracy": "accuracy", "f1_score": "f1"}
param_grid = {
    # "cls__max_depth": [10, 15, 20, 30, 50]
#     "cls__penalty": np.logspace(-4, 4, 4),
    "cls__C": [0.1, 1, 2, 10],
    "cls__max_iter": [200, 10_000],
    "cls__penalty": ["l2", "l1"]
}

In [120]:
gs = GridSearchCV(pipeline, param_grid=param_grid, scoring=scoring, refit="f1_score", return_train_score=True, cv=3)

gs.fit(features, target)

Traceback (most recent call last):
  File "D:\AnacondaEnvs\ml-prod1\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\AnacondaEnvs\ml-prod1\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "D:\AnacondaEnvs\ml-prod1\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\AnacondaEnvs\ml-prod1\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "D:\AnacondaEnvs\ml-prod1\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('feature_transform',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['cp',
                                                                          'restecg',
                                                                          'slope',
                                                                          'thal']),
                                                                        ('numeric',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'trestbps',
                                             

In [121]:
results = pd.DataFrame.from_dict(gs.cv_results_)

In [118]:
results[results["rank_test_Accuracy"] <= 2]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cls__C,param_cls__max_iter,param_cls__penalty,params,split0_test_Accuracy,split1_test_Accuracy,...,split1_test_f1_score,split2_test_f1_score,mean_test_f1_score,std_test_f1_score,rank_test_f1_score,split0_train_f1_score,split1_train_f1_score,split2_train_f1_score,mean_train_f1_score,std_train_f1_score
2,0.017334,0.001248,0.007014,1.5e-05,1,200,l2,"{'cls__C': 1, 'cls__max_iter': 200, 'cls__pena...",0.841584,0.861386,...,0.867925,0.806723,0.846351,0.028058,1,0.876106,0.862222,0.897778,0.878702,0.014631
3,0.017005,0.002163,0.006344,0.00095,1,10000,l2,"{'cls__C': 1, 'cls__max_iter': 10000, 'cls__pe...",0.841584,0.861386,...,0.867925,0.806723,0.846351,0.028058,1,0.876106,0.862222,0.897778,0.878702,0.014631
