In [1]:
import numpy as np
import pandas as pd
from model_tuner import *

# Load data

In [2]:
from sklearn import datasets

In [3]:
# Load scikit-learn's bundled breast-cancer dataset; later cells read its
# `.data`, `.target`, `.feature_names` and `.target_names` attributes.
dataset = datasets.load_breast_cancer()

# Split into training and validation

In [4]:
# Feature matrix; the column names are printed for reference.
X = dataset.data
print(dataset.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [5]:
# Target labels (not features); the class names are printed for reference.
y = dataset.target
print(dataset.target_names)

['malignant' 'benign']


In [6]:
# Split into training and validation sets: 33% of the samples are held out
# for validation, and random_state is fixed so the split is reproducible.
# NOTE(review): no `stratify=y` is passed, so class proportions may differ
# slightly between the two splits.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
# Number of training samples per class label (index 0 first, then index 1).
np.bincount(y_train.astype(int))

array([145, 236])

In [9]:
# Number of validation samples per class label (index 0 first, then index 1).
np.bincount(y_valid.astype(int))

array([ 67, 121])

# Scale data

In [10]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, so no statistics from the
# validation split leak into preprocessing.
scaler = StandardScaler().fit(X_train)

In [11]:
# Apply the training-fitted scaler to both splits.
X_train_scaled=scaler.transform(X_train)
X_valid_scaled=scaler.transform(X_valid)

# Machine learning models

In [12]:
from sklearn.metrics import roc_auc_score

## 1. Logistic Regression

In [19]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

# Base estimator for the search.
# NOTE(review): solver='lbfgs' here is replaced by every grid entry below
# (each one sets 'LR__solver' to 'liblinear'), so it only matters if the
# pipeline is ever fitted without applying a grid entry.
clf = LogisticRegression(random_state=0, solver='lbfgs')

# Single-step pipeline; the step name 'LR' is the prefix of the grid keys.
LR = Pipeline([('LR', clf)])

# Search space: both penalties x 20 log-spaced C values (1e-4 .. 1e1) x one solver.
penalties = ['l1', 'l2']
Cs = np.logspace(-4, 1, 20)
solvers = ['liblinear']

# One dict per combination, ordered penalty -> C -> solver.
parameters = [
    {'LR__penalty': pen, 'LR__C': c_val, 'LR__solver': slv}
    for pen in penalties
    for c_val in Cs
    for slv in solvers
]

print('Number of parameters: ', len(parameters))

Number of parameters:  40


In [20]:
# Run the logistic-regression grid: train on the scaled training split and
# score on the scaled validation split with ROC AUC.
# `ModelTuner` is presumably provided by the `model_tuner` star-import.
# NOTE(review): whether the metric receives probabilities or hard labels is
# decided inside ModelTuner and is not visible here -- confirm.
model = ModelTuner(
    pipeline=LR,
    parameters=parameters,
    X_train=X_train_scaled,
    y_train=y_train.astype(int),
    X_valid=X_valid_scaled,
    y_valid=y_valid.astype(int),
    eval_metric=roc_auc_score,
)


  0%|          | 0/40 [00:00<?, ?it/s][A
100%|██████████| 40/40 [00:00<00:00, 524.07it/s][A

In [21]:
# Best score recorded by the tuner for the logistic-regression grid
# (presumably the highest validation ROC AUC -- confirm against ModelTuner).
model.best_performance

0.9983964475144936

## 2. SVM

In [22]:
from sklearn import svm

# Base SVC with its settings spelled out; C, gamma and kernel are replaced by
# each grid entry below.
# probability=True is presumably required so the tuner can obtain class
# probabilities for ROC AUC -- confirm against ModelTuner.
clf = svm.SVC(
    C=1.0,
    cache_size=200,
    class_weight=None,
    coef0=0.0,
    decision_function_shape='ovr',
    degree=3,
    gamma='scale',
    kernel='rbf',
    max_iter=-1,
    probability=True,
    random_state=0,
    shrinking=True,
    tol=0.001,
    verbose=False,
)

# Single-step pipeline; the step name 'SVM' is the prefix of the grid keys.
SVM = Pipeline([('SVM', clf)])

# Search space: 4 gammas x 5 C values x 3 kernels.
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['rbf', 'poly', 'linear']

# One dict per combination, ordered gamma -> C -> kernel.
# NOTE(review): per the scikit-learn docs gamma applies only to the
# rbf/poly/sigmoid kernels, so the 'linear' entries repeat across gammas.
parameters = [
    {'SVM__gamma': g, 'SVM__C': c_val, 'SVM__kernel': k}
    for g in gammas
    for c_val in Cs
    for k in kernels
]

print('Number of parameters: ', len(parameters))

Number of parameters:  60


In [23]:
# Run the SVM grid: train on the scaled training split and score on the
# scaled validation split with ROC AUC. This rebinds `model`, replacing the
# tuner object created for the previous grid.
model = ModelTuner(
    pipeline=SVM,
    parameters=parameters,
    X_train=X_train_scaled,
    y_train=y_train.astype(int),
    X_valid=X_valid_scaled,
    y_valid=y_valid.astype(int),
    eval_metric=roc_auc_score,
)


  0%|          | 0/60 [00:00<?, ?it/s][A
  8%|▊         | 5/60 [00:00<00:01, 44.02it/s][A
 18%|█▊        | 11/60 [00:00<00:01, 47.76it/s][A
 30%|███       | 18/60 [00:00<00:00, 51.87it/s][A
 40%|████      | 24/60 [00:00<00:00, 53.73it/s][A
 53%|█████▎    | 32/60 [00:00<00:00, 58.02it/s][A
 63%|██████▎   | 38/60 [00:00<00:00, 58.42it/s][A
 77%|███████▋  | 46/60 [00:00<00:00, 62.50it/s][A
 88%|████████▊ | 53/60 [00:00<00:00, 60.14it/s][A
 98%|█████████▊| 59/60 [00:00<00:00, 57.74it/s][A
100%|██████████| 60/60 [00:00<00:00, 60.09it/s][A

In [24]:
# Best score recorded by the tuner for the SVM grid
# (presumably the highest validation ROC AUC -- confirm against ModelTuner).
model.best_performance

0.9980263969409152