In [1]:
import numpy as np
import pandas as pd
from model_tuner import *

# Load data

In [2]:
from sklearn import datasets

In [3]:
dataset = datasets.load_breast_cancer()

# Split into training and Validation

In [4]:
# features
X = dataset.data
print(dataset.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [5]:
# features
y = dataset.target
print(dataset.target_names)

['malignant' 'benign']


In [6]:
# split into train test
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
np.bincount(y_train.astype(int))

array([145, 236])

In [8]:
np.bincount(y_valid.astype(int))

array([ 67, 121])

# Scale Data

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

In [10]:
X_train_scaled=scaler.transform(X_train)
X_valid_scaled=scaler.transform(X_valid)

# Machine learning models

In [11]:
from sklearn.metrics import roc_auc_score

## 1. Logistic Regression

In [12]:
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs')


LR = Pipeline([('LR', clf)])

penalties = ['l1', 'l2']
Cs = np.logspace(-4, 1, 20)
solvers = ['liblinear']

parameters = [{'LR__penalty':penalty, 'LR__C': c, 'LR__solver':solver} 
              for penalty in penalties
              for c in Cs
              for solver in solvers]

print('Number of parameters: ' , len(parameters))

Number of parameters:  40


In [13]:
model = ModelTuner(pipeline=LR,parameters=parameters, X_train=X_train_scaled, 
                    y_train=y_train.astype(int), X_valid=X_valid_scaled,
                    y_valid=y_valid.astype(int), eval_metric=roc_auc_score)

100%|██████████| 40/40 [00:00<00:00, 521.59it/s]


In [14]:
model.best_performance

0.9983964475144936

## 2. SVM

In [15]:
from sklearn import svm

clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
            decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
            max_iter=-1, probability=True, random_state=0, shrinking=True,
            tol=0.001, verbose=False)


SVM = Pipeline([('SVM', clf)])

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['rbf','poly','linear']

          
parameters = [{'SVM__gamma':gamma, 'SVM__C': c, 'SVM__kernel':kernel} 
              for gamma in gammas
              for c in Cs
              for kernel in kernels]
          
print('Number of parameters: ' , len(parameters))

Number of parameters:  60


In [16]:
model = ModelTuner(pipeline=SVM,parameters=parameters, X_train=X_train_scaled, 
                    y_train=y_train.astype(int), X_valid=X_valid_scaled,
                    y_valid=y_valid.astype(int), eval_metric=roc_auc_score)

100%|██████████| 60/60 [00:01<00:00, 56.75it/s]


In [17]:
model.best_performance

0.9980263969409152

## 3. Random Forest and random sampling

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


RF = Pipeline([('RF', clf)])

bootstraps = [True, False]
max_depths = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
max_features = ['auto', 'sqrt']
min_samples_leafs = [1, 2, 4]
min_samples_splits = [2, 5, 10]
n_estimators = np.arange(10,500,50)
criteria = ['gini','entropy']

          
parameters = [{'RF__bootstrap': bootstrap,
               'RF__max_depth': depth,
               'RF__max_features': feat,
               'RF__min_samples_leaf': leaf,
               'RF__min_samples_split': split,
               'RF__n_estimators': estimators,
               'RF__criterion': criterion}

              for bootstrap in bootstraps
              for depth in max_depths
              for feat in max_features
              for leaf in min_samples_leafs
              for split in min_samples_splits
              for estimators in n_estimators
              for criterion in criteria]
          
print('Number of parameters: ' , len(parameters))

Number of parameters:  7920


In [20]:
#RANDOM SAMPLING
random_inds = np.random.RandomState(20).randint(0,len(parameters),size=40)

parameters = np.array(parameters)
random_params = parameters[random_inds]

print('Number of random parameters: ' , len(random_params))

Number of random parameters:  40


In [21]:
model = ModelTuner(pipeline=RF,parameters=random_params, X_train=X_train_scaled, 
                    y_train=y_train.astype(int), X_valid=X_valid_scaled,
                    y_valid=y_valid.astype(int), eval_metric=roc_auc_score)

100%|██████████| 40/40 [00:12<00:00,  2.86it/s]


In [22]:
model.best_performance

0.997224620698162

## 4. XGBoost and Random hyper parameter sampling

In [24]:
import xgboost

In [25]:
clf = xgboost.XGBClassifier(
             learning_rate =0.1,
             n_estimators=1000,
             max_depth=5,
             min_child_weight=1,
             gamma=0,
             subsample=0.8,
             colsample_bytree=0.8,
             objective= 'binary:logistic',
             nthread=10,
             scale_pos_weight = np.bincount(y_train)[0]/np.bincount(y_train)[1],
             seed=27)


XGB = Pipeline([('XGB', clf)])


learning_rates = [0.05, 0.10, 0.15]
max_depths = [6, 8, 10, 12, 15]
min_child_weights = [ 1, 3, 5, 7 ]
gammas = [ 0.1, 0.2 , 0.3, 0.4 ]
colsample_bytrees = [ 0.3, 0.4, 0.5 , 0.7 ]
n_estimators = np.arange(100,300,50)



parameters = [{  'XGB__learning_rate'    : learning_rate,
                 'XGB__max_depth'        : depth,
                 'XGB__min_child_weight' : min_child_weight,
                 'XGB__gamma'            : gamma,
                 'XGB__colsample_bytree' : colsample_bytree,
                 'XGB__n_estimators' : estimators}

              for learning_rate in learning_rates
              for depth in max_depths
              for min_child_weight in min_child_weights
              for gamma in gammas
              for colsample_bytree in colsample_bytrees
              for estimators in n_estimators]
          
print('Number of parameters: ' , len(parameters))

Number of parameters:  3840


In [27]:
#RANDOM SAMPLING
random_inds = np.unique(np.random.RandomState(20)\
                .randint(0,len(parameters),size=10))

parameters = np.array(parameters)
random_params = parameters[random_inds]

print('Number of random parameters: ' , len(random_params))

Number of random parameters:  10


In [28]:
model = ModelTuner(pipeline=XGB,parameters=random_params, 
                   X_train=X_train_scaled, 
                    y_train=y_train.astype(int), 
                    X_valid=X_valid_scaled,
                    y_valid=y_valid.astype(int), eval_metric=roc_auc_score)

100%|██████████| 10/10 [00:00<00:00, 32.38it/s]


In [29]:
model.best_performance

0.9979030467497225