In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
data = pd.read_csv('data/pima.csv', names=list(range(9)))
display(data.head())
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [3]:
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()
N, D = X.shape
y.loc[y == 0] = -1
display(X.shape, y.shape, y.mean())

(768, 8)

(768,)

-0.3020833333333333

In [5]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_pima.csv')

In [4]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(117, 117)

In [5]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [6]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [7]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 117/117 (1.0). SV: 546/576 (0.9479166666666666). Mean e: 0.7023. Median e: 1.0365. Std e: 0.4994. 
1 - Kernels: 117/117 (1.0). SV: 555/576 (0.9635416666666666). Mean e: 0.5714. Median e: 0.8998. Std e: 0.5090. 
2 - Kernels: 117/117 (1.0). SV: 543/576 (0.9427083333333334). Mean e: 0.6573. Median e: 0.9918. Std e: 0.5053. 
3 - Kernels: 117/117 (1.0). SV: 545/576 (0.9461805555555556). Mean e: 0.7092. Median e: 1.0879. Std e: 0.5701. 
4 - Kernels: 117/117 (1.0). SV: 523/576 (0.9079861111111112). Mean e: 0.6715. Median e: 1.0279. Std e: 0.5508. 
5 - Kernels: 117/117 (1.0). SV: 551/576 (0.9565972222222222). Mean e: -0.1928. Median e: 0.1428. Std e: 1.0125. 
6 - Kernels: 117/117 (1.0). SV: 539/576 (0.9357638888888888). Mean e: 0.6470. Median e: 0.9900. Std e: 0.5214. 
7 - Kernels: 117/117 (1.0). SV: 552/576 (0.9583333333333334). Mean e: 0.6625. Median e: 1.0152. Std e: 0.5404. 
8 - Kernels: 117/117 (1.0). SV: 532/576 (0.9236111111111112). Mean e: -0.0014. Median e: 0.4197. Std e:



({'fit_time': array([15.5019, 12.6639, 12.3789, 12.3022, 12.2876, 12.3563, 12.2859, 12.3808, 12.5626, 12.5666,
         13.4472, 13.0511, 12.8353, 12.7149, 12.8207, 12.6276, 12.6228, 12.6585, 12.6458, 12.7441]),
  'score_time': array([2.4586, 2.4818, 2.4146, 2.4256, 2.3946, 2.4645, 2.3888, 2.4244, 2.4332, 2.4171, 2.4157,
         2.4378, 2.4253, 2.4323, 2.4369, 2.4111, 2.4392, 2.4127, 2.4843, 2.5644]),
  'test_score': array([0.6406, 0.6458, 0.6771, 0.7448, 0.6979, 0.6146, 0.6875, 0.7188, 0.6979, 0.7188, 0.6979,
         0.6667, 0.6979, 0.6562, 0.6771, 0.6615, 0.7292, 0.6667, 0.6406, 0.7135]),
  'train_score': array([0.7031, 0.7292, 0.7101, 0.6788, 0.7049, 0.7101, 0.717 , 0.6979, 0.6997, 0.6962, 0.7205,
         0.7153, 0.7083, 0.6944, 0.7153, 0.7066, 0.7083, 0.7031, 0.7014, 0.7066])},
 [{'elapsed_time': 13.711721181869507,
   'nr_kernels_used': 117,
   'nr_sv_used': 546,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 11.164348125457764,
   'nr_kernels_used': 117,
   '

In [8]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.6825520833333333 +- 0.03300107025568765
Time: 11.215036976337434 +- 0.6275953914279129
Kernels: 117.0 +- 0.0
SVs: 543.35 +- 10.199387236496122


# Kernel-sparse model

In [9]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 7/117 (0.05982905982905983). SV: 549/576 (0.953125). Mean e: 0.2399. Median e: -0.0004. Std e: 1.1764. 
1 - Kernels: 17/117 (0.1452991452991453). SV: 541/576 (0.9392361111111112). Mean e: 0.4341. Median e: -0.0004. Std e: 1.1482. 
2 - Kernels: 11/117 (0.09401709401709402). SV: 555/576 (0.9635416666666666). Mean e: 0.2250. Median e: -0.0004. Std e: 1.2005. 
3 - Kernels: 5/117 (0.042735042735042736). SV: 553/576 (0.9600694444444444). Mean e: 0.2049. Median e: -0.0006. Std e: 1.2263. 
4 - Kernels: 7/117 (0.05982905982905983). SV: 551/576 (0.9565972222222222). Mean e: 0.2574. Median e: -0.0006. Std e: 1.2253. 
5 - Kernels: 14/117 (0.11965811965811966). SV: 543/576 (0.9427083333333334). Mean e: 0.3114. Median e: -0.0003. Std e: 1.1261. 
6 - Kernels: 10/117 (0.08547008547008547). SV: 542/576 (0.9409722222222222). Mean e: 0.3260. Median e: -0.0002. Std e: 1.1592. 
7 - Kernels: 21/117 (0.1794871794871795). SV: 543/576 (0.9427083333333334). Mean e: 0.4452. Median e: -0.0002. Std e:



({'fit_time': array([12.9765, 12.7867, 12.6552, 12.6999, 12.8191, 13.0737, 13.0165, 12.8808, 12.9155, 12.7537,
         12.7524, 12.7295, 13.1571, 12.4676, 12.6618, 12.662 , 12.709 , 12.3452, 12.4112, 12.1723]),
  'score_time': array([2.4676, 2.48  , 2.4501, 2.4439, 2.4756, 2.4575, 2.4322, 2.4689, 2.4459, 2.4325, 2.4507,
         2.4556, 2.4394, 2.4677, 2.448 , 2.4662, 2.5622, 2.4682, 2.4438, 2.5703]),
  'test_score': array([0.6771, 0.7448, 0.7083, 0.651 , 0.7448, 0.6458, 0.7083, 0.6979, 0.7292, 0.7188, 0.6406,
         0.6302, 0.7292, 0.6667, 0.6615, 0.6771, 0.6823, 0.6927, 0.6875, 0.6823]),
  'train_score': array([0.7049, 0.6788, 0.7014, 0.717 , 0.6944, 0.7153, 0.7066, 0.6858, 0.691 , 0.6858, 0.717 ,
         0.7222, 0.6858, 0.7049, 0.7188, 0.7083, 0.7153, 0.6997, 0.6944, 0.6962])},
 [{'elapsed_time': 11.278692245483398,
   'nr_kernels_used': 7,
   'nr_sv_used': 549,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 11.210781335830688,
   'nr_kernels_used': 17,
   'nr_

In [10]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.6888020833333333 +- 0.03309956275674195
Time: 11.154281854629517 +- 0.2249542664701269
Kernels: 10.55 +- 5.054453481831641
SVs: 542.2 +- 9.064215354899728


# SV-sparse model

In [11]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 117/117 (1.0). SV: 420/576 (0.7291666666666666). Mean e: 0.6782. Median e: 1.0112. Std e: 0.5079. 
1 - Kernels: 117/117 (1.0). SV: 323/576 (0.5607638888888888). Mean e: 0.6363. Median e: 0.9627. Std e: 0.4956. 
2 - Kernels: 93/117 (0.7948717948717948). SV: 451/576 (0.7829861111111112). Mean e: 0.6684. Median e: 0.9979. Std e: 0.5035. 
3 - Kernels: 117/117 (1.0). SV: 424/576 (0.7361111111111112). Mean e: -0.2526. Median e: 0.0812. Std e: 1.0814. 
4 - Kernels: 117/117 (1.0). SV: 478/576 (0.8298611111111112). Mean e: 0.0339. Median e: 0.3520. Std e: 0.8586. 
5 - Kernels: 117/117 (1.0). SV: 424/576 (0.7361111111111112). Mean e: 0.7001. Median e: 1.0115. Std e: 0.4677. 
6 - Kernels: 117/117 (1.0). SV: 458/576 (0.7951388888888888). Mean e: 0.6437. Median e: 0.9735. Std e: 0.5011. 
7 - Kernels: 117/117 (1.0). SV: 457/576 (0.7934027777777778). Mean e: 0.7322. Median e: 1.0732. Std e: 0.5085. 
8 - Kernels: 117/117 (1.0). SV: 294/576 (0.5104166666666666). Mean e: 0.6448. Median e: 1



({'fit_time': array([12.4511, 12.4797, 12.7242, 12.277 , 12.4368, 12.4161, 12.5329, 12.7068, 12.7838, 12.4735,
         12.4523, 12.4633, 12.306 , 12.3165, 12.479 , 12.46  , 12.3513, 12.3658, 12.359 , 12.7686]),
  'score_time': array([2.4482, 2.4201, 2.4563, 2.4277, 2.4348, 2.3934, 2.4226, 2.4657, 2.4601, 2.4108, 2.4083,
         2.5128, 2.5199, 2.4102, 2.4394, 2.5251, 2.4369, 2.4187, 2.449 , 2.436 ]),
  'test_score': array([0.724 , 0.651 , 0.6354, 0.7135, 0.6042, 0.6875, 0.6354, 0.7135, 0.7188, 0.7188, 0.6615,
         0.6771, 0.7031, 0.6927, 0.6927, 0.651 , 0.6823, 0.6719, 0.6667, 0.7083]),
  'train_score': array([0.7083, 0.7101, 0.717 , 0.6927, 0.7135, 0.7066, 0.7222, 0.6771, 0.7014, 0.7049, 0.7014,
         0.6997, 0.6962, 0.7153, 0.6997, 0.6962, 0.6962, 0.7188, 0.691 , 0.7031])},
 [{'elapsed_time': 10.879601240158081,
   'nr_kernels_used': 117,
   'nr_sv_used': 420,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 10.831420660018921,
   'nr_kernels_used': 117,
   '

In [12]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.68046875 +- 0.03232839812797464
Time: 10.926219582557678 +- 0.16431580163865395
Kernels: 113.4 +- 8.569714114251422
SVs: 436.3 +- 46.16286386263313


In [13]:
import json
with open('pima_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )