In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
data = pd.read_csv('data/pima.csv', names=list(range(9)))
display(data.head())
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [3]:
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()
N, D = X.shape
y.loc[y == 0] = -1
display(X.shape, y.shape, y.mean())

(768, 8)

(768,)

-0.3020833333333333

In [4]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_pima.csv')

In [5]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(117, 117)

In [6]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [7]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [8]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 117/117 (1.0). SV: 541/576 (0.9392361111111112). Mean e: 0.6986. Median e: 1.0459. Std e: 0.5218. 
1 - Kernels: 117/117 (1.0). SV: 540/576 (0.9375). Mean e: 0.4386. Median e: 0.8449. Std e: 0.7034. 
2 - Kernels: 117/117 (1.0). SV: 538/576 (0.9340277777777778). Mean e: 0.5807. Median e: 0.9509. Std e: 0.5685. 
3 - Kernels: 93/117 (0.7948717948717948). SV: 533/576 (0.9253472222222222). Mean e: 0.5597. Median e: 0.9220. Std e: 0.6068. 
4 - Kernels: 117/117 (1.0). SV: 530/576 (0.9201388888888888). Mean e: 0.6838. Median e: 1.0177. Std e: 0.5049. 
5 - Kernels: 117/117 (1.0). SV: 538/576 (0.9340277777777778). Mean e: -0.2738. Median e: 0.0676. Std e: 1.0556. 
6 - Kernels: 117/117 (1.0). SV: 548/576 (0.9513888888888888). Mean e: 0.6656. Median e: 0.9964. Std e: 0.5021. 
7 - Kernels: 117/117 (1.0). SV: 542/576 (0.9409722222222222). Mean e: 0.6224. Median e: 0.9930. Std e: 0.5788. 
8 - Kernels: 117/117 (1.0). SV: 553/576 (0.9600694444444444). Mean e: 0.5886. Median e: 0.9753. Std e



({'fit_time': array([13.1733, 12.7444, 12.9517, 13.2829, 13.6144, 12.726 , 12.9451, 13.6712, 12.9464, 12.9307,
         13.3026, 13.2342, 13.1639, 12.8517, 12.8061, 12.9925, 12.9399, 12.8112, 13.185 , 13.3004]),
  'score_time': array([2.5339, 2.4719, 2.4768, 2.4578, 2.4283, 2.4487, 2.5707, 2.4697, 2.4502, 2.447 , 2.4782,
         2.4588, 2.4921, 2.4585, 2.4514, 2.4736, 2.4752, 2.4673, 2.4597, 2.4881]),
  'test_score': array([0.6719, 0.6823, 0.6562, 0.6771, 0.7344, 0.6719, 0.6562, 0.6927, 0.7135, 0.6875, 0.6406,
         0.6979, 0.7188, 0.7135, 0.651 , 0.6979, 0.6927, 0.6927, 0.6875, 0.6875]),
  'train_score': array([0.7014, 0.7031, 0.7257, 0.7049, 0.6944, 0.7135, 0.7066, 0.6997, 0.6962, 0.6962, 0.7049,
         0.7101, 0.7118, 0.6979, 0.7031, 0.7066, 0.7101, 0.6979, 0.6892, 0.717 ])},
 [{'elapsed_time': 11.620792627334595,
   'nr_kernels_used': 117,
   'nr_sv_used': 541,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 11.155920505523682,
   'nr_kernels_used': 117,
   '

In [9]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.6861979166666667 +- 0.023343271228170745
Time: 11.498796808719636 +- 0.25340436691967405
Kernels: 115.8 +- 5.230678732248809
SVs: 543.5 +- 9.7082439194738


# Kernel-sparse model

In [10]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 13/117 (0.1111111111111111). SV: 545/576 (0.9461805555555556). Mean e: 0.3764. Median e: -0.0003. Std e: 1.1518. 
1 - Kernels: 14/117 (0.11965811965811966). SV: 529/576 (0.9184027777777778). Mean e: 0.3128. Median e: -0.0003. Std e: 1.1695. 
2 - Kernels: 7/117 (0.05982905982905983). SV: 540/576 (0.9375). Mean e: 0.2948. Median e: -0.0004. Std e: 1.2263. 
3 - Kernels: 5/117 (0.042735042735042736). SV: 542/576 (0.9409722222222222). Mean e: 0.1881. Median e: -0.0005. Std e: 1.1905. 
4 - Kernels: 16/117 (0.13675213675213677). SV: 542/576 (0.9409722222222222). Mean e: 0.3144. Median e: -0.0003. Std e: 1.1089. 
5 - Kernels: 5/117 (0.042735042735042736). SV: 545/576 (0.9461805555555556). Mean e: 0.2181. Median e: -0.0003. Std e: 1.1859. 
6 - Kernels: 23/117 (0.19658119658119658). SV: 560/576 (0.9722222222222222). Mean e: 0.5080. Median e: -0.0003. Std e: 1.0745. 
7 - Kernels: 12/117 (0.10256410256410256). SV: 509/576 (0.8836805555555556). Mean e: 0.3220. Median e: -0.0003. Std e:



({'fit_time': array([13.139 , 13.0265, 12.9237, 12.7676, 13.2849, 12.8442, 13.1861, 13.1641, 12.7976, 12.797 ,
         13.1387, 13.1691, 13.2072, 12.8629, 12.9211, 13.6719, 12.6525, 13.0701, 13.0747, 12.9167]),
  'score_time': array([2.4607, 2.4644, 2.4779, 2.4522, 2.5277, 2.497 , 2.5415, 2.4576, 2.4807, 2.469 , 2.4791,
         2.4582, 2.4622, 2.4642, 2.4809, 2.4625, 2.4736, 2.5648, 2.4973, 2.4726]),
  'test_score': array([0.6667, 0.6615, 0.7292, 0.6562, 0.6198, 0.6562, 0.7083, 0.7344, 0.7292, 0.6875, 0.7188,
         0.6562, 0.7031, 0.7188, 0.6198, 0.7031, 0.7031, 0.6406, 0.6927, 0.6771]),
  'train_score': array([0.6962, 0.6979, 0.684 , 0.7222, 0.717 , 0.7153, 0.6944, 0.6806, 0.6875, 0.6944, 0.6944,
         0.7101, 0.6875, 0.6892, 0.7135, 0.7049, 0.684 , 0.6997, 0.6892, 0.7066])},
 [{'elapsed_time': 11.553470611572266,
   'nr_kernels_used': 13,
   'nr_sv_used': 545,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 11.431266784667969,
   'nr_kernels_used': 14,
   'nr

In [11]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.6841145833333334 +- 0.03471560266762467
Time: 11.459452712535859 +- 0.21046043701292183
Kernels: 11.1 +- 4.815599651133803
SVs: 539.85 +- 12.346153247064446


# SV-sparse model

In [12]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 117/117 (1.0). SV: 430/576 (0.7465277777777778). Mean e: 0.6438. Median e: 0.9870. Std e: 0.5324. 
1 - Kernels: 117/117 (1.0). SV: 466/576 (0.8090277777777778). Mean e: 0.7065. Median e: 1.0397. Std e: 0.4991. 
2 - Kernels: 117/117 (1.0). SV: 465/576 (0.8072916666666666). Mean e: 0.6635. Median e: 0.9834. Std e: 0.4832. 
3 - Kernels: 117/117 (1.0). SV: 448/576 (0.7777777777777778). Mean e: -0.2505. Median e: 0.0838. Std e: 1.0360. 
4 - Kernels: 93/117 (0.7948717948717948). SV: 454/576 (0.7881944444444444). Mean e: 0.6189. Median e: 0.9434. Std e: 0.5059. 
5 - Kernels: 117/117 (1.0). SV: 464/576 (0.8055555555555556). Mean e: 0.6705. Median e: 0.9925. Std e: 0.4848. 
6 - Kernels: 117/117 (1.0). SV: 414/576 (0.71875). Mean e: 0.6528. Median e: 1.0093. Std e: 0.5419. 
7 - Kernels: 117/117 (1.0). SV: 476/576 (0.8263888888888888). Mean e: 0.5961. Median e: 0.9760. Std e: 0.5926. 
8 - Kernels: 117/117 (1.0). SV: 409/576 (0.7100694444444444). Mean e: 0.7088. Median e: 1.0375. Std 



({'fit_time': array([13.4369, 12.8753, 13.0548, 13.3108, 13.0625, 12.8787, 12.9915, 12.8958, 12.8419, 12.8681,
         13.0845, 13.2383, 12.8179, 13.0094, 13.6784, 12.962 , 13.0181, 13.5157, 13.2897, 13.3076]),
  'score_time': array([2.4726, 2.4605, 2.4607, 2.4614, 2.4767, 2.4576, 2.4761, 2.4675, 2.5137, 2.4971, 2.4716,
         2.4964, 2.4867, 2.4841, 2.4933, 2.4506, 2.4542, 2.4671, 2.4692, 2.5383]),
  'test_score': array([0.6979, 0.6771, 0.7031, 0.6927, 0.6667, 0.6979, 0.6823, 0.6875, 0.7188, 0.6667, 0.7135,
         0.7031, 0.7188, 0.7031, 0.6354, 0.7083, 0.6458, 0.6615, 0.7604, 0.6719]),
  'train_score': array([0.717 , 0.7066, 0.7031, 0.7066, 0.7118, 0.6997, 0.7083, 0.7135, 0.691 , 0.7153, 0.7066,
         0.7101, 0.6997, 0.6979, 0.724 , 0.6979, 0.7135, 0.7118, 0.6892, 0.7049])},
 [{'elapsed_time': 11.846123933792114,
   'nr_kernels_used': 117,
   'nr_sv_used': 430,
   'total_kernels': 117,
   'total_sv': 576},
  {'elapsed_time': 11.306482553482056,
   'nr_kernels_used': 117,
   '

In [13]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.690625 +- 0.02787310033387427
Time: 11.537722265720367 +- 0.2400046812824365
Kernels: 115.8 +- 5.230678732248809
SVs: 428.7 +- 42.898834483001984


In [14]:
import json
with open('pima_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ssparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )