In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
data = pd.read_csv('data/breast.csv', names=list(range(11)), index_col=0, na_values='?')
display(data.describe())
data = data.dropna()
display(data.head())
data.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [3]:
X = data.iloc[:, :-1].values.copy()
y = data.iloc[:, -1].values.copy()
N, D = X.shape
y[y == 2] = -1
y[y == 4] = 1
display(X.shape, y.shape, y.mean())

(683, 9)

(683,)

-0.3001464128843338

In [6]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_breast.csv')

In [4]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(130, 130)

In [5]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [6]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [7]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 130/130 (1.0). SV: 467/512 (0.912109375). Mean e: -0.1842. Median e: -0.3545. Std e: 0.2618. 
1 - Kernels: 130/130 (1.0). SV: 471/512 (0.919921875). Mean e: -0.2188. Median e: -0.4046. Std e: 0.2816. 
2 - Kernels: 130/130 (1.0). SV: 485/512 (0.947265625). Mean e: -0.2149. Median e: -0.4265. Std e: 0.3321. 
3 - Kernels: 130/130 (1.0). SV: 487/513 (0.949317738791423). Mean e: -0.1755. Median e: -0.3478. Std e: 0.2680. 
4 - Kernels: 130/130 (1.0). SV: 434/512 (0.84765625). Mean e: -0.1608. Median e: -0.3241. Std e: 0.2598. 
5 - Kernels: 130/130 (1.0). SV: 492/512 (0.9609375). Mean e: -0.2355. Median e: -0.4474. Std e: 0.3207. 
6 - Kernels: 130/130 (1.0). SV: 455/512 (0.888671875). Mean e: -0.1744. Median e: -0.3497. Std e: 0.2736. 
7 - Kernels: 130/130 (1.0). SV: 476/513 (0.9278752436647173). Mean e: -0.2053. Median e: -0.3836. Std e: 0.2704. 
8 - Kernels: 130/130 (1.0). SV: 488/512 (0.953125). Mean e: -0.1381. Median e: -0.2866. Std e: 0.2340. 
9 - Kernels: 130/130 (1.0). SV



({'fit_time': array([11.9125, 11.6175, 11.4629, 11.0654, 11.5519, 11.7453, 11.8981, 11.3931, 11.4116, 11.4139,
         11.9057, 11.3183, 11.672 , 11.7199, 11.3128, 11.0234, 12.8237, 11.9378, 11.3496, 11.0925]),
  'score_time': array([2.1283, 2.0941, 2.1062, 2.1667, 2.134 , 2.068 , 2.0936, 2.1055, 2.107 , 2.161 , 2.1517,
         2.1289, 2.1153, 2.1443, 2.0984, 2.0953, 2.103 , 2.1131, 2.2114, 2.1347]),
  'test_score': array([0.8947, 0.883 , 0.883 , 0.8588, 0.8772, 0.883 , 0.848 , 0.8824, 0.8713, 0.9064, 0.8713,
         0.8824, 0.8713, 0.8772, 0.848 , 0.9118, 0.9006, 0.8655, 0.8538, 0.8941]),
  'train_score': array([0.9102, 0.8906, 0.8887, 0.9045, 0.8926, 0.9004, 0.9102, 0.9045, 0.9004, 0.8926, 0.9043,
         0.8908, 0.9023, 0.8984, 0.9004, 0.8733, 0.8945, 0.8945, 0.9062, 0.9045])},
 [{'elapsed_time': 10.4588463306427,
   'nr_kernels_used': 130,
   'nr_sv_used': 467,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 10.359929084777832,
   'nr_kernels_used': 130,
   'nr

In [8]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.8781957344341246 +- 0.01755169993840917
Time: 10.289134073257447 +- 0.42022597515818316
Kernels: 130.0 +- 0.0
SVs: 471.55 +- 16.67175755581876


# Kernel-sparse model

In [9]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 38/130 (0.2923076923076923). SV: 480/512 (0.9375). Mean e: 0.0404. Median e: -0.0014. Std e: 0.1278. 
1 - Kernels: 38/130 (0.2923076923076923). SV: 468/512 (0.9140625). Mean e: 0.0323. Median e: -0.0012. Std e: 0.1077. 
2 - Kernels: 40/130 (0.3076923076923077). SV: 477/512 (0.931640625). Mean e: 0.0294. Median e: -0.0013. Std e: 0.1014. 
3 - Kernels: 38/130 (0.2923076923076923). SV: 490/513 (0.9551656920077972). Mean e: 0.0265. Median e: -0.0051. Std e: 0.1102. 
4 - Kernels: 38/130 (0.2923076923076923). SV: 476/512 (0.9296875). Mean e: 0.0287. Median e: -0.0013. Std e: 0.1041. 
5 - Kernels: 40/130 (0.3076923076923077). SV: 462/512 (0.90234375). Mean e: 0.0352. Median e: -0.0040. Std e: 0.1243. 
6 - Kernels: 13/130 (0.1). SV: 442/512 (0.86328125). Mean e: 0.0630. Median e: -0.0015. Std e: 0.2040. 
7 - Kernels: 38/130 (0.2923076923076923). SV: 468/513 (0.9122807017543859). Mean e: 0.0287. Median e: -0.0017. Std e: 0.1043. 
8 - Kernels: 11/130 (0.08461538461538462). SV: 444/5



({'fit_time': array([11.397 , 11.1542, 12.0183, 11.5042, 11.1638, 11.9395, 11.7298, 11.3691, 11.6279, 11.6023,
         11.5696, 11.2167, 11.9341, 11.5783, 11.7232, 11.5739, 11.7426, 11.3982, 11.6216, 11.3578]),
  'score_time': array([2.0922, 2.098 , 2.113 , 2.1004, 2.1235, 2.1352, 2.112 , 2.1186, 2.1559, 2.1319, 2.1172,
         2.0958, 2.1581, 2.1016, 2.1111, 2.1016, 2.1211, 2.1184, 2.0903, 2.1296]),
  'test_score': array([0.883 , 0.8889, 0.8246, 0.8588, 0.8772, 0.8655, 0.8655, 0.8765, 0.8772, 0.8421, 0.8596,
         0.8941, 0.8772, 0.848 , 0.8772, 0.8118, 0.9006, 0.8596, 0.8187, 0.8647]),
  'train_score': array([0.8809, 0.877 , 0.8867, 0.9006, 0.8887, 0.873 , 0.8789, 0.883 , 0.8809, 0.9043, 0.8906,
         0.8908, 0.877 , 0.8906, 0.8945, 0.8889, 0.8691, 0.8965, 0.8984, 0.8869])},
 [{'elapsed_time': 10.13612151145935,
   'nr_kernels_used': 38,
   'nr_sv_used': 480,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 9.88598346710205,
   'nr_kernels_used': 38,
   'nr_sv

In [10]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.8635397316821465 +- 0.02368158331084089
Time: 10.280854141712188 +- 0.26436931430063904
Kernels: 36.0 +- 8.056053624449133
SVs: 466.35 +- 19.000723670428975


# SV-sparse model

In [11]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 130/130 (1.0). SV: 284/512 (0.5546875). Mean e: -0.2279. Median e: -0.4163. Std e: 0.2841. 
1 - Kernels: 130/130 (1.0). SV: 399/512 (0.779296875). Mean e: -0.1738. Median e: -0.3342. Std e: 0.2500. 
2 - Kernels: 130/130 (1.0). SV: 332/512 (0.6484375). Mean e: -0.1624. Median e: -0.3185. Std e: 0.2454. 
3 - Kernels: 130/130 (1.0). SV: 428/513 (0.834307992202729). Mean e: -0.2176. Median e: -0.4090. Std e: 0.2890. 
4 - Kernels: 130/130 (1.0). SV: 379/512 (0.740234375). Mean e: -0.1805. Median e: -0.3555. Std e: 0.2723. 
5 - Kernels: 130/130 (1.0). SV: 362/512 (0.70703125). Mean e: -0.1910. Median e: -0.3670. Std e: 0.2688. 
6 - Kernels: 130/130 (1.0). SV: 311/512 (0.607421875). Mean e: -0.2135. Median e: -0.3983. Std e: 0.2822. 
7 - Kernels: 130/130 (1.0). SV: 348/513 (0.6783625730994152). Mean e: -0.1824. Median e: -0.3529. Std e: 0.2633. 
8 - Kernels: 130/130 (1.0). SV: 268/512 (0.5234375). Mean e: -0.2280. Median e: -0.4257. Std e: 0.3014. 
9 - Kernels: 130/130 (1.0). SV:



({'fit_time': array([11.3086, 12.2444, 11.6782, 11.1164, 11.6044, 11.6513, 11.9292, 11.5406, 11.5624, 16.7891,
         13.2116, 11.262 , 11.7023, 11.5469, 11.6417, 11.1717, 11.7017, 11.2002, 11.7854, 10.9689]),
  'score_time': array([2.1031, 2.1002, 2.1574, 2.1041, 2.2051, 2.1173, 2.1151, 2.19  , 2.1566, 2.2078, 2.2772,
         2.1607, 2.1179, 2.1656, 2.1177, 2.1237, 2.2429, 2.1258, 2.105 , 2.1006]),
  'test_score': array([0.8947, 0.8596, 0.8772, 0.8941, 0.883 , 0.9064, 0.8655, 0.8588, 0.9064, 0.8947, 0.8655,
         0.8941, 0.8538, 0.9064, 0.8596, 0.8941, 0.8713, 0.9064, 0.8713, 0.9176]),
  'train_score': array([0.8984, 0.9004, 0.8926, 0.9084, 0.8984, 0.8848, 0.9004, 0.9064, 0.9004, 0.8945, 0.8945,
         0.8928, 0.9062, 0.8984, 0.8945, 0.9084, 0.8984, 0.9023, 0.916 , 0.8869])},
 [{'elapsed_time': 10.085502862930298,
   'nr_kernels_used': 130,
   'nr_sv_used': 284,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 11.016700983047485,
   'nr_kernels_used': 130,
   '

In [12]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.8840522875816994 +- 0.01932183394676173
Time: 10.584677577018738 +- 1.2125128104995782
Kernels: 130.0 +- 0.0
SVs: 333.45 +- 59.37716311849194


In [13]:
import json
with open('breast_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )