In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
data = pd.read_csv('data/breast.csv', names=list(range(11)), index_col=0, na_values='?')
display(data.describe())
data = data.dropna()
display(data.head())
data.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [3]:
X = data.iloc[:, :-1].values.copy()
y = data.iloc[:, -1].values.copy()
N, D = X.shape
y[y == 2] = -1
y[y == 4] = 1
display(X.shape, y.shape, y.mean())

(683, 9)

(683,)

-0.3001464128843338

In [4]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_breast.csv')

In [5]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(130, 130)

In [6]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [7]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [8]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 130/130 (1.0). SV: 465/512 (0.908203125). Mean e: -0.1890. Median e: -0.3572. Std e: 0.2556. 
1 - Kernels: 130/130 (1.0). SV: 452/512 (0.8828125). Mean e: -0.2320. Median e: -0.4371. Std e: 0.3125. 
2 - Kernels: 130/130 (1.0). SV: 463/512 (0.904296875). Mean e: -0.1722. Median e: -0.3568. Std e: 0.2912. 
3 - Kernels: 130/130 (1.0). SV: 439/513 (0.8557504873294347). Mean e: -0.1962. Median e: -0.3760. Std e: 0.2768. 
4 - Kernels: 130/130 (1.0). SV: 417/512 (0.814453125). Mean e: -0.1455. Median e: -0.2964. Std e: 0.2352. 
5 - Kernels: 130/130 (1.0). SV: 482/512 (0.94140625). Mean e: -0.2338. Median e: -0.4449. Std e: 0.3196. 
6 - Kernels: 130/130 (1.0). SV: 473/512 (0.923828125). Mean e: -0.2200. Median e: -0.4194. Std e: 0.3058. 
7 - Kernels: 130/130 (1.0). SV: 458/513 (0.8927875243664717). Mean e: -0.2050. Median e: -0.3898. Std e: 0.2829. 
8 - Kernels: 130/130 (1.0). SV: 433/512 (0.845703125). Mean e: -0.2044. Median e: -0.3991. Std e: 0.3016. 
9 - Kernels: 130/130 (1.0)



({'fit_time': array([11.7495, 12.1171, 11.3911, 11.7789, 11.9982, 11.4452, 11.5278, 11.5245, 12.2106, 11.7268,
         12.0406, 11.6262, 11.8252, 11.4632, 11.4386, 12.0428, 11.8182, 11.5431, 12.4889, 11.4563]),
  'score_time': array([2.1448, 2.1478, 2.2019, 2.1854, 2.2046, 2.1691, 2.1671, 2.1767, 2.1899, 2.193 , 2.1849,
         2.1749, 2.2051, 2.2072, 2.1778, 2.1821, 2.2643, 2.165 , 2.1726, 2.1871]),
  'test_score': array([0.883 , 0.8304, 0.9006, 0.8588, 0.8655, 0.8947, 0.9181, 0.8294, 0.9181, 0.8596, 0.8596,
         0.8647, 0.8947, 0.8363, 0.924 , 0.8235, 0.8889, 0.8772, 0.8713, 0.8706]),
  'train_score': array([0.8848, 0.9102, 0.8984, 0.9064, 0.8965, 0.8984, 0.8887, 0.9142, 0.8984, 0.9004, 0.9043,
         0.9084, 0.8926, 0.8945, 0.9004, 0.8986, 0.8906, 0.8906, 0.8984, 0.8967])},
 [{'elapsed_time': 10.345966339111328,
   'nr_kernels_used': 130,
   'nr_sv_used': 465,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 10.836859703063965,
   'nr_kernels_used': 130,
   '

In [9]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.8734640522875818 +- 0.02910895566047288
Time: 10.4528214097023 +- 0.3090853539006856
Kernels: 130.0 +- 0.0
SVs: 461.45 +- 20.095957304890952


# Kernel-sparse model

In [10]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 38/130 (0.2923076923076923). SV: 473/512 (0.923828125). Mean e: 0.0299. Median e: -0.0012. Std e: 0.1066. 
1 - Kernels: 38/130 (0.2923076923076923). SV: 442/512 (0.86328125). Mean e: 0.0438. Median e: -0.0073. Std e: 0.1524. 
2 - Kernels: 40/130 (0.3076923076923077). SV: 490/512 (0.95703125). Mean e: 0.0336. Median e: -0.0015. Std e: 0.1070. 
3 - Kernels: 40/130 (0.3076923076923077). SV: 459/513 (0.8947368421052632). Mean e: 0.0308. Median e: -0.0017. Std e: 0.0952. 
4 - Kernels: 38/130 (0.2923076923076923). SV: 465/512 (0.908203125). Mean e: 0.0342. Median e: -0.0012. Std e: 0.1258. 
5 - Kernels: 38/130 (0.2923076923076923). SV: 474/512 (0.92578125). Mean e: 0.0360. Median e: -0.0016. Std e: 0.1202. 
6 - Kernels: 38/130 (0.2923076923076923). SV: 452/512 (0.8828125). Mean e: 0.0261. Median e: -0.0034. Std e: 0.1025. 
7 - Kernels: 40/130 (0.3076923076923077). SV: 494/513 (0.9629629629629629). Mean e: 0.0414. Median e: -0.0019. Std e: 0.1174. 
8 - Kernels: 11/130 (0.08461538



({'fit_time': array([11.5221, 11.6689, 11.5318, 11.7387, 12.2946, 11.4788, 11.8728, 11.5334, 11.4275, 13.479 ,
         12.1367, 11.552 , 12.1205, 11.6674, 11.7014, 11.895 , 11.6324, 11.7805, 11.5802, 11.4956]),
  'score_time': array([2.1831, 2.1232, 2.1996, 2.1868, 2.1791, 2.1853, 2.1519, 2.1701, 2.2201, 2.1908, 2.1837,
         2.1909, 2.1879, 2.1534, 2.2128, 2.1837, 2.2088, 2.1684, 2.1906, 2.1942]),
  'test_score': array([0.8947, 0.8713, 0.8655, 0.8412, 0.8655, 0.8538, 0.8538, 0.8941, 0.8713, 0.8246, 0.8772,
         0.8706, 0.9123, 0.8538, 0.8246, 0.8647, 0.8421, 0.807 , 0.8596, 0.8765]),
  'train_score': array([0.8926, 0.8789, 0.8906, 0.8889, 0.8867, 0.8887, 0.8906, 0.9025, 0.8711, 0.9043, 0.8926,
         0.8889, 0.877 , 0.8926, 0.8945, 0.885 , 0.8887, 0.8906, 0.8984, 0.8928])},
 [{'elapsed_time': 10.285613298416138,
   'nr_kernels_used': 38,
   'nr_sv_used': 473,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 10.426041841506958,
   'nr_kernels_used': 38,
   'nr

In [11]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.8612125902992774 +- 0.024700598913248353
Time: 10.496105480194093 +- 0.46087263632150594
Kernels: 35.65 +- 8.162566998193645
SVs: 470.4 +- 17.036431551237484


# SV-sparse model

In [12]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 130/130 (1.0). SV: 368/512 (0.71875). Mean e: -0.1898. Median e: -0.3622. Std e: 0.2647. 
1 - Kernels: 130/130 (1.0). SV: 349/512 (0.681640625). Mean e: -0.2153. Median e: -0.4076. Std e: 0.2914. 
2 - Kernels: 130/130 (1.0). SV: 276/512 (0.5390625). Mean e: -0.1904. Median e: -0.3553. Std e: 0.2515. 
3 - Kernels: 130/130 (1.0). SV: 407/513 (0.7933723196881092). Mean e: -0.1944. Median e: -0.3799. Std e: 0.2912. 
4 - Kernels: 130/130 (1.0). SV: 318/512 (0.62109375). Mean e: -0.1963. Median e: -0.3668. Std e: 0.2600. 
5 - Kernels: 130/130 (1.0). SV: 350/512 (0.68359375). Mean e: -0.1978. Median e: -0.3713. Std e: 0.2657. 
6 - Kernels: 130/130 (1.0). SV: 318/512 (0.62109375). Mean e: -0.1779. Median e: -0.3397. Std e: 0.2530. 
7 - Kernels: 130/130 (1.0). SV: 356/513 (0.6939571150097466). Mean e: -0.1885. Median e: -0.3576. Std e: 0.2587. 
8 - Kernels: 130/130 (1.0). SV: 381/512 (0.744140625). Mean e: -0.2081. Median e: -0.3984. Std e: 0.2939. 
9 - Kernels: 130/130 (1.0). SV: 



({'fit_time': array([12.0188, 11.939 , 12.343 , 11.7283, 11.7841, 11.4979, 12.0337, 11.6015, 11.4396, 11.7974,
         11.429 , 11.5803, 11.972 , 12.5936, 11.6243, 11.6443, 11.473 , 12.2484, 12.3794, 11.9563]),
  'score_time': array([2.1963, 2.1903, 2.1862, 2.147 , 2.196 , 2.1571, 2.1662, 2.1926, 2.1897, 2.1847, 2.2269,
         2.1815, 2.1371, 2.2862, 2.1561, 2.1911, 2.1707, 2.1454, 2.2134, 2.177 ]),
  'test_score': array([0.9006, 0.924 , 0.8538, 0.8824, 0.8772, 0.8421, 0.8772, 0.9294, 0.8772, 0.8947, 0.9123,
         0.8471, 0.848 , 0.8772, 0.883 , 0.9235, 0.9006, 0.883 , 0.8889, 0.8824]),
  'train_score': array([0.9023, 0.9023, 0.9082, 0.8967, 0.8965, 0.9375, 0.8887, 0.8928, 0.8984, 0.8926, 0.9043,
         0.9025, 0.9121, 0.8945, 0.8965, 0.8791, 0.8887, 0.9023, 0.8984, 0.9064])},
 [{'elapsed_time': 10.73509955406189,
   'nr_kernels_used': 130,
   'nr_sv_used': 368,
   'total_kernels': 130,
   'total_sv': 512},
  {'elapsed_time': 10.660489559173584,
   'nr_kernels_used': 130,
   'n

In [13]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.8852235982112144 +- 0.024704093509674938
Time: 10.55363906621933 +- 0.34097197672872154
Kernels: 130.0 +- 0.0
SVs: 334.8 +- 54.903187521308816


In [14]:
import json
with open('breast_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ssparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )