In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
dtypes = {i: float for i in range(35)}
dtypes[35] = str
data = pd.read_csv('data/ionosphere.csv', names=list(range(35)))#, dtype=dtypes)
data.loc[data.loc[:, 34] == 'g', 34] = 1
data.loc[data.loc[:, 34] == 'b', 34] = -1
data.loc[:, 34] = data.loc[:, 34].astype(int)
display(data.head())
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,-1
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,-1
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,0.891738,0.0,0.641342,0.044372,0.601068,0.115889,0.550095,0.11936,0.511848,0.181345,...,-0.071187,0.541641,-0.069538,0.378445,-0.027907,0.352514,-0.003794,0.349364,0.01448,0.282051
std,0.311155,0.0,0.497708,0.441435,0.519862,0.46081,0.492654,0.52075,0.507066,0.483851,...,0.508495,0.516205,0.550025,0.575886,0.507974,0.571483,0.513574,0.522663,0.468337,0.960769
min,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,1.0,0.0,0.472135,-0.064735,0.41266,-0.024795,0.21131,-0.05484,0.08711,-0.048075,...,-0.33239,0.286435,-0.443165,0.0,-0.236885,0.0,-0.242595,0.0,-0.16535,-1.0
50%,1.0,0.0,0.87111,0.01631,0.8092,0.0228,0.72873,0.01471,0.68421,0.01829,...,-0.01505,0.70824,-0.01769,0.49664,0.0,0.44277,0.0,0.40956,0.0,1.0
75%,1.0,0.0,1.0,0.194185,1.0,0.334655,0.96924,0.445675,0.95324,0.534195,...,0.156765,0.999945,0.153535,0.883465,0.154075,0.85762,0.20012,0.813765,0.17166,1.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()
N, D = X.shape
display(X.shape, y.shape, y.mean())

(351, 34)

(351,)

0.28205128205128205

In [4]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_ionosphere.csv')

In [5]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(455, 455)

In [6]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [7]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [8]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 455/455 (1.0). SV: 257/262 (0.9809160305343512). Mean e: -0.0022. Median e: -0.0227. Std e: 0.1192. 
1 - Kernels: 455/455 (1.0). SV: 253/263 (0.9619771863117871). Mean e: -0.0021. Median e: -0.0184. Std e: 0.1042. 
2 - Kernels: 455/455 (1.0). SV: 259/264 (0.9810606060606061). Mean e: -0.0072. Median e: -0.0248. Std e: 0.1163. 
3 - Kernels: 455/455 (1.0). SV: 255/264 (0.9659090909090909). Mean e: -0.0031. Median e: -0.0240. Std e: 0.1167. 
4 - Kernels: 455/455 (1.0). SV: 254/262 (0.9694656488549618). Mean e: 0.0013. Median e: -0.0227. Std e: 0.1218. 
5 - Kernels: 455/455 (1.0). SV: 246/263 (0.935361216730038). Mean e: -0.0035. Median e: -0.0187. Std e: 0.1035. 
6 - Kernels: 455/455 (1.0). SV: 254/264 (0.9621212121212122). Mean e: -0.0060. Median e: -0.0229. Std e: 0.1108. 
7 - Kernels: 455/455 (1.0). SV: 258/264 (0.9772727272727273). Mean e: -0.0073. Median e: -0.0249. Std e: 0.1132. 
8 - Kernels: 455/455 (1.0). SV: 252/262 (0.9618320610687023). Mean e: -0.0058. Median e: -



({'fit_time': array([16.3186, 19.5719, 21.1818, 16.1789, 15.9457, 13.0516, 13.3368, 13.0482, 12.6117, 12.7089,
         13.2758, 13.0589, 13.3763, 12.9491, 12.7268, 12.8185, 12.7006, 13.0645, 12.895 , 12.7197]),
  'score_time': array([1.9763, 1.6714, 1.7425, 1.7386, 1.6494, 1.6661, 1.6496, 1.6596, 1.6697, 1.6744, 1.6673,
         1.6657, 1.6418, 1.6697, 1.6595, 1.6654, 1.6819, 1.6723, 1.6643, 1.6365]),
  'test_score': array([0.9101, 0.8636, 0.908 , 0.908 , 0.9213, 0.8068, 0.908 , 0.8621, 0.8764, 0.8523, 0.908 ,
         0.8506, 0.8315, 0.8864, 0.908 , 0.8621, 0.8652, 0.8636, 0.8851, 0.8736]),
  'train_score': array([0.9885, 0.9772, 0.9811, 0.9811, 0.9809, 0.981 , 0.9848, 0.9811, 0.9809, 0.981 , 0.9735,
         0.9886, 0.9847, 0.9658, 0.9811, 0.9811, 0.9847, 0.9696, 0.9773, 0.9773])},
 [{'elapsed_time': 15.266514301300049,
   'nr_kernels_used': 455,
   'nr_sv_used': 257,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 18.506811141967773,
   'nr_kernels_used': 455,
   '

In [9]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.8775392436570273 +- 0.029405353239401272
Time: 13.122817647457122 +- 2.354564825686717
Kernels: 449.9 +- 22.230384612057435
SVs: 254.65 +- 3.785168424258028


# Kernel-sparse model

In [10]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 3/455 (0.006593406593406593). SV: 258/262 (0.9847328244274809). Mean e: 0.0089. Median e: -0.0001. Std e: 0.1028. 
1 - Kernels: 3/455 (0.006593406593406593). SV: 249/263 (0.9467680608365019). Mean e: 0.0089. Median e: -0.0002. Std e: 0.1106. 
2 - Kernels: 3/455 (0.006593406593406593). SV: 258/264 (0.9772727272727273). Mean e: 0.0088. Median e: -0.0002. Std e: 0.1078. 
3 - Kernels: 3/455 (0.006593406593406593). SV: 245/264 (0.928030303030303). Mean e: 0.0089. Median e: -0.0002. Std e: 0.1108. 
4 - Kernels: 3/455 (0.006593406593406593). SV: 253/262 (0.9656488549618321). Mean e: 0.0089. Median e: -0.0001. Std e: 0.1004. 
5 - Kernels: 3/455 (0.006593406593406593). SV: 256/263 (0.973384030418251). Mean e: 0.0087. Median e: -0.0002. Std e: 0.1071. 
6 - Kernels: 3/455 (0.006593406593406593). SV: 251/264 (0.9507575757575758). Mean e: 0.0089. Median e: -0.0002. Std e: 0.1123. 
7 - Kernels: 3/455 (0.006593406593406593). SV: 257/264 (0.9734848484848485). Mean e: 0.0092. Median e: -0.



({'fit_time': array([12.7769, 13.1763, 13.2464, 13.2482, 12.5824, 12.6649, 13.0838, 13.3394, 12.8283, 13.1353,
         12.7233, 13.2776, 12.9565, 13.2046, 12.673 , 12.7503, 12.6731, 13.1342, 13.0239, 12.6545]),
  'score_time': array([1.666 , 1.6676, 1.675 , 1.6555, 1.661 , 1.6608, 1.6783, 1.677 , 1.6782, 1.6541, 1.6541,
         1.6765, 1.679 , 1.6827, 1.664 , 1.772 , 1.6981, 1.6735, 1.653 , 1.6889]),
  'test_score': array([0.8652, 0.8409, 0.908 , 0.8621, 0.8539, 0.9205, 0.908 , 0.8506, 0.8202, 0.8523, 0.8966,
         0.9425, 0.8989, 0.9091, 0.931 , 0.8276, 0.9438, 0.8295, 0.8621, 0.8276]),
  'train_score': array([0.9733, 0.9886, 0.9735, 0.9848, 0.9847, 0.981 , 0.9886, 0.9735, 0.9847, 0.9924, 0.9735,
         0.9735, 0.9771, 0.9924, 0.9924, 0.9773, 0.9771, 0.9886, 0.9848, 0.9848])},
 [{'elapsed_time': 11.674834251403809,
   'nr_kernels_used': 3,
   'nr_sv_used': 258,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 12.122445344924927,
   'nr_kernels_used': 3,
   'nr_s

In [11]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.8775193576602914 +- 0.0396667525776768
Time: 11.90651068687439 +- 0.24474368790924794
Kernels: 3.0 +- 0.0
SVs: 255.5 +- 4.031128874149275


# SV-sparse model

In [12]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 455/455 (1.0). SV: 81/262 (0.30916030534351147). Mean e: 0.0034. Median e: -0.0106. Std e: 0.0898. 
1 - Kernels: 115/455 (0.25274725274725274). SV: 75/263 (0.28517110266159695). Mean e: 0.0098. Median e: -0.0023. Std e: 0.1011. 
2 - Kernels: 13/455 (0.02857142857142857). SV: 61/264 (0.23106060606060605). Mean e: 0.0010. Median e: -0.0067. Std e: 0.0964. 
3 - Kernels: 455/455 (1.0). SV: 84/264 (0.3181818181818182). Mean e: 0.0063. Median e: -0.0127. Std e: 0.1012. 
4 - Kernels: 115/455 (0.25274725274725274). SV: 85/262 (0.3244274809160305). Mean e: 0.0054. Median e: -0.0093. Std e: 0.1042. 
5 - Kernels: 13/455 (0.02857142857142857). SV: 65/263 (0.24714828897338403). Mean e: 0.0109. Median e: 0.0013. Std e: 0.1025. 
6 - Kernels: 455/455 (1.0). SV: 68/264 (0.25757575757575757). Mean e: 0.0046. Median e: -0.0119. Std e: 0.1027. 
7 - Kernels: 353/455 (0.7758241758241758). SV: 65/264 (0.24621212121212122). Mean e: -0.0014. Median e: -0.0132. Std e: 0.0959. 
8 - Kernels: 3/455 (0



({'fit_time': array([12.3501, 13.5023, 14.8478, 12.6714, 12.3837, 12.6628, 13.1383, 13.1392, 13.1558, 12.6334,
         12.85  , 12.9905, 13.2905, 12.9656, 12.588 , 12.8194, 12.9544, 13.0225, 12.8811, 12.6993]),
  'score_time': array([1.6879, 1.6964, 1.6409, 1.6426, 1.6407, 1.6976, 1.6506, 1.6805, 1.6671, 1.6706, 1.6679,
         1.666 , 1.6697, 1.7762, 1.6706, 1.7252, 1.6722, 1.6559, 1.6711, 1.664 ]),
  'test_score': array([0.8315, 0.8636, 0.9425, 0.7931, 0.8989, 0.9205, 0.8506, 0.7586, 0.8315, 0.8409, 0.8736,
         0.8276, 0.8315, 0.8523, 0.8851, 0.8736, 0.8652, 0.8636, 0.7701, 0.8391]),
  'train_score': array([0.9771, 1.    , 0.9773, 0.9962, 0.9924, 0.9924, 0.9962, 0.9848, 0.9885, 1.    , 0.9848,
         0.9886, 0.9924, 0.9848, 0.9848, 0.9962, 0.9924, 0.9924, 1.    , 0.9924])},
 [{'elapsed_time': 11.33696985244751,
   'nr_kernels_used': 455,
   'nr_sv_used': 81,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 12.456539392471313,
   'nr_kernels_used': 115,
   'nr

In [13]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.8506564580324751 +- 0.04390731667720741
Time: 11.929100894927979 +- 0.5092721980526712
Kernels: 190.5 +- 168.30552575598938
SVs: 70.1 +- 8.642337646724988


In [14]:
import json
with open('ionosphere_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ssparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )