In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from copy import deepcopy
import itertools
import pickle

import pandas as pd
from scipy.io import savemat, loadmat
import seaborn as sns
from sklearn.metrics import (log_loss, mean_squared_error, roc_curve, auc,
                             precision_recall_fscore_support, confusion_matrix)
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from tqdm import tqdm, trange, tqdm_notebook as tqdmn

from BEKML import BEMKL, plot_distplot
from utils import poly_kernel, gauss_kernel, scoring, plot_kernel_importances, plot_compare_models

sns.set(style='ticks', context='talk')
np.set_printoptions(precision=4, linewidth=100)

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
dtypes = {i: float for i in range(35)}
dtypes[35] = str
data = pd.read_csv('data/ionosphere.csv', names=list(range(35)))#, dtype=dtypes)
data.loc[data.loc[:, 34] == 'g', 34] = 1
data.loc[data.loc[:, 34] == 'b', 34] = -1
data.loc[:, 34] = data.loc[:, 34].astype(int)
display(data.head())
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,-1
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,-1
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,0.891738,0.0,0.641342,0.044372,0.601068,0.115889,0.550095,0.11936,0.511848,0.181345,...,-0.071187,0.541641,-0.069538,0.378445,-0.027907,0.352514,-0.003794,0.349364,0.01448,0.282051
std,0.311155,0.0,0.497708,0.441435,0.519862,0.46081,0.492654,0.52075,0.507066,0.483851,...,0.508495,0.516205,0.550025,0.575886,0.507974,0.571483,0.513574,0.522663,0.468337,0.960769
min,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,1.0,0.0,0.472135,-0.064735,0.41266,-0.024795,0.21131,-0.05484,0.08711,-0.048075,...,-0.33239,0.286435,-0.443165,0.0,-0.236885,0.0,-0.242595,0.0,-0.16535,-1.0
50%,1.0,0.0,0.87111,0.01631,0.8092,0.0228,0.72873,0.01471,0.68421,0.01829,...,-0.01505,0.70824,-0.01769,0.49664,0.0,0.44277,0.0,0.40956,0.0,1.0
75%,1.0,0.0,1.0,0.194185,1.0,0.334655,0.96924,0.445675,0.95324,0.534195,...,0.156765,0.999945,0.153535,0.883465,0.154075,0.85762,0.20012,0.813765,0.17166,1.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()
N, D = X.shape
display(X.shape, y.shape, y.mean())

(351, 34)

(351,)

0.28205128205128205

In [6]:
proc_data = data.copy()
proc_data.iloc[:, -1] = y
proc_data.to_csv('data/proc_ionosphere.csv')

In [4]:
rbf_init, rbf_end = -3, 7
ply_init, ply_end = 1, 4

kernel_attrs = [('rbf', 'all', i) for i in range(rbf_init, rbf_end)]
kernels = [lambda A, B: gauss_kernel(A, B, 2**i)
           for i in range(rbf_init, rbf_end)]

kernel_attrs += [('poly', 'all', i) for i in range(ply_init, ply_end)]
kernels += [lambda A, B: poly_kernel(A, B, 1, i)
            for i in range(ply_init, ply_end)]

kernel_attrs += [('rbf', j, i) for i in range(rbf_init, rbf_end)
                 for j in range(D)]
kernels += [lambda A, B: gauss_kernel(A[:, j:j+1], B[:, j:j+1], 2**i)
            for i in range(rbf_init, rbf_end) for j in range(D)]

kernel_attrs += [('poly', j, i) for i in range(ply_init, ply_end)
                 for j in range(D)]
kernels += [lambda A, B: poly_kernel(A[:, j:j+1], B[:, j:j+1], 1, i)
            for i in range(ply_init, ply_end) for j in range(D)]
len(kernels), len(kernel_attrs)

(455, 455)

In [5]:
X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [6]:
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.fit_transform(X_test)

# Baseline Model

In [7]:
max_iter = 200
base_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                   hyp_gamma_alpha=1, hyp_gamma_beta=1,
                   hyp_omega_alpha=1, hyp_omega_beta=1,
                   e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                   filter_kernels=False, filter_sv=False, verbose=False,
                   max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
base_model = make_pipeline(Normalizer(), base_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
base_cv_results = cross_validate(base_model, X, y, cv=folds, scoring=scoring)
base_stats = deepcopy(scoring.stats)
base_cv_results, base_stats

0 - Kernels: 353/455 (0.7758241758241758). SV: 247/262 (0.9427480916030534). Mean e: -0.0099. Median e: -0.0190. Std e: 0.1232. 
1 - Kernels: 353/455 (0.7758241758241758). SV: 254/263 (0.9657794676806084). Mean e: -0.0037. Median e: -0.0155. Std e: 0.1129. 
2 - Kernels: 455/455 (1.0). SV: 260/264 (0.9848484848484849). Mean e: -0.0074. Median e: -0.0231. Std e: 0.1029. 
3 - Kernels: 455/455 (1.0). SV: 261/264 (0.9886363636363636). Mean e: 0.0009. Median e: -0.0234. Std e: 0.1150. 
4 - Kernels: 455/455 (1.0). SV: 257/262 (0.9809160305343512). Mean e: -0.0032. Median e: -0.0199. Std e: 0.0996. 
5 - Kernels: 455/455 (1.0). SV: 257/263 (0.9771863117870723). Mean e: 0.0010. Median e: -0.0233. Std e: 0.1232. 
6 - Kernels: 353/455 (0.7758241758241758). SV: 260/264 (0.9848484848484849). Mean e: -0.0097. Median e: -0.0214. Std e: 0.1095. 
7 - Kernels: 455/455 (1.0). SV: 260/264 (0.9848484848484849). Mean e: 0.0000. Median e: -0.0241. Std e: 0.1183. 
8 - Kernels: 353/455 (0.7758241758241758). SV:



({'fit_time': array([19.0565, 13.0283, 12.8575, 12.6382, 12.5832, 12.7646, 12.8212, 13.0716, 12.791 , 12.6556,
         13.1319, 13.3079, 12.518 , 12.529 , 12.4953, 13.7928, 12.4653, 12.5789, 12.685 , 12.5692]),
  'score_time': array([2.0993, 1.6388, 1.6471, 1.6336, 1.6465, 1.6377, 1.6868, 1.641 , 1.6353, 1.6404, 1.6911,
         1.6216, 1.6342, 1.6639, 1.6407, 1.6313, 1.6166, 1.629 , 1.6144, 1.6345]),
  'test_score': array([0.8764, 0.8977, 0.8276, 0.8851, 0.9438, 0.8182, 0.8851, 0.8506, 0.9438, 0.9205, 0.8276,
         0.8276, 0.8539, 0.875 , 0.8506, 0.9195, 0.9101, 0.9318, 0.8621, 0.7816]),
  'train_score': array([0.9962, 0.9772, 0.9659, 0.9811, 0.9695, 0.981 , 0.9848, 0.9811, 0.9656, 0.9886, 0.9924,
         0.9773, 0.9771, 0.9734, 0.9735, 0.9886, 0.9733, 0.9886, 0.9886, 0.9886])},
 [{'elapsed_time': 17.876033544540405,
   'nr_kernels_used': 353,
   'nr_sv_used': 247,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 11.930845022201538,
   'nr_kernels_used': 353,
   '

In [8]:
base_times = np.array([s['elapsed_time'] for s in base_stats])
base_kernels = np.array([s['nr_kernels_used'] for s in base_stats])
base_sv = np.array([s['nr_sv_used'] for s in base_stats])
print(
    f"Score: {base_cv_results['test_score'].mean()} +- {base_cv_results['test_score'].std()}\n"
    f"Time: {base_times.mean()} +- {base_times.std()}\n"
    f"Kernels: {base_kernels.mean()} +- {base_kernels.std()}\n"
    f"SVs: {base_sv.mean()} +- {base_sv.std()}"
)

Score: 0.8744256542566307 +- 0.044269840764805356
Time: 12.056291925907136 +- 1.3728212811823715
Kernels: 434.6 +- 40.8
SVs: 255.35 +- 3.79835490706174


# Kernel-sparse model

In [9]:
max_iter = 200
ksparse_model = BEMKL(kernels=kernels, hyp_lambda_alpha=1, hyp_lambda_beta=1,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1e-11, hyp_omega_beta=1e9,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ksparse_pipeline = make_pipeline(Normalizer(), ksparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ksparse_cv_results = cross_validate(ksparse_pipeline, X, y, cv=folds, scoring=scoring)
ksparse_stats = deepcopy(scoring.stats)
ksparse_cv_results, ksparse_stats

0 - Kernels: 3/455 (0.006593406593406593). SV: 249/262 (0.950381679389313). Mean e: 0.0090. Median e: -0.0002. Std e: 0.1108. 
1 - Kernels: 3/455 (0.006593406593406593). SV: 255/263 (0.9695817490494296). Mean e: 0.0086. Median e: -0.0001. Std e: 0.1031. 
2 - Kernels: 3/455 (0.006593406593406593). SV: 255/264 (0.9659090909090909). Mean e: 0.0098. Median e: -0.0002. Std e: 0.1064. 
3 - Kernels: 3/455 (0.006593406593406593). SV: 263/264 (0.9962121212121212). Mean e: 0.0083. Median e: -0.0002. Std e: 0.1021. 
4 - Kernels: 3/455 (0.006593406593406593). SV: 259/262 (0.9885496183206107). Mean e: 0.0091. Median e: -0.0002. Std e: 0.1124. 
5 - Kernels: 3/455 (0.006593406593406593). SV: 262/263 (0.9961977186311787). Mean e: 0.0095. Median e: -0.0002. Std e: 0.1170. 
6 - Kernels: 3/455 (0.006593406593406593). SV: 258/264 (0.9772727272727273). Mean e: 0.0085. Median e: -0.0002. Std e: 0.1061. 
7 - Kernels: 105/455 (0.23076923076923078). SV: 251/264 (0.9507575757575758). Mean e: 0.0116. Median e: -



({'fit_time': array([12.8814, 12.5282, 12.7707, 12.5508, 12.1169, 12.5121, 12.346 , 12.3196, 12.7863, 12.6238,
         12.6619, 12.6043, 12.4087, 12.996 , 12.5492, 12.5955, 12.5875, 12.6918, 12.6889, 12.7538]),
  'score_time': array([1.6292, 1.6319, 1.6571, 1.6448, 1.6348, 1.6369, 1.6403, 1.636 , 1.6721, 1.6476, 1.6315,
         1.7303, 1.6347, 1.6636, 1.6771, 1.6495, 1.6671, 1.6648, 1.6185, 1.7122]),
  'test_score': array([0.8539, 0.9091, 0.908 , 0.931 , 0.9101, 0.8636, 0.8621, 0.8161, 0.8652, 0.9205, 0.8736,
         0.8506, 0.8989, 0.8295, 0.8391, 0.8966, 0.8876, 0.8636, 0.8736, 0.8966]),
  'train_score': array([0.9962, 0.9734, 0.9735, 0.9735, 0.9733, 0.9772, 0.9811, 0.9848, 0.9885, 0.9734, 0.9659,
         0.9924, 0.9695, 0.981 , 0.9924, 0.9659, 0.9771, 0.9886, 0.9773, 0.9697])},
 [{'elapsed_time': 11.85596776008606,
   'nr_kernels_used': 3,
   'nr_sv_used': 249,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 11.50810718536377,
   'nr_kernels_used': 3,
   'nr_sv_

In [10]:
ksparse_times = np.array([s['elapsed_time'] for s in ksparse_stats])
ksparse_kernels = np.array([s['nr_kernels_used'] for s in ksparse_stats])
ksparse_sv = np.array([s['nr_sv_used'] for s in ksparse_stats])
print(
    f"Score: {ksparse_cv_results['test_score'].mean()} +- {ksparse_cv_results['test_score'].std()}\n"
    f"Time: {ksparse_times.mean()} +- {ksparse_times.std()}\n"
    f"Kernels: {ksparse_kernels.mean()} +- {ksparse_kernels.std()}\n"
    f"SVs: {ksparse_sv.mean()} +- {ksparse_sv.std()}"
)

Score: 0.8774610205111948 +- 0.030515527451685762
Time: 11.55106861591339 +- 0.19486447788724603
Kernels: 8.1 +- 22.23038461205744
SVs: 254.9 +- 4.31161222746202


# SV-sparse model

In [11]:
max_iter = 200
ssparse_model = BEMKL(kernels=kernels,
                      hyp_lambda_alpha=1e-11, hyp_lambda_beta=1e9,
                      hyp_gamma_alpha=1, hyp_gamma_beta=1,
                      hyp_omega_alpha=1, hyp_omega_beta=1,
                      e_null_thrsh=1e-2, a_null_thrsh=1e-2,
                      filter_kernels=False, filter_sv=False, verbose=False,
                      max_iter=max_iter, hyperopt_enabled=False, calculate_bounds=False)
ssparse_pipeline = make_pipeline(Normalizer(), ssparse_model)

scoring.iteration = 0
scoring.stats = []
folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=5)
ssparse_cv_results = cross_validate(ssparse_pipeline, X, y, cv=folds, scoring=scoring)
ssparse_stats = deepcopy(scoring.stats)
ssparse_cv_results, ssparse_stats

0 - Kernels: 115/455 (0.25274725274725274). SV: 79/262 (0.3015267175572519). Mean e: 0.0143. Median e: -0.0088. Std e: 0.1016. 
1 - Kernels: 3/455 (0.006593406593406593). SV: 74/263 (0.2813688212927757). Mean e: 0.0050. Median e: -0.0072. Std e: 0.1075. 
2 - Kernels: 13/455 (0.02857142857142857). SV: 87/264 (0.32954545454545453). Mean e: 0.0047. Median e: -0.0041. Std e: 0.0934. 
3 - Kernels: 115/455 (0.25274725274725274). SV: 83/264 (0.3143939393939394). Mean e: -0.0012. Median e: -0.0082. Std e: 0.1016. 
4 - Kernels: 353/455 (0.7758241758241758). SV: 67/262 (0.25572519083969464). Mean e: -0.0007. Median e: -0.0113. Std e: 0.1069. 
5 - Kernels: 455/455 (1.0). SV: 90/263 (0.34220532319391633). Mean e: 0.0050. Median e: -0.0160. Std e: 0.0956. 
6 - Kernels: 13/455 (0.02857142857142857). SV: 69/264 (0.26136363636363635). Mean e: 0.0046. Median e: -0.0046. Std e: 0.1046. 
7 - Kernels: 13/455 (0.02857142857142857). SV: 91/264 (0.3446969696969697). Mean e: 0.0107. Median e: 0.0033. Std e: 0



({'fit_time': array([12.5261, 12.485 , 12.5159, 12.5742, 12.2896, 12.4813, 12.9318, 12.8103, 12.5811, 12.5133,
         12.4392, 12.7345, 12.4365, 12.5706, 12.6374, 13.0286, 12.492 , 12.5688, 12.4701, 12.9897]),
  'score_time': array([1.6526, 1.6405, 1.6177, 1.6451, 1.6385, 1.7552, 1.6339, 1.6521, 1.6433, 1.641 , 1.6362,
         1.6445, 1.6403, 1.6368, 1.6641, 1.6808, 1.657 , 1.6344, 1.6159, 1.6467]),
  'test_score': array([0.8315, 0.8295, 0.8851, 0.8506, 0.7753, 0.875 , 0.8276, 0.8736, 0.8315, 0.8409, 0.8621,
         0.8046, 0.8876, 0.8409, 0.8851, 0.8506, 0.8539, 0.9091, 0.8966, 0.7816]),
  'train_score': array([0.9924, 1.    , 0.9886, 0.9962, 0.9885, 0.9848, 0.9962, 0.9886, 0.9885, 0.9962, 0.9886,
         0.9848, 0.9924, 0.962 , 0.9773, 0.9886, 0.9924, 0.9924, 0.9886, 0.9886])},
 [{'elapsed_time': 11.487034559249878,
   'nr_kernels_used': 115,
   'nr_sv_used': 79,
   'total_kernels': 455,
   'total_sv': 262},
  {'elapsed_time': 11.433372735977173,
   'nr_kernels_used': 3,
   'nr_

In [12]:
ssparse_times = np.array([s['elapsed_time'] for s in ssparse_stats])
ssparse_kernels = np.array([s['nr_kernels_used'] for s in ssparse_stats])
ssparse_sv = np.array([s['nr_sv_used'] for s in ssparse_stats])
print(
    f"Score: {ssparse_cv_results['test_score'].mean()} +- {ssparse_cv_results['test_score'].std()}\n"
    f"Time: {ssparse_times.mean()} +- {ssparse_times.std()}\n"
    f"Kernels: {ssparse_kernels.mean()} +- {ssparse_kernels.std()}\n"
    f"SVs: {ssparse_sv.mean()} +- {ssparse_sv.std()}"
)

Score: 0.8496235602831884 +- 0.03533264454769601
Time: 11.562711155414581 +- 0.1900325688305627
Kernels: 191.7 +- 187.2103362531033
SVs: 75.6 +- 9.723168207945392


In [13]:
import json
with open('ionosphere_results.json', 'w') as fp:
    json.dump(
        {
            'ksparse': {
                'scores': list(ksparse_cv_results['test_score']),
                'times': list(ksparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ksparse_sv],
            },
            'ssparse': {
                'scores': list(ssparse_cv_results['test_score']),
                'times': list(ssparse_times),
                'kernels': [int(k) for k in ksparse_kernels],
                'svs': [int(s) for s in ssparse_sv],
            },
            'base': {
                'scores': list(base_cv_results['test_score']),
                'times': list(base_times),
                'kernels': [int(k) for k in base_kernels],
                'svs': [int(s) for s in base_sv],
            },
            'total_kernels': len(kernels),
            'total_sv': len(X_train),
        },
        fp,
        indent=4,
        sort_keys=True
    )