# Studying SVM on different datasets

 imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import svmlab
from matplotlib import colors
#from sklearn.datasets import make_moons, make_circles
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import itertools as it
import os

auxiliary functions

In [2]:
def mkdirs_that_not_exist(dir_list):
    """Create every directory in ``dir_list`` that does not exist yet.

    Missing parent directories are created as well. Directories that already
    exist are left untouched, so the call is safe to repeat.

    Args:
        dir_list: iterable of directory paths (strings).

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    for directory in dir_list:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test followed by os.makedirs().
        os.makedirs(directory, exist_ok=True)

def make_n_blobs(n_blobs, n_samples, cluster_std=0.1, standard_scale=True):
    """Generate a 2-D blobs dataset with 1 to 4 clusters at fixed centers.

    Args:
        n_blobs: number of clusters; must be 1, 2, 3 or 4.
        n_samples: value forwarded to ``make_blobs`` as ``n_samples``.
        cluster_std: value forwarded to ``make_blobs`` as ``cluster_std``.
        standard_scale: when True (default), the features are passed through
            ``StandardScaler().fit_transform`` before being returned; when
            False the raw ``make_blobs`` output is returned.

    Returns:
        ``(x_data, y_data)`` as produced by ``make_blobs`` (features
        optionally standard-scaled).

    Raises:
        ValueError: if ``n_blobs`` is not one of 1, 2, 3, 4.
    """
    centers_by_count = {
        1: [[0, 0]],
        2: [[1, 1], [-1, -1]],
        3: [[1, 1], [-1, -1], [1, -1]],
        4: [[1, 1], [-1, -1], [1, -1], [-1, 1]],
    }
    if n_blobs not in centers_by_count:
        raise ValueError('n_blobs must be one of 1, 2, 3, 4; got %r' % (n_blobs,))
    centers = centers_by_count[n_blobs]

    # random_state is fixed so repeated calls yield the same dataset
    x_data, y_data = make_blobs(n_samples=n_samples, centers=centers, cluster_std=cluster_std, random_state=0)
    if standard_scale:
        x_data = StandardScaler().fit_transform(x_data)

    return x_data, y_data

develop base directory structure

In [3]:
# Root under which every generated artifact is written; the empty string
# makes all paths relative to the current working directory.
path_to_base_dir = ''
study_dir = '%sstudy/' % path_to_base_dir

# One sub-directory per dataset family.
blobs_dir, moons_dir, circles_dir = [
    study_dir + dataset_name for dataset_name in ('blobs/', 'moons/', 'circles/')
]

mkdirs_that_not_exist([study_dir])
mkdirs_that_not_exist([blobs_dir, moons_dir, circles_dir])

necessary ranges and lists

In [4]:
# Accepted spellings for the two SVM implementations.
svc_aliases = ['svc', 'SVC', 'c_svc']
nusvc_aliases = ['nusvc', 'NuSVC', 'nu_svc']

# Kernels for which a gamma sweep is run (see the animation cell).
kernels_with_gamma = ['rbf', 'poly', 'sigmoid']

# Dataset parameters to iterate over.
n_blobs_range = [2, 3, 4]
n_samples_range = [10, 50, 250]
cluster_std_range = [0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]

# Implementations and kernels to iterate over.
svm_impls = ['svc', 'nusvc']
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Static ranges: the same value repeated 50 times, used to hold one
# parameter fixed while another one is swept over its dynamic range.
static_C_range = [1.0] * 50
static_nu_range = [0.5] * 50
static_gamma_range = [1.0] * 50

# Dynamic ranges: 50 values each (log-spaced for C and gamma, linear for nu).
dynamic_C_range = np.logspace(-3, 6, num=50)
dynamic_nu_range = np.linspace(0.01, 0.9, num=50)
dynamic_gamma_range = np.logspace(-6, 3, num=50)

# Grid search ranges: 13 values per parameter.
grid_C_range = np.logspace(-3, 9, num=13)
grid_nu_range = np.linspace(0.01, 0.9, num=13)
grid_gamma_range = np.logspace(-9, 3, num=13)

# blobs

sub-directories

In [5]:
# Output locations for the three blob experiments in this section.
n_samples_dir = blobs_dir + 'n_samples/'
cluster_std_dir = blobs_dir + 'cluster_std/'
animations_dir = blobs_dir + 'animations/'
# animations_dir is created here together with its siblings so that all
# three experiment directories exist up front.
mkdirs_that_not_exist([n_samples_dir, cluster_std_dir, animations_dir])

#### Dynamic number of samples

In [8]:
# For every (number of blobs, SVM implementation, samples per blob)
# combination: grid-search the RBF kernel parameters on a fresh blobs
# dataset, then plot the SVM using the best parameters found.
# NOTE(review): the recorded runs all report the lowest C/gamma grid corner
# with a score of 1.00 -- possibly a tie resolved by grid order; confirm.
for n_blobs, svm_impl, kernel, n_samples in it.product(n_blobs_range, svm_impls, ['rbf'], n_samples_range):
    print('== BLOBS: %d' % n_blobs)
    print('-- Grid searching ' + svm_impl + ' ' + kernel + ' kernel on ' + str(n_samples) + '-sample blobs')

    # different directory for each number of blobs
    n_blobs_dir = n_samples_dir + str(n_blobs) + '_blobs/'
    mkdirs_that_not_exist([n_blobs_dir])

    # make blobs dataset (n_samples points per blob)
    x_data, y_data = make_n_blobs(n_blobs, (n_blobs * n_samples), cluster_std=0.5)

    # initialize svmlab with the blobs dataset
    lab = svmlab.SVMLab(x_data, y_data)

    # the implementation decides which parameter is searched next to gamma
    if svm_impl in svc_aliases:
        penalty_param, penalty_range = 'C', grid_C_range
    elif svm_impl in nusvc_aliases:
        penalty_param, penalty_range = 'nu', grid_nu_range
    else:
        # explicit error instead of `assert False`, which `python -O` strips
        raise ValueError('unknown svm implementation: %r' % (svm_impl,))

    # set up dict arguments
    svm_ = dict(impl=svm_impl, kernel=kernel, dfs='ovr')
    range_ = {penalty_param: penalty_range, 'gamma': grid_gamma_range}

    # find optimal parameters
    heatmap_fn = n_blobs_dir + svm_impl + '_' + kernel + '_' + str(n_samples) + '_samples_search.png'
    best_params, best_score = lab.optimal_param_grid_search(
        svm_=svm_,
        range_=range_,
        heatmap_=dict(filename=heatmap_fn, norm=colors.PowerNorm(gamma=10.))
    )
    print('   ... saved grid search heatmap to file: %s' % heatmap_fn)
    print(' * the best parameters are %s with a score of %0.2f' % (best_params, best_score))

    # carry the best parameters over into the svm arguments
    svm_[penalty_param] = best_params[penalty_param]
    svm_['gamma'] = best_params['gamma']

    # plot svm with optimal parameters
    plot_fn = n_blobs_dir + svm_impl + '_' + kernel + '_' + str(n_samples) + '_samples_optimal_plot.png'
    lab.svm_plot(
        svm_=svm_,
        plot_=dict(filename=plot_fn)
    )
    print('   ... saved optimal %s plot to file: %s' % (svm_impl, plot_fn))

    # release figures between iterations
    plt.close('all')

== BLOBS: 2
-- Grid searching svc rbf kernel on 10-sample blobs
   ... saved grid search heatmap to file: study/blobs/n_samples/2_blobs/svc_rbf_10_samples_search.png
 * the best parameters are {'C': 0.001, 'gamma': 1.0000000000000001e-09} with a score of 1.00
   ... saved optimal svc plot to file: study/blobs/n_samples/2_blobs/svc_rbf_10_samples_optimal_plot.png
== BLOBS: 2
-- Grid searching svc rbf kernel on 50-sample blobs
   ... saved grid search heatmap to file: study/blobs/n_samples/2_blobs/svc_rbf_50_samples_search.png
 * the best parameters are {'C': 0.001, 'gamma': 1.0000000000000001e-09} with a score of 1.00
   ... saved optimal svc plot to file: study/blobs/n_samples/2_blobs/svc_rbf_50_samples_optimal_plot.png
== BLOBS: 2
-- Grid searching svc rbf kernel on 250-sample blobs
   ... saved grid search heatmap to file: study/blobs/n_samples/2_blobs/svc_rbf_250_samples_search.png
 * the best parameters are {'C': 0.001, 'gamma': 1.0000000000000001e-09} with a score of 1.00
   ... s

#### Dynamic cluster standard deviation

In [7]:
# For every (SVM implementation, cluster standard deviation) combination:
# grid-search the RBF kernel parameters on the training split, score the
# best parameters on the held-out split, then plot the resulting SVM.
# only doing n_blobs == [2] and kernels == ['rbf'] due to time constraint
for n_blobs, svm_impl, kernel, cluster_std in it.product([2], svm_impls, ['rbf'], cluster_std_range):
    print('== BLOBS: %d' % n_blobs)
    print('-- Grid searching ' + svm_impl + ' ' + kernel + ' kernel on ' + str(cluster_std) + '-cluster-std blobs')

    # different directory for each number of blobs
    n_blobs_dir = cluster_std_dir + str(n_blobs) + '_blobs/'
    mkdirs_that_not_exist([n_blobs_dir])

    # make blobs dataset
    x_data, y_data = make_n_blobs(n_blobs, n_blobs * 100, cluster_std=cluster_std)

    # split into training and testing sets (80/20, stratified by label)
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, stratify=y_data, random_state=42)

    # initialize svmlab with the training part of the blobs dataset
    lab = svmlab.SVMLab(x_train, y_train)

    # the implementation decides which parameter is searched next to gamma
    if svm_impl in svc_aliases:
        penalty_param, penalty_range = 'C', grid_C_range
    elif svm_impl in nusvc_aliases:
        penalty_param, penalty_range = 'nu', grid_nu_range
    else:
        # explicit error instead of `assert False`, which `python -O` strips
        raise ValueError('unknown svm implementation: %r' % (svm_impl,))

    # set up dict arguments
    svm_ = dict(impl=svm_impl, kernel=kernel, dfs='ovr')
    range_ = {penalty_param: penalty_range, 'gamma': grid_gamma_range}

    # find optimal parameters
    heatmap_fn = n_blobs_dir + svm_impl + '_' + kernel + '_' + str(cluster_std) + '_cstd_search.png'
    best_params, best_score = lab.optimal_param_grid_search(
        svm_=svm_,
        range_=range_,
        heatmap_=dict(filename=heatmap_fn, norm=None)
    )
    print('   ... saved grid search heatmap to file: %s' % heatmap_fn)
    print(' * the best parameters are %s with a score of %0.2f' % (best_params, best_score))

    # carry the best parameters over into the svm arguments
    svm_[penalty_param] = best_params[penalty_param]
    svm_['gamma'] = best_params['gamma']

    # test svm fit with optimal parameters on the held-out test split
    print('-- Testing optimal fit against test blobs')
    score, n_supports = lab.svm_test(x_test, y_test, svm_=svm_)
    print(' * score: %0.2f \t number of supports: %d' % (score, n_supports))

    # plot svm with optimal parameters
    plot_fn = n_blobs_dir + svm_impl + '_' + kernel + '_' + str(cluster_std) + '_cstd_optimal_plot.png'
    lab.svm_plot(
        svm_=svm_,
        plot_=dict(filename=plot_fn)
    )
    print('   ... saved optimal %s plot to file: %s' % (svm_impl, plot_fn))

    # release figures between iterations
    plt.close('all')

== BLOBS: 2
-- Grid searching svc rbf kernel on 0.4-cluster-std blobs
   ... saved grid search heatmap to file: study/blobs/cluster_std/2_blobs/svc_rbf_0.4_cstd_search.png
 * the best parameters are {'C': 0.001, 'gamma': 1.0000000000000001e-09} with a score of 1.00
-- Testing optimal fit against test blobs
 * score: 1.00 	 number of supports: 160
   ... saved optimal svc plot to file: study/blobs/cluster_std/2_blobs/svc_rbf_0.4_cstd_optimal_plot.png
== BLOBS: 2
-- Grid searching svc rbf kernel on 0.6-cluster-std blobs
   ... saved grid search heatmap to file: study/blobs/cluster_std/2_blobs/svc_rbf_0.6_cstd_search.png
 * the best parameters are {'C': 0.001, 'gamma': 1.0000000000000001e-09} with a score of 1.00
-- Testing optimal fit against test blobs
 * score: 1.00 	 number of supports: 160
   ... saved optimal svc plot to file: study/blobs/cluster_std/2_blobs/svc_rbf_0.6_cstd_optimal_plot.png
== BLOBS: 2
-- Grid searching svc rbf kernel on 0.8-cluster-std blobs
   ... saved grid sear

#### Animations

In [8]:
# interval between frames (for animation)
interval = 0.3 #in seconds

# For every (number of blobs, SVM implementation, kernel) combination,
# animate the SVM while one parameter sweeps its dynamic range and the
# other one is held at its static value.
for n_blobs, svm_impl, kernel in it.product(n_blobs_range, svm_impls, kernels):
    print('== BLOBS: %d' % n_blobs)
    # different directory for each number of blobs
    n_blobs_dir = animations_dir + str(n_blobs) + '_blobs/'
    mkdirs_that_not_exist([n_blobs_dir])

    # make blobs dataset
    x_data, y_data = make_n_blobs(n_blobs, n_blobs * 100, cluster_std=0.6)

    # initialize svmlab with the blobs dataset
    lab = svmlab.SVMLab(x_data, y_data)

    # the implementation decides which parameter is swept next to gamma
    if svm_impl in svc_aliases:
        penalty_param = 'C'
        dynamic_penalty_range, static_penalty_range = dynamic_C_range, static_C_range
    elif svm_impl in nusvc_aliases:
        penalty_param = 'nu'
        dynamic_penalty_range, static_penalty_range = dynamic_nu_range, static_nu_range
    else:
        # explicit error instead of `assert False`, which `python -O` strips
        raise ValueError('unknown svm implementation: %r' % (svm_impl,))

    # set up dict arguments
    svm_ = dict(impl=svm_impl, kernel=kernel, dfs='ovr')

    # Each sweep pairs its filename suffix with its parameter ranges, so a
    # skipped sweep can never be matched with the wrong suffix.
    sweeps = [
        ('_dynamic_' + penalty_param,
         {penalty_param: dynamic_penalty_range, 'gamma': static_gamma_range}),
    ]
    # the gamma sweep only runs for the kernels listed in kernels_with_gamma
    if kernel in kernels_with_gamma:
        sweeps.append(
            ('_dynamic_gamma',
             {penalty_param: static_penalty_range, 'gamma': dynamic_gamma_range})
        )

    # make animation
    print('-- Animating ' + svm_impl + ', ' + kernel + ' kernel')
    for partial_fn, range_ in sweeps:
        animation_fn = n_blobs_dir + svm_impl + '_' + kernel + partial_fn + '.mp4'
        lab.svm_animation(
            svm_=svm_,
            range_=range_,
            animation_=dict(filename=animation_fn, interval=interval)
        )
        print('   ... saved %s %s %s animation to file: %s' % (svm_impl, kernel, partial_fn, animation_fn))

    # release figures between iterations
    plt.close('all')

== BLOBS: 2
-- Animating svc, linear kernel
   ... saved svc linear _dynamic_C animation to file: study/blobs/animations/2_blobs/svc_linear_dynamic_C.mp4
== BLOBS: 2
-- Animating svc, poly kernel
   ... saved svc poly _dynamic_C animation to file: study/blobs/animations/2_blobs/svc_poly_dynamic_C.mp4
   ... saved svc poly _dynamic_gamma animation to file: study/blobs/animations/2_blobs/svc_poly_dynamic_gamma.mp4
== BLOBS: 2
-- Animating svc, rbf kernel
   ... saved svc rbf _dynamic_C animation to file: study/blobs/animations/2_blobs/svc_rbf_dynamic_C.mp4
   ... saved svc rbf _dynamic_gamma animation to file: study/blobs/animations/2_blobs/svc_rbf_dynamic_gamma.mp4
== BLOBS: 2
-- Animating svc, sigmoid kernel
   ... saved svc sigmoid _dynamic_C animation to file: study/blobs/animations/2_blobs/svc_sigmoid_dynamic_C.mp4
   ... saved svc sigmoid _dynamic_gamma animation to file: study/blobs/animations/2_blobs/svc_sigmoid_dynamic_gamma.mp4
== BLOBS: 2
-- Animating nusvc, linear kernel
   .

## miscellaneous

In [26]:
misc_dir = study_dir + 'misc/'
# Create the directory up front: the cells below write files into it, and
# plt.savefig fails if the target directory does not exist.
mkdirs_that_not_exist([misc_dir])

4 blobs, NuSVC with RBF kernel: `nu` and `gamma` are hand-picked ("optimized" by eye) for each dataset size, because the `best_params` returned by GridSearchCV seemed very wrong.

In [17]:
# Hand-picked (total number of samples, nu, gamma) per dataset size.
handpicked_configs = [
    (40, .53, 10.),
    (200, .53, 1.0e-08),
    (1000, .38, .001),
]

# For each configuration: build a 4-blob dataset, hold out 20% for testing,
# plot the NuSVC/RBF fit on the training part and print the test score
# together with the number of support vectors.
for n_total, nu, gamma in handpicked_configs:
    blobs_x, blobs_y = make_n_blobs(4, n_total, cluster_std=0.5)
    x_train, x_test, y_train, y_test = train_test_split(blobs_x, blobs_y, test_size=0.2, stratify=blobs_y, random_state=42)
    blobs_fn = misc_dir + 'blobs_4_%d.png' % n_total
    lab = svmlab.SVMLab(x_train, y_train)
    svm_ = dict(impl='nusvc', kernel='rbf', dfs='ovr', nu=nu, gamma=gamma)
    lab.svm_plot(svm_=svm_, plot_=dict(filename=blobs_fn))
    score, n_supports = lab.svm_test(x_test, y_test, svm_=svm_)
    print(score, n_supports)

plt.close('all')

1.0 29
0.925 88
0.965 489


3 blobs, NuSVC with polynomial kernel: create an animation over a custom, narrow gamma range to better analyze the behavior.

In [24]:
# 50 evenly spaced gamma values between 0.005 and 0.015; nu is held at its
# static value for every frame.
custom_gamma_range = np.linspace(0.005, 0.015, num=50)
animation_fn = misc_dir + 'nusvc_poly_gamma_custom.mp4'

# 3-blob dataset (300 samples in total) for the NuSVC / polynomial kernel.
blobs_3_300_x, blobs_3_300_y = make_n_blobs(3, 300, cluster_std=0.6)
lab = svmlab.SVMLab(blobs_3_300_x, blobs_3_300_y)

lab.svm_animation(
    svm_={'impl': 'nusvc', 'kernel': 'poly', 'dfs': 'ovr'},
    range_={'nu': static_nu_range, 'gamma': custom_gamma_range},
    animation_={'filename': animation_fn, 'interval': 0.3},
)

plt.close('all')

Plot the raw sample blobs (before fitting any SVM).

In [25]:
# Save one scatter plot of the raw dataset per number of blobs.
for n_blobs in [2, 3, 4]:
    x_data, y_data = make_n_blobs(n_blobs, n_blobs * 200, cluster_std=0.6)
    # A fresh figure per dataset: drawing on the implicit current figure
    # would carry the previous datasets' points over into later images.
    fig, ax = plt.subplots()
    ax.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
    fig.savefig('%s%d_blobs.png' % (misc_dir, n_blobs))
    plt.close(fig)

plt.close('all')