In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# from sklearn import datasets

import numpy as np

In [2]:
# function to time the CV method
import time
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        print('%r : %2.2f sec' %
              (method.__name__, te - ts))
        return result

    return timed

In [3]:
@timeit
def k_fold_CV(model, X, y, folds):
    acc = np.mean(cross_val_score(model, X, y, cv=folds))
    print("model with accuracy:", acc)
    return acc

## Load the data

We load two different datasets. Both are in one-hot encoding form but were generated through different methods. We just want to see what works better before 

In [4]:
from data.datasets import CannabisOneHot
from data.datasets import CannabisDummies

In [6]:
one_hot = CannabisOneHot()
Xoh, yoh = one_hot.generate()
dummies = CannabisDummies()
Xd, yd = dummies.generate()

In [7]:
# create a baseline model
linear_svm = svm.SVC(kernel = "linear", C = 1)
radial_svm = svm.SVC(kernel = "rbf", C = 1)

#### Now, compare

In [9]:
k = 10
print("One Hot data:")
k_fold_CV(linear_svm, Xoh, yoh, k)
k_fold_CV(radial_svm, Xoh, yoh, k)

print("Dummy data:")
k_fold_CV(linear_svm, Xd, yd, k)
k_fold_CV(radial_svm, Xd, yd, k)

One Hot data:
model with accuracy: 0.9146388699020276
'k_fold_CV' : 1.41 sec
model with accuracy: 0.9517372977899292
'k_fold_CV' : 2.24 sec
Dummy data:
model with accuracy: 0.9147243107769423
'k_fold_CV' : 0.89 sec
model with accuracy: 0.9313966735019367
'k_fold_CV' : 1.66 sec


0.9313966735019367

## Custom kernels

Next we have some kernel functions to train the svm. These are speccially designed to deal with categorical data.

In [77]:
def overlap(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = Xi == Y
        G[i, :] = np.sum(Xi, axis=1)
    return G

def k0(X, Y): # what is this?
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = Xi == Y # prevf
        G[i, :] = np.mean(Xi, axis=1)
    return G # postf f(G)

from sklearn.metrics import jaccard_score

def jaccard(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = Xi == Y
        eq = np.sum(Xi, axis = 1)
        G[i, :] = eq / (2*xn-eq)
    return G # postf f(G)

def jaccard2(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    ytraits = np.sum(Y, axis=1)
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        xtraits = np.sum(Xi, axis = 1)
        Xi = Xi == Y
        eq = np.sum(Xi, axis = 1)
        G[i, :] = eq / (xtraits+ytraits-eq)
    return G

def k0prime(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = Xi == Y
        G[i, :] = np.mean(Xi, axis=1)
    gamma = 1
    return np.exp(gamma * G)

In [88]:
def smc(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = Xi == Y
        G[i, :] = np.sum(Xi, axis=1) / xn
    return G

In [84]:
X_tr, X_te, y_tr, y_te = dummies.train_test_split()

In [85]:
jac = svm.SVC(kernel = smc, C = 1).fit(X_tr, y_tr)
jac.score(X_te, y_te)

361 361
361 361


0.964824120603015

In [81]:
k0prime_svm = svm.SVC(kernel = k0prime, C = 1)
k0_svm = svm.SVC(kernel = k0, C = 1)
jaccard1_svm = svm.SVC(kernel = jaccard, C = 1)
jaccard2_svm = svm.SVC(kernel = jaccard2, C = 1)

k = 5
print("One Hot data:")
k_fold_CV(k0prime_svm, Xoh, yoh, k)
k_fold_CV(k0_svm, Xoh, yoh, k)
k_fold_CV(jaccard1_svm, Xoh, yoh, k)
k_fold_CV(jaccard2_svm, Xoh, yoh, k)


print("Dummy data:")
k_fold_CV(k0prime_svm, Xd, yd, k)
k_fold_CV(k0_svm, Xd, yd, k)
k_fold_CV(jaccard1_svm, Xd, yd, k)
k_fold_CV(jaccard2_svm, Xd, yd, k)

One Hot data:
model with accuracy: 0.9365637507146941
'k_fold_CV' : 8.70 sec
model with accuracy: 0.9373156089193826
'k_fold_CV' : 7.97 sec
model with accuracy: 0.9267524299599771
'k_fold_CV' : 6.02 sec
model with accuracy: 0.907887364208119
'k_fold_CV' : 7.40 sec
Dummy data:
model with accuracy: 0.9313036020583191
'k_fold_CV' : 7.97 sec
model with accuracy: 0.9305488850771869
'k_fold_CV' : 7.90 sec
model with accuracy: 0.9335677530017152
'k_fold_CV' : 6.40 sec
model with accuracy: 0.921483704974271
'k_fold_CV' : 7.42 sec


0.921483704974271

In [87]:
def jaccard(X, Y):
    xm, xn = X.shape
    ym, yn = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((xm, ym))
    for i in range(xm):
        Xi = np.tile(X[i], (ym, 1))
        Xi = jaccard_score(Xi.T, Y.T, average = None)
        G[i, :] = Xi
    return G

In [76]:
jaccard(X_train, X_train)

926 516 926 516


array([[1.        , 0.73737374, 0.68627451, ..., 0.73737374, 0.73737374,
        0.75510204],
       [0.73737374, 1.        , 0.75510204, ..., 0.71144279, 0.72      ,
        0.74619289],
       [0.68627451, 0.75510204, 1.        , ..., 0.73737374, 0.75510204,
        0.72864322],
       ...,
       [0.73737374, 0.71144279, 0.73737374, ..., 1.        , 0.72864322,
        0.71144279],
       [0.73737374, 0.72      , 0.75510204, ..., 0.72864322, 1.        ,
        0.7029703 ],
       [0.75510204, 0.74619289, 0.72864322, ..., 0.71144279, 0.7029703 ,
        1.        ]])

In [89]:
smc_svm = svm.SVC(kernel = smc, C=1)
k_fold_CV(smc_svm, Xoh, yoh, k)

model with accuracy: 0.9373156089193826
'k_fold_CV' : 9.35 sec


0.9373156089193826