In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import numpy as np

In [2]:
import kernels as kn
from utils import k_fold_CV
from utils import timed_k_fold_CV

In [3]:
from data.datasets import CannabisGenotype2
from data.datasets import CannabisGenotype

In [4]:
# Build the dataset object and pull out the two arrays used by every later cell.
# presumably X holds the samples and y the class labels (both are passed to the CV helper) — confirm in data.datasets
gen = CannabisGenotype2()
X, y = gen.generate()

In [5]:
# Check the dimensions: every kernel defined below unpacks exactly three axes from its inputs.
X.shape

(1324, 13, 2)

In [6]:
def my_kernel_all(X, Y):
    """Gram matrix of the fraction of axis-1 positions that match exactly.

    For each pair ``(X[i], Y[j])`` a position along axis 1 counts as a match
    only when *all* entries along the last axis are equal element-by-element.
    The number of matching positions is divided by ``X.shape[1]``.

    Parameters
    ----------
    X : ndarray with exactly three axes, shape ``(x1, x2, x3)``.
    Y : ndarray with exactly three axes, comparable element-wise with ``X[i]``.

    Returns
    -------
    ndarray of float, shape ``(x1, y1)``.

    Raises
    ------
    ValueError
        If either input does not have exactly three axes (tuple unpacking).
    """
    n_x, n_positions, _ = X.shape
    n_y, _, _ = Y.shape
    G = np.zeros((n_x, n_y))
    for i in range(n_x):
        # Broadcasting compares X[i] against every sample of Y at once,
        # without materialising n_y tiled copies of the row.
        same = X[i] == Y
        full_match = np.all(same, axis=2)
        G[i, :] = np.sum(full_match, axis=1) / n_positions
    return G

def my_kernel_any(X, Y):
    """Gram matrix of the fraction of axis-1 positions with at least one match.

    A position along axis 1 counts for the pair ``(X[i], Y[j])`` when *any*
    entry along the last axis is equal at the same index. The count is
    divided by ``X.shape[1]``.

    Both inputs must have exactly three axes; the result has shape
    ``(X.shape[0], Y.shape[0])``.
    """
    n_rows, n_positions, _ = X.shape
    n_cols, _, _ = Y.shape
    gram = np.zeros((n_rows, n_cols))
    for row, sample in enumerate(X):
        # Repeat the current sample once per Y sample so the comparison is element-wise.
        repeated = np.tile(sample, (n_cols, 1, 1))
        equal = repeated == Y
        partial_hit = np.any(equal, axis=2)
        gram[row, :] = np.sum(partial_hit, axis=1) / n_positions
    return gram

def smc(X, Y):
    """Gram matrix of whole-position matches, normalised by ``X.shape[1]``.

    Per position: k([a1, a2], [b1, b2]) = (a1 == b1) and (a2 == b2), i.e. the
    whole microsatellite is compared. The per-pair value is the number of
    positions that match this way divided by the length of X's axis 1.

    Both inputs must have exactly three axes; the result has shape
    ``(X.shape[0], Y.shape[0])``.
    """
    num_x, num_positions, _ = X.shape
    num_y, _, _ = Y.shape
    out = np.zeros((num_x, num_y))
    for k in range(num_x):
        stacked = np.tile(X[k], (num_y, 1, 1))
        elementwise_equal = stacked == Y
        whole_match = np.all(elementwise_equal, axis=2)
        matches_per_pair = np.sum(whole_match, axis=1)
        out[k, :] = matches_per_pair / num_positions
    return out

def count(X, Y):
    """Gram matrix of element-wise match counts, scaled by ``X.shape[1]``.

    Per position: k([a1, a2], [b1, b2]) = (a1 == b1) + (a2 == b2). These
    counts are summed over axis 1 and divided by the length of X's axis 1
    only, so an entry can exceed 1 when the last axis has more than one
    element.

    Both inputs must have exactly three axes; the result has shape
    ``(X.shape[0], Y.shape[0])``.
    """
    n_left, width, _ = X.shape
    n_right, _, _ = Y.shape
    gram = np.zeros((n_left, n_right))
    for idx, sample in enumerate(X):
        equal = np.tile(sample, (n_right, 1, 1)) == Y
        # Number of equal entries within each position, then over all positions.
        hits_per_position = np.sum(equal, axis=2)
        gram[idx, :] = np.sum(hits_per_position, axis=1) / width
    return gram

def count2(X, Y):
    """Gram matrix of halved element-wise match counts, scaled by ``X.shape[1]``.

    Per position: k([a1, a2], [b1, b2]) = ((a1 == b1) + (a2 == b2)) / 2.
    The halved counts are summed over axis 1 and divided by the length of
    X's axis 1. The divisor 2 is a literal, not derived from the input shape.

    Both inputs must have exactly three axes; the result has shape
    ``(X.shape[0], Y.shape[0])``.
    """
    rows, positions, _ = X.shape
    cols, _, _ = Y.shape
    result = np.zeros((rows, cols))
    for r in range(rows):
        tiled = np.tile(X[r], (cols, 1, 1))
        agree = tiled == Y
        half_counts = np.sum(agree, axis=2) / 2
        result[r, :] = np.sum(half_counts, axis=1) / positions
    return result

def k0prime(X, Y, gamma=1/4):
    """Exponentiated exact-match kernel: ``exp(gamma * S)``.

    ``S[i, j]`` is the fraction of axis-1 positions at which ``X[i]`` and
    ``Y[j]`` agree on *every* entry of the last axis, i.e. per position
    k([a1, a2], [b1, b2]) = (a1 == b1) and (a2 == b2), normalised by
    ``X.shape[1]``.

    Parameters
    ----------
    X : ndarray with exactly three axes, shape ``(x1, x2, x3)``.
    Y : ndarray with exactly three axes, comparable element-wise with ``X[i]``.
    gamma : float, optional
        Multiplier applied to ``S`` before exponentiation. Defaults to 1/4,
        the value that was previously hard-coded, so two-argument calls
        (as made by ``svm.SVC``) behave as before.

    Returns
    -------
    ndarray of float, shape ``(x1, y1)``.

    Raises
    ------
    ValueError
        If either input does not have exactly three axes (tuple unpacking).
    """
    n_x, n_positions, _ = X.shape
    n_y, _, _ = Y.shape
    G = np.zeros((n_x, n_y))
    for i in range(n_x):
        # Broadcasting compares X[i] with every sample of Y without tiling.
        whole_match = np.all(X[i] == Y, axis=2)
        G[i, :] = np.sum(whole_match, axis=1) / n_positions
    return np.exp(gamma * G)

def jaccard(X, Y):
    """Return ``exp(0.25 * S)`` where ``S`` is the exact-match fraction.

    ``S[i, j]`` is the share of axis-1 positions at which ``X[i]`` and
    ``Y[j]`` agree on every entry of the last axis, divided by ``X.shape[1]``.

    NOTE(review): despite the name, no intersection-over-union (Jaccard)
    quantity is computed in this body; it is an exponentiated exact-match
    score. Confirm whether a real Jaccard kernel was intended.

    Both inputs must have exactly three axes; the result has shape
    ``(X.shape[0], Y.shape[0])``.
    """
    n_first, n_positions, _ = X.shape
    n_second, _, _ = Y.shape
    match_fraction = np.zeros((n_first, n_second))
    for pos, sample in enumerate(X):
        replicated = np.tile(sample, (n_second, 1, 1))
        equal = replicated == Y
        # A position counts only when all of its entries are equal.
        exact = np.all(equal, axis=2)
        match_fraction[pos, :] = np.sum(exact, axis=1) / n_positions
    scale = 1 / 4
    return np.exp(scale * match_fraction)

In [7]:
# One SVC per custom kernel callable, all built with the same C=1.
# `many` is constructed here but not evaluated in the cells shown below.
mall, many, count_model, cmod2, k0 = (
    svm.SVC(kernel=kernel_fn, C=1)
    for kernel_fn in (my_kernel_all, my_kernel_any, count, count2, k0prime)
)

In [8]:
# Evaluate the my_kernel_all SVM with the timed CV helper from utils.
# presumably 5 is the number of folds and the printed value is the mean score — confirm in utils
print(timed_k_fold_CV(mall, X, y, 5))

'timed_k_fold_CV' : 6.09 sec
0.9214951400800458


In [9]:
# Same evaluation for the SVM built on the `count` kernel.
# presumably 5 is the number of folds — confirm in utils
print(timed_k_fold_CV(count_model, X, y, 5))

'timed_k_fold_CV' : 4.15 sec
0.912438536306461


In [10]:
# Same evaluation for the SVM built on the `count2` kernel.
# presumably 5 is the number of folds — confirm in utils
print(timed_k_fold_CV(cmod2, X, y, 5))

'timed_k_fold_CV' : 4.30 sec
0.9184762721555174


In [11]:
# Same evaluation for the SVM built on the `k0prime` kernel.
# presumably 5 is the number of folds — confirm in utils
print(timed_k_fold_CV(k0, X, y, 5))

'timed_k_fold_CV' : 4.69 sec
0.9297855917667238


In [12]:
# Display the raw array; the recorded output shows string entries (dtype '<U3') grouped in pairs.
# NOTE(review): this echoes the whole array — a slice such as X[:3] would keep the output short.
X

array([[['204', '206'],
        ['246', '249'],
        ['142', '142'],
        ...,
        ['139', '171'],
        ['239', '242'],
        ['317', '317']],

       [['164', '228'],
        ['207', '249'],
        ['142', '142'],
        ...,
        ['171', '171'],
        ['239', '239'],
        ['315', '315']],

       [['206', '206'],
        ['234', '246'],
        ['142', '142'],
        ...,
        ['171', '171'],
        ['239', '239'],
        ['320', '320']],

       ...,

       [['174', '192'],
        ['249', '252'],
        ['145', '145'],
        ...,
        ['204', '204'],
        ['242', '242'],
        ['329', '329']],

       [['156', '174'],
        ['231', '252'],
        ['145', '145'],
        ...,
        ['204', '204'],
        ['242', '245'],
        ['329', '329']],

       [['172', '190'],
        ['231', '252'],
        ['151', '151'],
        ...,
        ['204', '204'],
        ['242', '242'],
        ['332', '332']]], dtype='<U3')

In [191]:
# NOTE(review): `a` is not defined in any cell shown here and the execution count jumps to 191,
# so this scratch cell relies on leftover kernel state and will fail on Restart & Run All.
# Define `a` explicitly in this cell or remove the cell.
np.unique(a, axis = 2)

array([[[1, 1],
        [1, 1]],

       [[2, 2],
        [3, 4]]])