In [1]:
# scikit learn
from sklearn.svm import SVC as svmModel
# numpy
import numpy as np

In [2]:
# datasets
from data.datasets import CannabisOneHot
from data.datasets import CannabisDummies

In [3]:
# implementations of simple kernels
import kernels.one_hot_kernels as ohk
from utils import k_fold_CV # simple function to validate with k-fold CV 

## First approach with Expanded and One-Hot encoding

In our first approach to the problem we tried to deal with categorical data changing the encoding to a Counter of occurences (expanded) and Dummy variables (one-hot). The kernels presented are implemented specifically for this kind of encoding.

In [16]:
expanded = CannabisOneHot()
dummy = CannabisDummies()

Xe, ye = expanded.generate()
Xd, yd = dummy.generate()

### Baseline kernels

The first one is the linear kernel.

In [17]:
linear_model = svmModel(kernel = "linear", C = 1)
print("Accuracy with expanded encoding", k_fold_CV(linear_model, Xe, ye, 5))
print("Accuracy with dummy encoding", k_fold_CV(linear_model, Xd, yd, 5))

Accuracy with expanded encoding 0.9025671812464265
Accuracy with dummy encoding 0.8995740423098914


The Radial basis function

In [18]:
RBF_model = svmModel(kernel = "rbf", C = 1)
print("Accuracy with expanded encoding", k_fold_CV(RBF_model, Xe, ye, 5))
print("Accuracy with dummy encoding", k_fold_CV(RBF_model, Xd, yd, 5))

Accuracy with expanded encoding 0.9335477415666096
Accuracy with dummy encoding 0.9222498570611778


### Categorical kernels

The fist one is Simple Matching Coefficient

In [19]:
smc1_model = svmModel(kernel = ohk.smc, C=1)
print("Accuracy with expanded encoding", k_fold_CV(smc1_model, Xe, ye, 5))
print("Accuracy with dummy encoding", k_fold_CV(smc1_model, Xd, yd, 5))

Accuracy with expanded encoding 0.9373156089193826
Accuracy with dummy encoding 0.9305488850771869


Then with Jaccard

In [22]:
jaccard_model = svmModel(kernel = ohk.jaccard, C=1)
print("Accuracy with expanded encoding", k_fold_CV(jaccard_model, Xe, ye, 5))
print("Accuracy with dummy encoding", k_fold_CV(jaccard_model, Xd, yd, 5))

Accuracy with expanded encoding 0.907887364208119
Accuracy with dummy encoding 0.921483704974271


And the last one is the K_0'

In [23]:
k0_model = svmModel(kernel = ohk.k0prime, C=1)
print("Accuracy with expanded encoding", k_fold_CV(k0_model, Xe, ye, 5))
print("Accuracy with dummy encoding", k_fold_CV(k0_model, Xd, yd, 5))

Accuracy with expanded encoding 0.9365637507146941
Accuracy with dummy encoding 0.9313036020583191


## Second approach with categorical data

The original dataset has been codified in a matrix of pairs (3D array in numpy) where each pair is a microsatellite. This way the kernels take advantage of this feature. Now we don't compute base line kernels because we need custom kernels.

In [4]:
from data.datasets import CannabisGenotype

In [5]:
paired_data = CannabisGenotype()

X, y = paired_data.generate()

Moreover the kernels will be loaded from another file.

In [6]:
import kernels.kernels3d as k3d

### Categorical kernels

The first one is the simple one, the Simple matching coefficient that considers both alleles.

In [7]:
smc_model = svmModel(kernel = k3d.smc, C=1)
print("Accuracy:", k_fold_CV(smc_model, X, y, 5))

Accuracy: 0.9471526586620926


The $k_0'$ kernel

In [8]:
k0_model = svmModel(kernel = k3d.k0prime, C=1)
print("Accuracy:", k_fold_CV(k0_model, X, y, 5))

Accuracy: 0.9433733562035449


### The custom kernel

After seeing that we have better results with the original encoding we decided to design a new kernel function described in the report. The implementation is as follows:

In [10]:
def compare(array):
    return array[0] == array[1]


def combined(X, Y):
    gamma = 1
    x1, x2, x3 = X.shape
    y1, y2, y3 = Y.shape
    # Compute the kernel matrix:
    G = np.zeros((x1, y1))
    Ysame = np.apply_along_axis(compare, 2, Y)
    Yrev = np.apply_along_axis(np.flip, 2, Y)
    for i in range(x1):
        Xi = np.tile(X[i], (y1, 1, 1))
        # equality kernel
        Xisame = np.apply_along_axis(compare, 2, Xi)
        equality = 2 * np.sum(Xisame == Ysame, axis = 1) / x2
        # crossed SMC
        Xirev = np.apply_along_axis(np.flip, 2, Xi)
        crossedSMC = np.sum(np.all(Xirev==Yrev, axis = 2), axis = 1) / x2
        # Normal kernel or something
        Xi = np.all(Xi == Y, axis = 2)
        k1 = np.sum(Xi, axis=1)

        G[i, :] = 0.8 * k1 / x2 + 0.1 * equality + 0.1 * crossedSMC
    return G # postf f(G)

And although it is slower to validate, the results are good:

In [12]:
combined_model = svmModel(kernel = combined, C=1)
print("Accuracy:", k_fold_CV(combined_model, X, y, 5))

Accuracy: 0.9471526586620926


## Non kernel methods

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from datasets.data import CannabisGenotype2



In [11]:
rfc_model = RandomForestClassifier(n_estimators=100)
print("Accuracy:", np.mean(cross_val_score(rfc_model, X, y, cv=5)))

Accuracy: nan


Traceback (most recent call last):
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    accept_sparse="csc", dtype=DTYPE)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 802, in check_X_y
    estimator=estimator)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/Users/jordi/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 