# Run Experiments --- Non-Parametric Representation Learning with Kernels

# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle, islice


# import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics.cluster import adjusted_rand_score

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
import sklearn
from sklearn import neighbors
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.utils import shuffle
from sklearn import manifold
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import LeaveOneOut
from matplotlib.lines import Line2D

# import scipy
from scipy.linalg import eigh as largest_eigh
from scipy.linalg import sqrtm as sqrtm
from scipy.sparse.linalg.eigen import eigsh as largest_eigsh

# Mount Google Drive at /content/gdrive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/gdrive')

# IPython magic: change the working directory to the shared project folder.
%cd /content/gdrive/MyDrive/Kernel_Representation_Shared/

  from scipy.sparse.linalg.eigen import eigsh as largest_eigsh


Mounted at /content/gdrive
/content/gdrive/MyDrive/Kernel_Representation_Shared


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Bare expression: displays the selected device as the notebook cell output.
device

'cpu'

# Helper Functions

In [None]:
##### Helper functions for Kernels

def kernel_function(kernel_name, X, Y,gamma):
    """Evaluate the kernel called ``kernel_name`` between ``X`` and ``Y``.

    Parameters
    ----------
    kernel_name : str
        One of 'Gaussian', 'Laplacian', 'Sigmoid', 'Linear', 'ReLU', 'expchi2'.
    X, Y : array-like
        Inputs handed unchanged to the selected kernel routine.
    gamma : float
        Kernel parameter forwarded to the selected routine. It is not used
        by the 'Linear' kernel; for 'ReLU' it is passed to ``ReLU_Kernel``.

    Returns
    -------
    The kernel matrix produced by the selected routine.

    Raises
    ------
    ValueError
        If ``kernel_name`` is not one of the supported names. (Previously a
        message was printed and the function then failed on an unbound
        local variable.)
    """
    if kernel_name == 'Gaussian':
        return sklearn.metrics.pairwise.rbf_kernel(X, Y, gamma=gamma)
    if kernel_name == 'Laplacian':
        return sklearn.metrics.pairwise.laplacian_kernel(X, Y, gamma=gamma)
    if kernel_name == 'Sigmoid':
        return sklearn.metrics.pairwise.sigmoid_kernel(X, Y, gamma=gamma)
    if kernel_name == 'Linear':
        return sklearn.metrics.pairwise.linear_kernel(X, Y)
    if kernel_name == 'ReLU':
        return ReLU_Kernel(X, Y, gamma)
    if kernel_name == 'expchi2':
        return sklearn.metrics.pairwise.chi2_kernel(X, Y, gamma=gamma)

    raise ValueError('Kernel name not defined: {!r}'.format(kernel_name))

##### Define Kernels

def laplacian_kernel_manual(x,y):
    """Return exp(-gamma * np.linalg.norm(x - y, 1)) for one pair of inputs.

    NOTE: ``gamma`` is not a parameter; it is resolved as a global name at
    call time, so it must be defined before this function is called.
    """
    difference = x - y
    distance = np.linalg.norm(difference, 1)
    return np.exp(-gamma * distance)


##### Define one hidden layer ReLU Kernel - See supplementary material or Bietti and Bach 2021

def kappa_0(u):
    """Return (1/pi) * (pi - arccos(u)), evaluated elementwise by numpy."""
    complement_angle = np.pi - np.arccos(u)
    return 1/np.pi * complement_angle

def kappa_1(u):
    """Return (1/pi) * (u * (pi - arccos(u)) + sqrt(1 - u**2)), elementwise."""
    angle_term = u * (np.pi - np.arccos(u))
    root_term = np.sqrt(1 - u**2)
    return 1/np.pi * (angle_term + root_term)

def ReLU_Kernel(x,y,gamma):
    """One-hidden-layer ReLU kernel: u * kappa_0(u) + kappa_1(u).

    ``u`` is ``(x @ y.T) / gamma`` clipped to [-1, 1]. ``gamma`` acts as a
    normalizer, because the kernel expects inner products of norm-one
    inputs.

    Parameters
    ----------
    x, y : np.ndarray
        Inputs whose product ``x @ y.T`` is defined. For 2-D inputs the
        result is a matrix; 1-D vectors give a scalar.
    gamma : float
        Divisor applied to the inner products before clipping.

    Returns
    -------
    Kernel value(s) with the shape of ``x @ y.T``.
    """
    # Non-in-place clip: the former ``out=inp`` form fails when the product
    # is a numpy scalar (1-D inputs), since ``out`` must be an array.
    inp = np.clip((x @ y.T) / gamma, -1, 1)

    return inp * kappa_0(inp) + kappa_1(inp)

# ReLU Kernel inputs have to have input norm one. use this to normalise if necessary
def ReLU_Kernel_normalize(X):
    """Return the largest pairwise inner product between rows of ``X``.

    The value is used as the ``gamma`` normalizer of the ReLU kernel.

    Parameters
    ----------
    X : np.ndarray, 2-D
        One sample per row.

    Returns
    -------
    Scalar ``max_{i,j} <X[i], X[j]>``.
    """
    # A single Gram product replaces the former Python double loop over
    # all row pairs.
    return np.max(X @ X.T)

def ReLU_Kernel_normalize_2(X,Y):
    """Return the largest inner product of a row of ``X`` with a row of ``X`` or ``Y``.

    Used as the ``gamma`` normalizer of the ReLU kernel when kernels are
    evaluated both within ``X`` and between ``X`` and ``Y``.

    Parameters
    ----------
    X, Y : np.ndarray, 2-D
        One sample per row, same number of columns. ``Y`` may have zero rows.

    Returns
    -------
    Scalar ``max(max_{i,j} <X[i], X[j]>, max_{i,j} <X[i], Y[j]>)``.
    """
    # Stacking X and Y evaluates both families of pairs in one product and
    # still works when Y has no rows.
    candidates = np.vstack((X, Y))
    return np.max(X @ candidates.T)

# get normalization for the relu kernel

def contrastive_simple_ReLU_normalizer(data0, datam, datap):
    """Return the largest inner product among reference, negative and positive samples.

    The value is used as the ``gamma`` normalizer of the ReLU kernel in the
    simple contrastive setting, where kernels are evaluated between every
    combination of the three sample sets.

    Parameters
    ----------
    data0, datam, datap : np.ndarray, 2-D
        Reference, negative and positive samples, one per row, with the
        same number of columns.

    Returns
    -------
    Scalar maximum of ``<a, b>`` over all rows ``a``, ``b`` taken from the
    three sets.
    """
    # One Gram matrix of the stacked samples covers every pair the former
    # nested loops enumerated (transposed blocks contain the same values).
    stacked = np.vstack((data0, datam, datap))
    return np.max(stacked @ stacked.T)

##### Split the data such that we obtain train / test sets as well as positive and negative samples

def create_data_split(X,Y, N_lab,N_test,random_state,noise):
    """Split the data and build reference / positive / negative samples.

    Parameters
    ----------
    X, Y : all data and all labels.
    N_lab : fraction; ``int(X.shape[0] * N_lab)`` leading training rows form
        the labelled subset (the count is based on the size of ``X``).
    N_test : forwarded to ``train_test_split`` as ``test_size``.
    random_state : forwarded to ``train_test_split``.
    noise : scale of the standard-normal perturbation added to the positives.

    Returns
    -------
    X_train, X_lab, Y_lab, X_test, Y_test, X0, Xp, Xm

    NOTE: the perturbation and the shuffle draw from numpy's global random
    state; ``random_state`` is only passed to ``train_test_split``.
    """
    split = train_test_split(X, Y, test_size=N_test, random_state=random_state)
    X_train, X_test, Y_train, Y_test = split

    # Reference points are the training points themselves.
    X0 = X_train.copy()

    # Positives: reference plus Gaussian noise.
    perturbation = noise * np.random.normal(0, 1, X_train.shape)
    Xp = X_train.copy() + perturbation

    # Negatives: an independently shuffled copy of the training points.
    Xm = X_train.copy()
    np.random.shuffle(Xm)

    n_labelled = int(X.shape[0] * N_lab)
    X_lab = X_train[:n_labelled, :]
    Y_lab = Y_train[:n_labelled]

    return X_train, X_lab, Y_lab, X_test, Y_test, X0, Xp, Xm

# Define Kernel Functions

## Simple contrastive

We follow directly the setting defined in Theorem 3.

We use p, m as +, -.
Define the kernel matrices as
$$
\begin{array}{ll}
\boldsymbol{K}=\left[k\left(x_i, x_j\right)\right]_{i, j} & \boldsymbol{K}_{-}=\left[k\left(x_i, x_j^{-}\right)\right]_{i, j} \\
\boldsymbol{K}_{+}=\left[k\left(x_i, x_j^{+}\right)\right]_{i, j} & \boldsymbol{K}_{--}=\left[k\left(x_i^{-}, x_j^{-}\right)\right]_{i, j} \\
\boldsymbol{K}_{++}=\left[k\left(x_i^{+}, x_j^{+}\right)\right]_{i, j} & \boldsymbol{K}_{-+}=\left[k\left(x_i^{-}, \boldsymbol{x}_j^{+}\right)\right]_{i, j}
\end{array}
$$

Furthermore, define the matrices
$$
\begin{aligned}
& K_{\Delta}=K_{--}+K_{++}-K_{-+}-K_{-+}^T \quad K_1=\left[\begin{array}{cc}
\boldsymbol{K} & \boldsymbol{K}_{-}-\boldsymbol{K}_{+} \\
\boldsymbol{K}_{-}-\boldsymbol{K}_{+} & \boldsymbol{K}_{\Delta}
\end{array}\right] \\
& \boldsymbol{B}=\left[\begin{array}{c}
\boldsymbol{K}_{-}-\boldsymbol{K}_{+} \\
\boldsymbol{K}_{\Delta}
\end{array}\right] \cdot\left[\begin{array}{ll}
\boldsymbol{K} & \boldsymbol{K}_{-}-\boldsymbol{K}_{+}
\end{array}\right] \quad \boldsymbol{K}_2=-\frac{1}{2}\left(\boldsymbol{B}+\boldsymbol{B}^T\right) \\
&
\end{aligned}
$$
Let $A_2$ consist of the top h eigenvectors of the matrix $\boldsymbol{K}_1^{-1 / 2} \boldsymbol{K}_2 \boldsymbol{K}_1^{-1 / 2}$ and $\boldsymbol{A}=\boldsymbol{K}_1^{-1 / 2} \boldsymbol{A}_2$
Then at optimal parameterization, the embedding of any $x^*$ can be written as
$$
z^*=A^T\left[\begin{array}{c}
k\left(x^*, \boldsymbol{X}\right) \\
k\left(x^*, \boldsymbol{X}^{-}\right)-k\left(x^*, \boldsymbol{X}^{+}\right)
\end{array}\right]
$$

In [None]:
#### Function computing the embedding for the train and test set for the
# simple contrastive kernel

def compute_contrastive_simple_embedding(data0, datam, datap, dataLab, dataTest,
                                         gamma, k, kernel = 'Gaussian'):
    """Compute simple-contrastive kernel embeddings for labelled and test data.

    Parameters
    ----------
    data0, datam, datap : reference, negative and positive samples
        (presumably with the same number of rows, since their kernel
        matrices are combined block-wise -- TODO confirm).
    dataLab, dataTest : labelled and test data to embed.
    gamma : kernel parameterization; replaced by
        ``contrastive_simple_ReLU_normalizer`` when ``kernel == 'ReLU'``.
    k : hidden (embedding) dimension, i.e. number of eigenvectors requested.
    kernel : kernel name understood by ``kernel_function``.

    Returns
    -------
    (train_embed, test_embed) : real parts of the embeddings of ``dataLab``
        and ``dataTest``, squeezed and transposed.
    """

    n_unlabelled = data0.shape[0]

    if kernel == 'ReLU':
        gamma = contrastive_simple_ReLU_normalizer(data0, datam, datap)

    # Kernel matrices between the three sample sets.
    K = kernel_function(kernel,data0,data0,gamma = gamma)
    K_m = kernel_function(kernel,data0,datam,gamma = gamma)
    K_p = kernel_function(kernel,data0,datap,gamma = gamma)
    K_pp = kernel_function(kernel,datap,datap,gamma = gamma)
    K_mm = kernel_function(kernel,datam,datam,gamma = gamma)
    K_mp = kernel_function(kernel,datam,datap,gamma = gamma)

    K_Delta = K_mm + K_pp - K_mp - K_mp.T

    # K1 stacks four blocks, so it has twice as many rows/columns as K.
    K1 = np.block([[K,          K_m - K_p   ],
                   [K_m - K_p,  K_Delta     ]])

    B = np.block([[K_m - K_p],
                  [K_Delta ]]) @ np.block([[K, K_m - K_p]])

    # Symmetrized, negated B.
    K_2 = -0.5*(B + B.T)

    # NOTE(review): this is the matrix square root of K1, whereas the
    # accompanying theorem text uses K1^{-1/2} -- confirm which is intended.
    K_1_sqrt = sqrtm(K1)

    # NOTE(review): the index range is derived from n_unlabelled, but the
    # matrix passed to eigh has 2 * n_unlabelled rows; with eigenvalues in
    # ascending order this range does not select the k largest ones --
    # confirm whether (2n-k, 2n-1) was meant.
    evals_large, evecs_large = largest_eigh(K_1_sqrt @ K_2 @ K_1_sqrt, subset_by_index=(n_unlabelled-k,n_unlabelled-1))
    A_2 = np.real(evecs_large)

    A = K_1_sqrt @ A_2

    # Embed new points: A^T applied to the stacked vector
    # [k(x*, X); k(x*, X^-) - k(x*, X^+)]. The triple-nested np.block adds a
    # leading axis that np.squeeze removes in the return statement.
    test_embed = A.T @ np.block([[[kernel_function(kernel,dataTest,data0,gamma = gamma).T],
                                  [np.array(kernel_function(kernel,dataTest,datam,gamma = gamma) - kernel_function(kernel,dataTest,datap,gamma = gamma)).T]]])

    train_embed = A.T @ np.block([[[kernel_function(kernel,dataLab,data0,gamma = gamma).T],
                                  [np.array(kernel_function(kernel,dataLab,datam,gamma = gamma) - kernel_function(kernel,dataLab,datap,gamma = gamma)).T]]])

    return np.squeeze(train_embed.real).T, np.squeeze(test_embed.real).T

## Spectral contrastive

Recall we compute the gradients as
$$
\nabla \mathcal{L}^{\mathrm{Sp}}=2 \lambda \boldsymbol{Z} \boldsymbol{K}^{-1}+ \begin{cases}-2 \boldsymbol{z}_{i+n}+2\left(\boldsymbol{z}_i^T \boldsymbol{z}_{i+2 n}\right) \boldsymbol{z}_{i+2 n} & , i \in[n] \\ -2 \boldsymbol{z}_{i-n} & , i \in[n+1,2 n] \\ 2\left(\boldsymbol{z}_i^T \boldsymbol{z}_{i-2 n}\right) z_{i-2 n} & , i \in[2 n+1,3 n]\end{cases}
$$

and we can compute a new point using
$$
\boldsymbol{z}^*:=\boldsymbol{Z} \boldsymbol{K}^{-1} k\left(\boldsymbol{X}, \boldsymbol{x}^*\right).
$$

Remark: one thing to observe is that this setup needs very small learning rates and lambda. Also, if you run over different gammas you will see that in a lot of cases the calculation breaks and gives a lot of error messages. However, when you plot the loss, if you find a good gamma it converges.


In [None]:
def spectral_loss(Z,K_inv,lambda_):
    """Spectral contrastive loss of the stacked embedding ``Z``.

    Not needed for the updates themselves, but useful to check that the
    loss decreases.

    Parameters
    ----------
    Z : np.ndarray, shape (hidden, 3n)
        Column blocks ``[:n]``, ``[n:2n]``, ``[2n:3n]`` hold the embeddings
        of the three stacked sample sets; the middle block enters with the
        ``-2 <., .>`` term and the last block with the squared term.
    K_inv : np.ndarray, shape (3n, 3n)
        (Pseudo-)inverse kernel matrix used by the regularizer.
    lambda_ : float
        Regularization weight.

    Returns
    -------
    float : ``1/n * sum_i(-2 <z_i, z_{i+n}> + <z_i, z_{i+2n}>**2)
            + lambda_ * trace(Z K_inv Z^T)``.
    """
    n = int(Z.shape[1]/3)
    anchors, attract, repel = Z[:, :n], Z[:, n:2*n], Z[:, 2*n:3*n]

    # Column-wise dot products: only the diagonal of the n x n product
    # matrices is needed, so the full products are never formed.
    attract_sim = np.sum(anchors * attract, axis=0)
    repel_sim = np.sum(anchors * repel, axis=0)

    contrast = 1/n * np.sum(-2*attract_sim + repel_sim**2)
    return contrast + lambda_*np.trace(Z @ K_inv @ Z.T)

def update_spectral_loss(Z,K_inv,lambda_,lr):
    """Return the gradient step ``lr * grad`` for the spectral contrastive loss.

    The caller subtracts the returned array from ``Z``.

    Parameters
    ----------
    Z : np.ndarray, shape (hidden, 3n)
        Stacked embeddings, laid out as in ``spectral_loss``.
    K_inv : np.ndarray, shape (3n, 3n)
        (Pseudo-)inverse kernel matrix used by the regularizer.
    lambda_ : float
        Regularization weight.
    lr : float
        Learning rate.

    Returns
    -------
    np.ndarray with the shape of ``Z``.
    """
    n = int(Z.shape[1]/3)
    anchors, attract, repel = Z[:, :n], Z[:, n:2*n], Z[:, 2*n:3*n]

    # <z_i, z_{i+2n}> for every i, computed column-wise instead of taking
    # the diagonal of a full n x n product.
    repel_sim = np.sum(anchors * repel, axis=0)

    regulariser_grad = 2 * lambda_ * Z @ K_inv
    contrast_grad = np.hstack((-2*attract + 2*repel_sim * repel,
                               -2*anchors,
                               2*repel_sim * anchors))
    return lr*(regulariser_grad + contrast_grad)

def compute_contrastive_spectral_embedding(data0, datam, datap, dataLab, dataTest,
                                           gamma ,kernel, hidden_size = 2, lambda_  = 1.0e-3,
                                           lr = 1.0e-10, epochs = 100):
    """Learn spectral-contrastive kernel embeddings by gradient descent.

    Parameters
    ----------
    data0, datam, datap : reference, negative and positive samples.
    dataLab, dataTest : labelled and test data to embed.
    gamma : kernel parameterization.
    kernel : kernel name understood by ``kernel_function``.
    hidden_size : embedding dimension.
    lambda_ : regularizer weight.
    lr : learning rate.
    epochs : number of gradient steps.

    Returns
    -------
    embedd_train, embedd_test : min-max scaled embeddings (scaler fitted on
        the labelled embeddings) of ``dataLab`` and ``dataTest``.
    loss_list : loss value recorded before each gradient step.
    """
    # Stacking order (reference, positive, negative) fixes the column
    # blocks that spectral_loss / update_spectral_loss operate on.
    X_all = np.concatenate((data0,datap,datam), axis = 0)
    K_inv = np.linalg.pinv(kernel_function(kernel, X_all, X_all, gamma))
    Z = np.random.normal(0,0.1,(hidden_size, data0.shape[0]*3))

    loss_list = []
    for _ in range(epochs):
        loss_list.append(spectral_loss(Z,K_inv,lambda_))
        Z = Z - update_spectral_loss(Z,K_inv,lambda_,lr)

    # Out-of-sample extension z* = Z K^{-1} k(X, x*). The cross-kernel
    # k(X, x*) is used directly; the previous code applied a pseudo-inverse
    # to k(x*, X), which has the right shape but is a different matrix.
    projector = Z @ K_inv
    embedd_test = (projector @ kernel_function(kernel, X_all, dataTest, gamma)).T
    embedd_train = (projector @ kernel_function(kernel, X_all, dataLab, gamma)).T

    scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(embedd_train)
    embedd_train = scaling.transform(embedd_train)
    embedd_test = scaling.transform(embedd_test)

    return embedd_train,  embedd_test, loss_list

## Kernel AE Reconstruction

In [None]:
def Zstep(Q, q, gamma, kernel):
    """Embedding update: leading ``q`` eigenvectors of the kernel matrix of ``Q``.

    For the 'ReLU' kernel ``gamma`` is replaced by ``ReLU_Kernel_normalize(Q)``.
    Returns an array with one row per row of ``Q`` and ``q`` columns.
    """
    if kernel == 'ReLU':
        gamma = ReLU_Kernel_normalize(Q)

    gram = kernel_function(kernel, Q, Q, gamma)
    vectors = np.linalg.eigh(gram)[1]

    # eigh orders eigenvalues ascending; reverse the columns so the ones
    # belonging to the largest eigenvalues come first, then keep q of them.
    return vectors[:, ::-1][:, :q]

def Qstep(Z, X, gamma, kernel):
    """Reconstruction update: ``K_Z (K_Z + I)^{-1} X`` with ``K_Z`` the kernel matrix of ``Z``.

    For the 'ReLU' kernel ``gamma`` is replaced by ``ReLU_Kernel_normalize(Z)``.
    """
    if kernel == 'ReLU':
        gamma = ReLU_Kernel_normalize(Z)

    gram = kernel_function(kernel, Z, Z, gamma)
    n_rows = np.shape(X)[0]
    shifted_inverse = np.linalg.inv(gram + np.eye(n_rows))

    return gram @ shifted_inverse @ X

def _kernel_AE_denoising_error(Z, X_train, X_train_Noise, X_eval, X_eval_Noise, gamma, kernel):
    # Embed the noisy evaluation points, decode them with the Q-step map and
    # return the squared reconstruction error per evaluation sample.
    if kernel == 'ReLU': gamma = ReLU_Kernel_normalize_2(X_train, X_eval)
    K_noise_inv = np.linalg.inv(kernel_function(kernel, X_train_Noise, X_train_Noise, gamma))
    Z_new = Z.T @ K_noise_inv @ kernel_function(kernel, X_train_Noise, X_eval_Noise, gamma)
    K_Z = kernel_function(kernel, Z, Z, gamma)
    K_Z_new = kernel_function(kernel, Z, Z_new.T, gamma)
    Q_eval = K_Z_new.T @ np.linalg.inv(K_Z + np.eye(np.shape(K_Z)[0])) @ X_train
    return np.linalg.norm(Q_eval - X_eval)**2 / X_eval.shape[0]


def optimize_Bottleneck_Kernel_AE_iterative(X_train, X_test, X_lab,epochs,
                                            kernel,gamma,embeddin_size,
                                            return_embed = True,noise = 0):
    """Fit the bottleneck kernel auto-encoder by alternating Z- and Q-steps.

    The function either returns embeddings or de-noising errors.

    Parameters
    ----------
    X_train, X_test, X_lab : np.ndarray
        Train, test and labelled data.
    epochs : int
        Number of alternating Z-step / Q-step iterations (at least 1).
    kernel : str
        Kernel name understood by ``kernel_function``.
    gamma : float
        Kernel parameterization (recomputed internally for 'ReLU').
    embeddin_size : int
        Hidden (embedding) dimension.
    return_embed : bool
        If true, return embeddings; otherwise return de-noising errors.
    noise : float
        Scale of the Gaussian noise added to the inputs in the de-noising
        setting; zero gives the standard auto-encoder setting.

    Returns
    -------
    train_loss : list of per-epoch training reconstruction errors.
    Out_lab, Out_test : embeddings of ``X_lab`` / ``X_test`` when
        ``return_embed`` is true, otherwise their de-noising errors, each
        divided by the number of rows of the set being evaluated.

    Raises
    ------
    ValueError
        If ``epochs`` is smaller than 1 (no embedding would be computed).

    Notes
    -----
    Resets numpy's global random seed to 0.
    """
    if epochs < 1:
        raise ValueError('epochs must be at least 1, got {}'.format(epochs))

    np.random.seed(0)
    # The three draws are always made, in this order, so the random stream
    # is the same whichever branch runs below.
    X_test_Noise = X_test.copy() + noise * np.random.normal(0,1,X_test.shape)
    X_train_Noise = X_train.copy() + noise * np.random.normal(0,1,X_train.shape)
    X_lab_Noise = X_lab.copy() + noise * np.random.normal(0,1,X_lab.shape)

    train_loss = []

    Q_train = X_train
    for _ in range(epochs):
        Z = Zstep(Q_train, embeddin_size ,gamma, kernel)
        Q_train = Qstep(Z, X_train, gamma, kernel)
        train_loss.append(np.linalg.norm(Q_train-X_train)**2 / X_train.shape[0])

    if return_embed:
        if kernel == 'ReLU': gamma = ReLU_Kernel_normalize_2(X_train,X_test)
        Out_test = (Z.T @ np.linalg.pinv(kernel_function(kernel,X_train,X_train,gamma)) @ kernel_function(kernel,X_train, X_test,gamma)).T

        if kernel == 'ReLU': gamma = ReLU_Kernel_normalize_2(X_train,X_lab)
        Out_lab = (Z.T @ np.linalg.pinv(kernel_function(kernel,X_train,X_train,gamma)) @ kernel_function(kernel,X_train,X_lab,gamma)).T
    else:
        # Errors are averaged over the evaluated set. They were previously
        # divided by the training-set size, which made them incomparable
        # with the per-sample errors of bottleneck_DAE_NN_train.
        Out_test = _kernel_AE_denoising_error(Z, X_train, X_train_Noise, X_test, X_test_Noise, gamma, kernel)
        Out_lab = _kernel_AE_denoising_error(Z, X_train, X_train_Noise, X_lab, X_lab_Noise, gamma, kernel)

    return train_loss, Out_lab,  Out_test

# Neural Networks

Define the neural network models corresponding to the proposed kernel methods. The setups are standard ReLU neural networks with the considered contrastive loss functions and an AE, as well as a neural network performing classification directly on the labelled data.

## Contrastive

In [None]:
class ReLU_NN_orthogonal(nn.Module):
    """Bias-free two-layer ReLU network whose layers use the orthogonal parametrization.

    ``in_feature`` inputs -> ``embed`` hidden units -> ReLU -> 1 output.
    """

    def __init__(self, in_feature, embed):
        super().__init__()
        make_orthogonal = torch.nn.utils.parametrizations.orthogonal

        # encoder
        encoder = nn.Linear(in_features=in_feature, out_features=embed, bias=False)
        self.enc1 = make_orthogonal(encoder)

        # decoder
        decoder = nn.Linear(in_features=embed, out_features=1, bias=False)
        self.dec1 = make_orthogonal(decoder)

    def forward(self, x):
        hidden = F.relu(self.enc1(x))
        return self.dec1(hidden)

class ReLU_NN(nn.Module):
    """Bias-free two-layer ReLU network.

    ``in_feature`` inputs -> ``embed`` hidden units -> ReLU -> 1 output.
    """

    def __init__(self, in_feature, embed):
        super().__init__()
        # encoder
        self.enc1 = nn.Linear(in_features=in_feature, out_features=embed, bias=False)
        # decoder
        self.dec1 = nn.Linear(in_features=embed, out_features=1, bias=False)

    def forward(self, x):
        hidden = F.relu(self.enc1(x))
        return self.dec1(hidden)

def simple_contrastive_loss(f0, fp, fm):
    """Mean of ``f0.T @ (fm - fp)``: reference outputs against (negative - positive) outputs."""
    negative_minus_positive = fm - fp
    alignment = f0.T @ negative_minus_positive
    return torch.mean(alignment)

def spectral_contrastive_loss(f0, fp, fm):
    """Spectral contrastive loss ``mean(-2 * f0.T @ fp + (f0.T @ fm)**2)``.

    Parameters
    ----------
    f0, fp, fm : torch.Tensor
        Network outputs for the reference, positive and negative samples.

    Returns
    -------
    Scalar tensor.

    The positives enter the attracting ``-2 <., .>`` term and the negatives
    the squared (repelling) term, matching ``simple_contrastive_loss`` and
    the kernel version of the spectral loss. The previous implementation
    had the two roles swapped.

    NOTE(review): the inner products are taken over the whole batch before
    squaring, not per sample -- confirm this aggregation is intended.
    """
    attract = f0.T @ fp
    repel = f0.T @ fm
    return torch.mean(-2*attract + repel**2)

def contrastive_NN_train(X0, Xp, Xm, dataLab, Xtest, embedding_size,
                         contrastive_loss = 'simple', lr=0.1, epochs=100):
    """Train a ReLU network with a contrastive loss and embed labelled/test data.

    Parameters
    ----------
    X0, Xp, Xm : np.ndarray
        Reference, positive and negative samples.
    dataLab, Xtest : array-like
        Labelled and test data passed through the trained network.
    embedding_size : int
        Number of hidden units of the network.
    contrastive_loss : str
        'simple' (orthogonally parametrized network, simple contrastive
        loss) or 'spectral' (plain network, spectral contrastive loss).
    lr : float
        SGD learning rate.
    epochs : int
        Number of full-batch SGD steps.

    Returns
    -------
    NN_dataLab_embed, NN_Xtestembed : np.ndarray
        Network outputs for ``dataLab`` and ``Xtest``.

    Raises
    ------
    ValueError
        If ``contrastive_loss`` is neither 'simple' nor 'spectral'
        (previously this surfaced later as an undefined-variable error).
    """
    # Validate first so a typo fails immediately with a clear message.
    if contrastive_loss not in ('simple', 'spectral'):
        raise ValueError("Neural network loss not defined: {!r}".format(contrastive_loss))

    if contrastive_loss == 'simple':
        net_class, loss_fn = ReLU_NN_orthogonal, simple_contrastive_loss
    else:
        net_class, loss_fn = ReLU_NN, spectral_contrastive_loss

    # Convert once; the inputs do not change between epochs.
    X0 = torch.from_numpy(X0).to(device).float()
    Xp = torch.from_numpy(Xp).to(device).float()
    Xm = torch.from_numpy(Xm).to(device).float()

    net = net_class(in_feature=X0.shape[1], embed=embedding_size).to(device)
    optimizer = optim.SGD(net.parameters(), lr=lr)

    for _ in range(epochs):
        optimizer.zero_grad()
        total_loss = loss_fn(net(X0), net(Xp), net(Xm))
        total_loss.backward()
        optimizer.step()

    with torch.no_grad():
        NN_dataLab_embed = net(torch.tensor(dataLab).to(device).float()).cpu().numpy()
        NN_Xtestembed = net(torch.tensor(Xtest).to(device).float()).cpu().numpy()

    return NN_dataLab_embed, NN_Xtestembed

## AE
simple AE with relu activation functions

### Get embedding

In [None]:
class Bottleneck_Autoencoder(nn.Module):
    """Bias-free auto-encoder: in -> hidden -> ReLU -> embed -> hidden -> ReLU -> in."""

    def __init__(self, in_feature, hidden, embed):
        super().__init__()

        # encoder
        self.enc1 = nn.Linear(in_features=in_feature, out_features=hidden, bias=False)
        self.enc2 = nn.Linear(in_features=hidden, out_features=embed, bias=False)

        # decoder
        self.dec1 = nn.Linear(in_features=embed, out_features=hidden, bias=False)
        self.dec2 = nn.Linear(in_features=hidden, out_features=in_feature, bias=False)

    def forward(self, x):
        # Encode down to the bottleneck (no activation on the code itself).
        code = self.enc2(F.relu(self.enc1(x)))
        # Decode back to the input dimension.
        return self.dec2(F.relu(self.dec1(code)))

def train_AE(net, X,X_noise, lr=0.1, epochs=100):
    """Train ``net`` with full-batch SGD to map ``X_noise`` onto ``X`` under MSE.

    Parameters
    ----------
    net : nn.Module
        Network to train; updated in place.
    X : array-like or torch.Tensor
        Reconstruction targets.
    X_noise : array-like or torch.Tensor
        Network inputs (equal to ``X`` for a plain auto-encoder).
    lr : float
        SGD learning rate.
    epochs : int
        Number of full-batch steps.

    Returns
    -------
    train_loss : list of float, one loss value per epoch.
    net : the trained network (same object that was passed in).
    """
    # as_tensor accepts arrays and tensors alike without the copy-construct
    # warning torch.tensor() emits for tensor inputs; cast and move once
    # rather than on every epoch.
    targets = torch.as_tensor(X, dtype=torch.float32, device=device)
    inputs = torch.as_tensor(X_noise, dtype=torch.float32, device=device)

    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=lr)
    train_loss = []

    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

    return train_loss, net

def bottleneck_AE_NN_train(X_lab,X_unlab,X_test,embed,hidden,lr,epochs):
    """Train a bottleneck auto-encoder on ``X_unlab`` and embed ``X_lab`` / ``X_test``.

    Parameters
    ----------
    X_lab, X_unlab, X_test : array-like
        Labelled, unlabelled (training) and test data.
    embed : int
        Bottleneck (embedding) dimension.
    hidden : int
        Width of the hidden layers.
    lr : float
        SGD learning rate.
    epochs : int
        Number of training epochs.

    Returns
    -------
    embed_lab, embed_test : np.ndarray
        Encoder outputs ``relu(x W1^T) W2^T`` for the labelled and test data.
    """
    unlabelled = torch.as_tensor(X_unlab).float()

    net = Bottleneck_Autoencoder(in_feature=unlabelled.shape[1],embed=embed,hidden = hidden).to(device)
    _, net = train_AE(net, unlabelled, unlabelled, lr=lr, epochs=epochs)

    # Encoder weights, so the embedding can be computed directly.
    W1 = net.enc1.weight.detach()
    W2 = net.enc2.weight.detach()

    def _encode(data):
        # Move the inputs to the device the weights live on; multiplying CPU
        # inputs with GPU weights fails when training ran on CUDA.
        inputs = torch.as_tensor(data).float().to(W1.device)
        return (F.relu(inputs @ W1.T) @ W2.T).cpu().numpy()

    return _encode(X_lab), _encode(X_test)

### de-noising AE

In [None]:
def train_DAE(net, X,X_noise, lr=0.1, epochs=100):
    """Train ``net`` with full-batch SGD to reconstruct ``X`` from ``X_noise`` under MSE.

    Parameters
    ----------
    net : nn.Module
        Network to train; updated in place.
    X : array-like or torch.Tensor
        Clean reconstruction targets.
    X_noise : array-like or torch.Tensor
        Noisy network inputs.
    lr : float
        SGD learning rate.
    epochs : int
        Number of full-batch steps.

    Returns
    -------
    train_loss : list of float, one loss value per epoch.
    net : the trained network (same object that was passed in).
    """
    # as_tensor accepts arrays and tensors alike without the copy-construct
    # warning torch.tensor() emits for tensor inputs; cast and move once
    # rather than on every epoch.
    clean = torch.as_tensor(X, dtype=torch.float32, device=device)
    noisy = torch.as_tensor(X_noise, dtype=torch.float32, device=device)

    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=lr)
    train_loss = []

    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(net(noisy), clean)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

    return train_loss, net

def bottleneck_DAE_NN_train(X_lab,X_unlab,X_test,noise,embed,hidden,lr,epochs):
    """Train a de-noising bottleneck auto-encoder and report reconstruction errors.

    Parameters
    ----------
    X_lab, X_unlab, X_test : np.ndarray
        Labelled (used as validation), unlabelled (training) and test data.
    noise : float
        Scale of the standard-normal noise added to the network inputs.
    embed : int
        Bottleneck dimension.
    hidden : int
        Width of the hidden layers.
    lr : float
        SGD learning rate.
    epochs : int
        Number of training epochs.

    Returns
    -------
    train_loss : list of per-epoch training losses.
    val_loss, test_loss : squared reconstruction error against the clean
        labelled / test data, divided by the number of rows of that set.

    Notes
    -----
    Resets numpy's global random seed to 0.
    """
    np.random.seed(0)

    # Draw order (lab, unlab, test) is kept so the noise is reproducible.
    X_lab_Noise = X_lab.copy() + noise * np.random.normal(0,1,X_lab.shape)
    X_unlab_Noise = X_unlab.copy() + noise * np.random.normal(0,1,X_unlab.shape)
    X_test_Noise = X_test.copy() + noise * np.random.normal(0,1,X_test.shape)

    net = Bottleneck_Autoencoder(in_feature=X_unlab.shape[1],embed=embed,hidden = hidden).to(device)
    train_loss,net = train_DAE(net, torch.tensor(X_unlab).float(),torch.tensor(X_unlab_Noise).float(), lr=lr, epochs=epochs)

    def _per_sample_error(noisy, clean):
        # Inputs must be on the same device as the network; feeding CPU
        # tensors to a CUDA model raises a device-mismatch error.
        with torch.no_grad():
            reconstruction = net(torch.tensor(noisy).float().to(device)).cpu().numpy()
        return np.linalg.norm(reconstruction - clean)**2 / clean.shape[0]

    test_loss = _per_sample_error(X_test_Noise, X_test)
    val_loss = _per_sample_error(X_lab_Noise, X_lab)

    return train_loss, val_loss, test_loss

## Classification

Simple classification network as benchmark

In [None]:
class ReLU_NN_classification(nn.Module):
    """Bias-free two-layer ReLU classifier.

    Parameters
    ----------
    in_feature : int
        Input dimension.
    hidden : int
        Number of hidden units.
    n_classes : int, default 1
        Number of output units (one logit per class). The default keeps the
        original single-output architecture.
    """

    def __init__(self, in_feature, hidden, n_classes=1):
        super(ReLU_NN_classification, self).__init__()
        self.enc1 = nn.Linear(in_features=in_feature, out_features=hidden, bias=False)
        self.dec1 = nn.Linear(in_features=hidden, out_features=n_classes, bias=False)

    def forward(self, x):
        x = self.enc1(x)
        x = F.relu(x)
        x = self.dec1(x)

        return x

def accuracy(out, labels):
    """Count how many row-wise argmax predictions of ``out`` equal ``labels``."""
    _,pred = torch.max(out, dim=1)
    return torch.sum(pred==labels).item()


def train_classification_NN(X_train,Y_train,X_test,Y_test,
                            hidden = 200, lr=0.1, epochs=100):
    """Train the ReLU classifier on the labelled data and return test accuracy.

    Parameters
    ----------
    X_train, X_test : np.ndarray
        Training and test inputs, one sample per row.
    Y_train, Y_test : array-like
        Class labels (arbitrary values; they are mapped to class indices).
    hidden : int
        Number of hidden units.
    lr : float
        SGD learning rate.
    epochs : int
        Number of full-batch SGD steps.

    Returns
    -------
    float : fraction of test samples classified correctly.

    The network gets one output logit per class found in ``Y_train`` and is
    trained with integer class targets. The previous version used a single
    output unit, so the argmax over the class dimension always predicted
    class 0, and the cross-entropy was evaluated on a squeezed vector with
    float targets.
    """
    # Map arbitrary label values to contiguous class indices 0..C-1.
    classes, train_index = np.unique(np.asarray(Y_train).reshape(-1), return_inverse=True)

    net = ReLU_NN_classification(in_feature=X_train.shape[1],
                                 hidden=hidden,
                                 n_classes=len(classes)).to(device)

    X_train = torch.from_numpy(X_train).to(device).float()
    X_test = torch.from_numpy(X_test).to(device).float()
    targets = torch.from_numpy(train_index.reshape(-1)).long().to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr)

    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(net(X_train), targets)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        net.eval()
        predicted_index = torch.argmax(net(X_test), dim=1).cpu().numpy()

    # Translate class indices back to label values before comparing.
    predicted_labels = classes[predicted_index]
    true_labels = np.asarray(Y_test).reshape(-1)
    correct_t = int(np.sum(predicted_labels == true_labels))
    total_t = true_labels.shape[0]
    print(correct_t,correct_t/total_t)

    return correct_t/total_t

In [None]:
def run_de_noising_experiment(X,Y,noise,N_lab,N_test,rep,gamma_list,data_set_name,epochs = 200):
    """Compare de-noising errors of the NN auto-encoder and the kernel auto-encoders.

    For every repetition the test error is recorded for: the de-noising NN,
    the 'Linear' and 'ReLU' kernel auto-encoders, and the 'Gaussian' and
    'Laplacian' kernel auto-encoders with the gamma that gives the smallest
    validation error.

    Parameters
    ----------
    X, Y : all data and all labels.
    noise : scale of the noise used in the de-noising task.
    N_lab, N_test : forwarded to ``create_data_split``.
    rep : number of repetitions (also used as the split's random state).
    gamma_list : candidate gammas for the Gaussian / Laplacian kernels.
    data_set_name : used in the name of the saved result file.
    epochs : number of training epochs / iterations.

    Returns
    -------
    all_mse_list : list with one list of five errors per repetition
        (NaN where a method failed). The same data is saved to
        'de_noising/de_noising_<data_set_name>.npy'.
    """
    import os

    def _kernel_errors(kernel, gamma):
        # The call previously passed ``noise`` positionally in the slot of
        # ``epochs`` while also giving ``epochs=`` by keyword; the resulting
        # TypeError was swallowed, so every kernel result was NaN.
        try:
            _, val_loss, test_loss = optimize_Bottleneck_Kernel_AE_iterative(
                X_train, X_test, X_lab, epochs=epochs, kernel=kernel, gamma=gamma,
                embeddin_size=5, return_embed=False, noise=noise)
        except Exception as err:
            # Best effort: record the failure and continue with the next run.
            print('kernel AE failed for {} (gamma {}): {}'.format(kernel, gamma, err))
            return np.nan, np.nan
        return val_loss, test_loss

    all_mse_list = []

    for r in range(rep):
        print(r)

        rep_temp = []

        X_train,X_lab,Y_lab,X_test,Y_test,X0,Xp,Xm = create_data_split(X,Y, N_lab,N_test,r,noise=0.1)

        # NOTE(review): embed=100 with hidden=5 looks swapped relative to the
        # kernel models' embeddin_size=5 -- confirm before changing.
        _, _, test_loss = bottleneck_DAE_NN_train(X_lab,X_train,X_test,noise,embed=100,hidden=5,lr=0.1,epochs=epochs)
        rep_temp.append(test_loss)

        for kernel in ['Linear', 'ReLU']:
            print(kernel)
            _, test_loss = _kernel_errors(kernel, 1)
            rep_temp.append(test_loss)

        for kernel in ['Gaussian','Laplacian']:
            print(kernel)
            temp_val = []
            temp_test = []
            for gamma in gamma_list:
                # Use the gamma under evaluation (it was hard-coded to 1).
                val_loss, test_loss = _kernel_errors(kernel, gamma)
                temp_val.append(val_loss)
                temp_test.append(test_loss)

            temp_val = np.array(temp_val, dtype=float)
            if temp_val.size == 0 or np.all(np.isnan(temp_val)):
                rep_temp.append(np.nan)
            else:
                # nanargmin ignores failed gammas; plain argmin would return
                # the position of the first NaN.
                rep_temp.append(temp_test[int(np.nanargmin(temp_val))])

        all_mse_list.append(rep_temp)

    os.makedirs('de_noising', exist_ok=True)
    np.save('de_noising/de_noising_{}'.format(data_set_name),np.array(all_mse_list))
    return(all_mse_list)


# Run NN


In [None]:
# Get SVM and KNN accuracy for a given labeled dataset and test

def SVM_KNN(X_lab, Y_lab, X_test,Y_test):
    """Return (3-NN accuracy, linear-SVM accuracy) on the test set.

    Both classifiers are fitted on ``(X_lab, Y_lab)`` and scored on
    ``(X_test, Y_test)``. If fitting or prediction fails for a classifier,
    the failure is printed and accuracy 0 is recorded for it, so a long
    experiment loop keeps running.
    """
    def _fit_and_score(make_classifier, label):
        # Best effort per classifier. ``except Exception`` (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        try:
            model = make_classifier()
            model.fit(X_lab, Y_lab)
            return accuracy_score(Y_test, model.predict(X_test))
        except Exception as err:
            print('{} evaluation failed ({}); recording accuracy 0'.format(label, err))
            return 0

    neigh_predict_accuracy = _fit_and_score(lambda: KNeighborsClassifier(n_neighbors=3), 'KNN')
    # sometimes SVM computation breaks
    SVM_predict_accuracy = _fit_and_score(lambda: svm.SVC(kernel='linear'), 'SVM')

    return neigh_predict_accuracy, SVM_predict_accuracy

Function that runs over all NN models and stores the k-NN and SVM accuracies obtained on the embeddings.

In [None]:
def run_NN(X,Y,N_test,N_lab,rep,scale=False,embedding_size = 2,data_set_name=None):
    """Run all neural-network baselines and store their accuracies.

    Per repetition the recorded values are, in order: accuracy of the
    classification network, then (k-NN, SVM) accuracy on the embeddings of
    the simple contrastive network, the spectral contrastive network and
    the auto-encoder.

    Parameters
    ----------
    X, Y : all data and all labels.
    N_test, N_lab : forwarded to ``create_data_split``.
    rep : number of repetitions (also used as the split's random state).
    scale : currently unused.
    embedding_size : embedding dimension passed to the models.
    data_set_name : str, optional
        Name used in the saved file. When omitted, a global variable
        ``data_set_name`` is used, as the function did before.

    Returns
    -------
    all_benchmark_list : list with one accuracy list per repetition. The
        same data is saved to
        'embeddings/all_NN_list<N_lab*100>_<embedding_size>_<data_set_name>.npy'.

    Raises
    ------
    ValueError
        If no data set name is given and no global ``data_set_name`` exists.
    """
    import os

    # Resolve the name up front so a missing name fails before training
    # instead of at the final save.
    if data_set_name is None:
        try:
            data_set_name = globals()['data_set_name']
        except KeyError:
            raise ValueError('data_set_name must be passed or defined globally') from None

    all_benchmark_list = []

    for r in range(rep):

        accuracy_list = []

        # get data splits
        X_train,X_lab,Y_lab,X_test,Y_test,X0,Xp,Xm = create_data_split(X,Y, N_lab,N_test,r,noise=0.1)

        # NN reference classifier
        class_NN = train_classification_NN(X_lab,Y_lab,X_test,Y_test, hidden = 20, lr=0.1, epochs=100)
        accuracy_list.append(class_NN)

        # NN simple contrastive, then NN spectral contrastive
        for loss_name in ('simple', 'spectral'):
            embed_lab, embed_test = contrastive_NN_train(X0, Xp, Xm, X_lab, X_test, embedding_size, contrastive_loss = loss_name, lr=0.1, epochs=100)
            accuracy_list.extend(SVM_KNN(embed_lab, Y_lab, embed_test,Y_test))

        # NN AE (uses the requested embedding size; it was hard-coded to 2,
        # which is also the default of embedding_size)
        embed_lab, embed_test = bottleneck_AE_NN_train(X_lab,X_train,X_test,embed=embedding_size,hidden = 200, lr=0.1, epochs=100)
        accuracy_list.extend(SVM_KNN(embed_lab, Y_lab, embed_test,Y_test))

        all_benchmark_list.append(accuracy_list)

    os.makedirs('embeddings', exist_ok=True)
    np.save('embeddings/all_NN_list{}_{}_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),all_benchmark_list)

    return all_benchmark_list



# Run all Kernel models and get embeddings

Function that runs over Kernel PCA and all kernel methods.

For the Linear and ReLU kernels we run only once for every repetition.
For the Gaussian and Laplacian kernels we run over all gammas.
Everything is stored in two different np files, which are then loaded for computing the accuracy.

In [None]:
def create_embedddings(X,Y,N_lab,N_test,rep,gamma_list,data_set_name,scale = False,embedding_size=2):

    def laplacian_kernel_manual(x,y):
        return np.exp(-gamma*np.linalg.norm(x-y,1))

    def kappa_0(u):
        return 1/np.pi * (np.pi - np.arccos(u))

    def kappa_1(u):
        return 1/np.pi * (u*(np.pi - np.arccos(u)) + np.sqrt(1-u**2))

    def ReLU_Kernel_PCA(x,y):
        inp = (x@y.T)/gamma
        if inp <-1: inp = -1
        if inp >1: inp = 1
        K_ij = inp*kappa_0(inp) + kappa_1(inp)
        return K_ij

    embed_test_list = []
    embed_lab_list = []
    embed_lin_test_list = []
    embed_lin_lab_list = []
    Y_lab_list = []
    Y_test_list = []

    if scale:
        scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
        X = scaling.transform(X)

    # run over repetions of different data splits
    for r in range(rep):

        # get data splits
        X_train,X_lab,Y_lab,X_test,Y_test,X0,Xp,Xm = create_data_split(X,Y, N_lab,N_test,r,noise=0.1)
        print('rep: {} || Train: {} Lab: {} Test: {} '.format(r,X_train.shape,X_lab.shape,X_test.shape))

        Y_test_list.append(Y_test)
        Y_lab_list.append(Y_lab)

        ######################################### KERNEL PCA

        print(' - Kernel PCA')
        gamma = ReLU_Kernel_normalize(X_train)
        for kernel in ['linear', ReLU_Kernel_PCA]:

            kernel_pca = KernelPCA(n_components=embedding_size, kernel=kernel)
            kernel_pca_train = kernel_pca.fit(X_train)

            X_test_kernel_pca = kernel_pca_train.transform(X_test)
            X_lab_kernel_pca = kernel_pca_train.transform(X_lab)
            embed_lin_test_list.append(X_test_kernel_pca)
            embed_lin_lab_list.append(X_lab_kernel_pca)

        kernel_name_list = ['rbf', laplacian_kernel_manual]

        for kernel in kernel_name_list:
            test_temp = []
            lab_temp = []
            for g in gamma_list:

                kernel_pca = KernelPCA(n_components=embedding_size, kernel=kernel,gamma = g)
                try:
                    kernel_pca_train = kernel_pca.fit(X_train)
                    X_test_kernel_pca = kernel_pca_train.transform(X_test)
                    X_lab_kernel_pca = kernel_pca_train.transform(X_lab)
                except:
                    print("PCA computation error for gamma {}".format(g))
                    X_test_kernel_pca = np.zeros((X_test.shape[0],embedding_size))
                    X_lab_kernel_pca =  np.zeros((X_lab.shape[0],embedding_size))
                test_temp.append(X_test_kernel_pca)
                lab_temp.append(X_lab_kernel_pca)
            embed_test_list.append(test_temp)
            embed_lab_list.append(lab_temp)

        ######################################### SIMPLE CONTRASTIVE

        print(' - simple contrastive')
        for kernel in ['Linear', 'ReLU']:
            try:
                Z_lab, Z_test = compute_contrastive_simple_embedding(X0, Xm, Xp, X_lab, X_test, g, k=embedding_size, kernel = kernel)
            except:
                print("spectral error for {}".format(kernel))
                Z_test = np.zeros((X_test.shape[0],embedding_size))
                Z_lab = np.zeros((X_lab.shape[0],embedding_size))
            embed_lin_test_list.append(Z_test)
            embed_lin_lab_list.append(Z_lab)

        kernel_name_list = ['Gaussian','Laplacian']
        for kernel in kernel_name_list:
            test_temp = []
            lab_temp = []
            for g in gamma_list:
                try:
                    Z_lab,  Z_test = compute_contrastive_simple_embedding(X0, Xm, Xp, X_lab, X_test, g, k=embedding_size, kernel = kernel)
                except:
                    print("simple contrastive for gamma {}".format(g))
                    Z_test = np.zeros((X_test.shape[0],embedding_size))
                    Z_lab = np.zeros((X_lab.shape[0],embedding_size))
                test_temp.append(Z_test)
                lab_temp.append(Z_lab)
            embed_test_list.append(test_temp)
            embed_lab_list.append(lab_temp)


        ######################################### SPECTRAL CONTRASTIVE

        print(' - spectral contrastive')
        lambda_  = 1.0e-5
        lr =  1.0e-10
        epochs = 1000

        for kernel in ['Linear', 'ReLU']:
            try:
                Z_lab, Z_test, _ = compute_contrastive_spectral_embedding(X0, Xm, Xp, X_lab, X_test,
                                                    gamma=g ,kernel = kernel, hidden_size = embedding_size,lambda_  = lambda_, lr =  lr,epochs = epochs)
            except:
                print("spectral error for {}".format(kernel))
                Z_test = np.zeros((X_test.shape[0],embedding_size))
                Z_lab = np.zeros((X_lab.shape[0],embedding_size))
            embed_lin_test_list.append(Z_test)
            embed_lin_lab_list.append(Z_lab)

        kernel_name_list = ['Gaussian','Laplacian']
        for kernel in kernel_name_list:
            test_temp = []
            lab_temp = []
            for g in gamma_list:
                try:
                    Z_lab, Z_test, _ = compute_contrastive_spectral_embedding(X0, Xm, Xp, X_lab, X_test,
                                                gamma=g ,kernel = kernel, hidden_size = embedding_size,lambda_  = lambda_, lr =  lr,epochs = epochs)
                except:
                    print("spectral contrastive error for gamma {}".format(g))
                    Z_test = np.zeros((X_test.shape[0],embedding_size))
                    Z_lab = np.zeros((X_lab.shape[0],embedding_size))
                test_temp.append(Z_test)
                lab_temp.append(Z_lab)
            embed_test_list.append(test_temp)
            embed_lab_list.append(lab_temp)

        ######################################### BOTTLENECK

        print(' - bottleneck')
        for kernel in ['Linear', 'ReLU']:
            epochs = 20
            gamma = 1
            _, Z_lab,  Z_test = optimize_Bottleneck_Kernel_AE_iterative(X_train, X_test,X_lab,epochs=epochs,kernel=kernel,gamma=1,embeddin_size=embedding_size)

            embed_lin_test_list.append(Z_test)
            embed_lin_lab_list.append(Z_lab)

        kernel_name_list = ['Gaussian','Laplacian']
        for kernel in kernel_name_list:
            test_temp = []
            lab_temp = []
            for g in gamma_list:

                try:
                    _, Z_lab, Z_test = optimize_Bottleneck_Kernel_AE_iterative(X_train, X_test,X_lab,epochs=epochs,
                                                                            kernel=kernel,gamma=g,embeddin_size=embedding_size)
                except:
                    print('Bottleneck failed for gamma {}'.format(g))
                    Z_test = np.zeros((X_test.shape[0],embedding_size))
                    Z_lab=  np.zeros((X_lab.shape[0],embedding_size))

                test_temp.append(Z_test)
                lab_temp.append(Z_lab)
            embed_test_list.append(test_temp)
            embed_lab_list.append(lab_temp)


    np.save('embeddings/embed{}_{}_lin_test_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),embed_lin_test_list)
    np.save('embeddings/embed{}_{}_lin_lab_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),embed_lin_lab_list)
    np.save('embeddings/embed{}_{}_test_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),embed_test_list)
    np.save('embeddings/embed{}_{}_lab_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),embed_lab_list)
    np.save('embeddings/Y_lab{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),Y_lab_list)
    np.save('embeddings/Y_test{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),Y_test_list)



# Downstream Task

Leave-one-out validation functions

In [None]:
def get_validation_Knn(y_lab,x_lab):
    """Return the mean leave-one-out accuracy of a 3-nearest-neighbour classifier.

    Each row of ``x_lab`` is held out once; a ``KNeighborsClassifier`` with
    ``n_neighbors=3`` is fitted on the remaining rows and scored on the
    held-out row.

    Parameters
    ----------
    y_lab : array indexable by an integer index array
        Labels belonging to the rows of ``x_lab``.
    x_lab : 2-D array
        Embedded labelled points, one row per sample.

    Returns
    -------
    The mean of the per-fold accuracies.
    """
    fold_scores = []
    for fit_rows, held_out_rows in LeaveOneOut().split(x_lab):
        classifier = KNeighborsClassifier(n_neighbors=3)
        classifier.fit(x_lab[fit_rows,:], y_lab[fit_rows])
        held_out_prediction = classifier.predict(x_lab[held_out_rows,:])
        # every fold holds out a single sample, so each score is 0.0 or 1.0
        fold_scores.append(accuracy_score(y_lab[held_out_rows], held_out_prediction))
    return np.mean(np.array(fold_scores))

def get_validation_SVM(y_lab,x_lab):
    """Return the mean leave-one-out accuracy of a linear-kernel SVM.

    Each row of ``x_lab`` is held out once; an ``svm.SVC(kernel='linear')`` is
    fitted on the remaining rows and scored on the held-out row.

    Parameters
    ----------
    y_lab : array indexable by an integer index array
        Labels belonging to the rows of ``x_lab``.
    x_lab : 2-D array
        Embedded labelled points, one row per sample.

    Returns
    -------
    The mean of the per-fold accuracies.
    """
    fold_scores = []
    for fit_rows, held_out_rows in LeaveOneOut().split(x_lab):
        classifier = svm.SVC(kernel='linear')
        classifier.fit(x_lab[fit_rows,:], y_lab[fit_rows])
        held_out_prediction = classifier.predict(x_lab[held_out_rows,:])
        # every fold holds out a single sample, so each score is 0.0 or 1.0
        fold_scores.append(accuracy_score(y_lab[held_out_rows], held_out_prediction))
    return np.mean(np.array(fold_scores))

In [None]:
def choose_best_gamma(rep,gamma_list,data_set_name,scale = False,embedding_size = 2):
    """Pick gamma by leave-one-out validation and record the resulting test accuracy.

    Loads the gamma-dependent embeddings and the labels from the ``embeddings/``
    directory. For every repetition and every stored method, the gamma with the
    highest leave-one-out accuracy on the labelled embeddings is selected,
    separately for a 3-NN classifier and a linear SVM. The classifier is then
    refitted on all labelled embeddings for that gamma and scored on the test
    embeddings. Any step that raises an ordinary exception contributes an
    accuracy of 0 instead of aborting the run.

    The result, ``np.array([Knn_test_list, SVM_test_list])``, is written to
    ``embeddings/all_test_list<..>.npy``; nothing is returned.

    Parameters
    ----------
    rep : int
        Number of repetitions stored in the saved arrays.
    gamma_list : sequence
        Gamma grid; only its length is used here, to index the gamma axis.
    data_set_name : str
        Name used in the file names of the saved arrays.
    scale : bool
        Accepted for interface compatibility; not used by this function.
    embedding_size : int
        Embedding dimension, used only to build the file names.

    Notes
    -----
    The file names also depend on the module-level ``N_lab``.
    The saved embedding arrays are indexed as ``[method, gamma, sample, dim]``
    and are expected to hold ``num_kernel*4`` consecutive entries per
    repetition along the first axis.
    """

    # Load embeddings

    num_kernel = 2
    num_gamma = len(gamma_list)
    methods_per_rep = num_kernel*4

    SVM_test_list = []
    Knn_test_list = []

    embed_test_list = np.load('embeddings/embed{}_{}_test_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    embed_lab_list = np.load('embeddings/embed{}_{}_lab_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    Y_test_list = np.load('embeddings/Y_test{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    Y_lab_list = np.load('embeddings/Y_lab{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)

    print(embed_lab_list.shape)

    # Classifier factories for the two downstream models
    def make_knn():
        return KNeighborsClassifier(n_neighbors=3)

    def make_svm():
        return svm.SVC(kernel='linear')

    # Run over the saved embeddings and, for each list of gammas, use
    # leave-one-out validation to pick the parameterisation used on the test set.

    idx = 0  # running index over the first axis of the embedding arrays
    for r in range(rep):
        y_lab = Y_lab_list[r,:]
        y_test = Y_test_list[r,:]
        temp_SVM_test_list = []
        temp_Knn_test_list = []

        for _ in range(methods_per_rep):

            # KNN first, then SVM: same procedure, different validator/classifier
            for validate, make_classifier, results in (
                (get_validation_Knn, make_knn, temp_Knn_test_list),
                (get_validation_SVM, make_svm, temp_SVM_test_list),
            ):
                gamma_val_list = []
                for j in range(num_gamma):
                    x_lab = embed_lab_list[idx,j,:,:]
                    # `except Exception` (not a bare except) so that Ctrl-C /
                    # SystemExit still stop the run instead of being scored as 0
                    try:
                        val_accuracy = validate(y_lab,x_lab)
                    except Exception:
                        val_accuracy = 0
                    gamma_val_list.append(val_accuracy)

                try:
                    max_idx = np.argmax(np.array(gamma_val_list))
                    classifier = make_classifier()
                    classifier.fit(embed_lab_list[idx,max_idx,:,:],y_lab)
                    test_predict = classifier.predict(embed_test_list[idx,max_idx,:,:])
                    test_accuracy = accuracy_score(y_test, test_predict)
                except Exception:
                    test_accuracy = 0
                results.append(test_accuracy)

            idx+=1

        Knn_test_list.append(temp_Knn_test_list)
        SVM_test_list.append(temp_SVM_test_list)

    all_test_list = np.array([Knn_test_list, SVM_test_list])
    np.save('embeddings/all_test_list{}_{}_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),all_test_list)

    # return SVM_test_list,Knn_test_list

Compute the downstream tasks for the models that do not have hyperparameters and therefore don't need a validation set (ReLU and linear).


In [None]:
def linear_models(rep,data_set_name,scale = False,embedding_size = 2):
    """Score the gamma-free embeddings with 3-NN and a linear SVM.

    Loads the ``lin`` embeddings and the labels from the ``embeddings/``
    directory. For every repetition and each of the ``4*2`` entries stored per
    repetition, both classifiers are fitted on the labelled embeddings and
    their accuracy on the test embeddings is recorded. A classifier that
    raises an ordinary exception contributes an accuracy of 0.

    The result, ``np.array([Knn_test_list, SVM_test_list])``, is written to
    ``embeddings/all_test_list_lin<..>.npy``; nothing is returned.

    Parameters
    ----------
    rep : int
        Number of repetitions stored in the saved arrays.
    data_set_name : str
        Name used in the file names of the saved arrays.
    scale : bool
        If true, min-max scale each embedding to [0, 1]; the scaler is fitted
        on the labelled embeddings and applied to both labelled and test.
    embedding_size : int
        Embedding dimension, used only to build the file names.

    Notes
    -----
    The file names also depend on the module-level ``N_lab``.
    The saved embedding arrays are indexed as ``[method, sample, dim]``.
    """

    methods_per_rep = 4*2
    SVM_test_list = []
    Knn_test_list = []
    embed_test_list = np.load('embeddings/embed{}_{}_lin_test_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    embed_lab_list = np.load('embeddings/embed{}_{}_lin_lab_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    Y_test_list = np.load('embeddings/Y_test{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)
    Y_lab_list = np.load('embeddings/Y_lab{}_{}_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),allow_pickle=True)

    idx = 0  # running index over the first axis of the embedding arrays
    for r in range(rep):
        y_lab = Y_lab_list[r,:]
        y_test = Y_test_list[r,:]
        temp_SVM_test_list = []
        temp_Knn_test_list = []

        for _ in range(methods_per_rep):

            if scale:
                scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(embed_lab_list[idx,:,:])
                embed_test_list_ = scaling.transform(embed_test_list[idx,:,:])
                embed_lab_list_ = scaling.transform(embed_lab_list[idx,:,:])
            else:
                embed_test_list_ = embed_test_list[idx,:,:]
                embed_lab_list_ = embed_lab_list[idx,:,:]

            ######### KNN
            # `except Exception` (not a bare except) so that Ctrl-C /
            # SystemExit still stop the run instead of being scored as 0
            try:
                neigh = KNeighborsClassifier(n_neighbors=3)
                neigh.fit(embed_lab_list_,y_lab)
                knn_predict = neigh.predict(embed_test_list_)
                knn_accuracy = accuracy_score(y_test, knn_predict)
            except Exception:
                knn_accuracy = 0
            temp_Knn_test_list.append(knn_accuracy)

            ######### SVM
            try:
                clf = svm.SVC(kernel='linear')
                clf.fit(embed_lab_list_,y_lab)
                svm_predict = clf.predict(embed_test_list_)
                svm_accuracy = accuracy_score(y_test, svm_predict)
            except Exception:
                svm_accuracy = 0
            # the fallback goes into the SVM row, keeping both rows aligned
            # with exactly one entry per method
            temp_SVM_test_list.append(svm_accuracy)

            idx+=1
        Knn_test_list.append(temp_Knn_test_list)
        SVM_test_list.append(temp_SVM_test_list)

    all_test_list_lin = np.array([Knn_test_list, SVM_test_list])
    np.save('embeddings/all_test_list_lin{}_{}_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),all_test_list_lin)

    # return SVM_test_list,Knn_test_list

# Benchmark on original features

Knn and SVM on original data

In [None]:
def run_benchmark(X,Y,N_test,N_lab,rep,scale=False,embedding_size = 2):
    """Benchmark 3-NN and kernel SVMs directly on the original features.

    For each repetition a data split is drawn with ``create_data_split`` and
    the classifiers are fitted on the labelled points and scored on the test
    points. Per repetition the recorded accuracies are, in order:
    3-NN, SVM linear, SVM with the ReLU kernel, SVM rbf, SVM Laplacian.
    An SVM that raises an ordinary exception contributes an accuracy of 0.

    The list of per-repetition accuracy lists is saved under ``embeddings/``;
    nothing is returned.

    Parameters
    ----------
    X, Y : arrays
        Features and labels, passed on to ``create_data_split``.
    N_test, N_lab :
        Split sizes passed on to ``create_data_split``; ``N_lab`` is also
        used in the output file name.
    rep : int
        Number of repetitions; the repetition index is passed to
        ``create_data_split``.
    scale : bool
        If true, min-max scale the features to [0, 1]; the scaler is fitted
        on the labelled points and applied to labelled and test points.
    embedding_size : int
        Used only to build the output file name.

    Notes
    -----
    The output file name also depends on the module-level ``data_set_name``.
    """
    all_benchmark_list = []

    def ReLU_Kernel_PCA(x,y):
        # `gamma` is looked up when SVC calls this kernel; it is assigned in
        # the repetition loop below before any SVC is fitted.
        inp = (x@y.T)/gamma
        # clip in place to [-1, 1] before applying kappa_0 / kappa_1
        np.clip(inp, -1, 1, out=inp)
        return inp*kappa_0(inp) + kappa_1(inp)

    for r in range(rep):

        accuracy_list = []

        # get data splits (only the labelled and test parts are used here)
        _, X_lab, Y_lab, X_test, Y_test, _, _, _ = create_data_split(X,Y, N_lab,N_test,r,noise=0.1)

        if scale:
            scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X_lab)
            X_lab = scaling.transform(X_lab)
            X_test = scaling.transform(X_test)

        ##################################### KNN

        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(X_lab, Y_lab)
        neigh_predict = neigh.predict(X_test)
        accuracy_list.append(accuracy_score(Y_test, neigh_predict))

        ##################################### SVM with different kernels

        kernel_name_list = ['linear',ReLU_Kernel_PCA, 'rbf', sklearn.metrics.pairwise.laplacian_kernel]
        # normalisation constant read by ReLU_Kernel_PCA (computed after optional scaling)
        gamma = ReLU_Kernel_normalize(X_lab)
        for kernel in kernel_name_list:
            # `except Exception` (not a bare except) so that Ctrl-C /
            # SystemExit still stop the run instead of being scored as 0
            try:
                clf = svm.SVC(kernel=kernel)
                clf.fit(X_lab,Y_lab)
                SVM_predict = clf.predict(X_test)
                SVM_predict_accuracy = accuracy_score(Y_test, SVM_predict)
            except Exception:
                SVM_predict_accuracy = 0
            accuracy_list.append(SVM_predict_accuracy)

        all_benchmark_list.append(accuracy_list)

    np.save('embeddings/all_benchmark_list{}_{}_lin_test_list_{}.npy'.format(int(N_lab*100),embedding_size,data_set_name),all_benchmark_list)

    # return all_benchmark_list

Function that combines the main functions that need to be run for the plots in Figure 1 as well as for the appendix.

In [None]:

def run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name):
    """Run the complete experiment pipeline for one data set.

    Calls, in this order: ``create_embedddings``, ``choose_best_gamma``,
    ``linear_models`` and ``run_benchmark``. All arguments are forwarded to
    those functions; nothing is returned.
    """
    # 1) compute the embeddings
    create_embedddings(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)
    # 2) downstream accuracy for the gamma-dependent embeddings
    choose_best_gamma(rep=rep, gamma_list=gamma_list, data_set_name=data_set_name)
    # 3) downstream accuracy for the embeddings without a gamma parameter
    linear_models(rep=rep, data_set_name=data_set_name)
    # 4) benchmark on the original features
    run_benchmark(X=X, Y=Y, N_test=N_test, N_lab=N_lab, rep=rep)

# Run experiments

Uncomment the cell of whichever dataset you want to run all experiments for.

## Set General Parameters

In [None]:
# data split: unlabelled / labelled / test
# (presumably fractions of the data set; the three values add up to 1)
N_unlab, N_lab, N_test = 0.5, 0.05, 0.45

# hyperparameter list: 25 log-spaced gamma values from 1e-2 to 1e2
gamma_list = np.logspace(-2, 2, num=25, base=10)

# number of repetitions
rep = 5

## Circles

In [None]:
# N_unlab = 0.5
# N_lab = 0.1
# N_test = 0.4

# ### Get data

# n_samples = 200
# factor = 0.6
# data_set_name = 'circles_factor_{}'.format(int(factor*10))
# n_clusters = 2

# data = datasets.make_circles(n_samples=n_samples, factor=factor, noise=0.05, random_state=0)
# X = data[0]
# Y = data[1]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)
# plot_accuracy(data_set_name,N_lab)
# run_NN(X,Y,N_test,N_lab,rep,scale=False,embedding_size = 2)


In [None]:
# rep = 5
# acc_list = []
# for cut in [0.01,0.5,1]:
#     all_acc_list = create_embedddings_change_num_data(X,Y,N_lab,N_test,rep,data_set_name,cut,scale = False,embedding_size=2)
#     print(all_acc_list)
#     acc_list.append(all_acc_list)

# np.save('embeddings/change_datanumber_{}.npy'.format(data_set_name),np.array(acc_list))



## Half Moons

In [None]:
# N_unlab = 0.5
# N_lab = 0.03
# N_test = 0.47

# ### Get data

# n_samples = 200
# data_set_name = 'moons'
# n_clusters = 2

# data = datasets.make_moons(n_samples=n_samples, noise=0.05)
# X = data[0]
# Y = data[1]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data

# n_samples = 200
# data_set_name = 'moons'
# n_clusters = 2

# data = datasets.make_moons(n_samples=n_samples, noise=0.05)
# X = data[0]
# Y = data[1]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Blobs

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data

# n_samples = 200
# data_set_name = 'blobs'
# n_clusters = 3

# data = datasets.make_blobs(n_samples=n_samples, random_state=8)
# X = data[0]
# Y = data[1]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Cubes

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data

# n_samples = 200
# data_set_name = 'cubes'
# n_clusters = 2

# data = sklearn.datasets.make_classification(n_samples=200,n_classes=n_clusters)
# X = data[0]
# Y = data[1]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Iris

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data

# data_set_name = 'iris'
# n_clusters = 3

# from sklearn.datasets import load_iris
# iris_data = load_iris()
# X = iris_data.data
# Y = iris_data.target

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## wine

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data


# data_set_name = 'wine'
# n_clusters = 3

# from sklearn.datasets import load_wine
# wine_data = load_wine()
# X = wine_data.data
# Y = wine_data.target

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Breast cancer

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data


# data_set_name = 'brest_cancer'
# n_clusters = 2

# from sklearn.datasets import load_breast_cancer
# cancer_data = load_breast_cancer()
# X = cancer_data.data
# Y = cancer_data.target

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## heart failure

https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data

# data_set_name = 'heart_failure_scale'
# n_clusters = 2

# hear_data_import = np.genfromtxt('small_data_file/heart_failure_clinical_records_dataset.csv', delimiter=',')
# X = hear_data_import[1:,:-1]
# Y = hear_data_import[1:,-1]


# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Ionosphere

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45
# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder


# data_set_name = 'ionosphere'
# n_clusters = 2

# # load dataset https://archive.ics.uci.edu/ml/datasets/ionosphere
# data = pd.read_csv("small_data_file/ionosphere.csv", header=None)
# col_names = list(data.columns)
# feature_names = list(col_names[:-1])
# target_names = ['bad', 'good']

# cols = list(data)
# cols.insert(0, cols.pop(cols.index(34)))
# data = data.loc[:, cols]
# data.columns = col_names

# X = np.float32(data.drop(0, axis=1).values)
# Y = data[0].values


# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## Mushroom

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder


# data_set_name = 'Mushroom_sample'
# n_clusters = 2

# #https://github.com/thomberg1/SmallDatasetsAnalysis/blob/master/mushroom_ML_classifier.ipynb

# dataframe = pd.read_csv("small_data_file/agaricus-lepiota.csv", header=None)

# target_names = ['edible', 'poisonous']

# #Transform data form categorial to numeric
# cat_variables = [i for i in dataframe.columns if dataframe[i].dtype == 'object']
# encods = [LabelEncoder() for col in cat_variables]
# for i, col in enumerate(cat_variables):
#     dataframe[col] = encods[i].fit_transform(dataframe[col].astype(str))

# n = 200
# X = np.float32(dataframe.values[:,1:])
# print(X.shape)
# Y = np.int64(dataframe.values[:,0])
# X,Y = shuffle(X,Y,random_state=0)

# Y = Y[:n]
# X = X[:n,:]

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)


## MNIST

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45

# rep = 5

# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# import torchvision

# data_set_name = 'mnist_sample_2'
# n_clusters = 2

# class_labels_list = [0,1]
# num_samples = 250

# data = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=None)
# print('MNIST loaded dataset', data)

# X_full = torch.tensor(data.test_data.reshape(data.test_data.shape[0],-1))
# Y_full = torch.tensor(data.test_labels)

# #pick classes in class_labels_list, num_samples from each
# x_idx = torch.tensor([])
# for i in class_labels_list:
#     x_idx = torch.cat((x_idx,(Y_full==i).nonzero(as_tuple=True)[0][:num_samples]))
# print('Number of total samples ', x_idx.shape)

# shuffle_idx = torch.randperm(x_idx.shape[0])
# x_idx = x_idx[shuffle_idx].long()
# X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()
# Y = Y_full[x_idx].detach().cpu().numpy()


# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)
# # run_NN(X,Y,N_test,N_lab,rep,scale=False,embedding_size = 2)


MNIST loaded dataset Dataset MNIST
    Number of datapoints: 10000
    Root location: ./data
    Split: Test
Number of total samples  torch.Size([500])


  X_full = torch.tensor(data.test_data.reshape(data.test_data.shape[0],-1))
  Y_full = torch.tensor(data.test_labels)
  X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()


## CIFAR

In [None]:
# N_unlab = 0.5
# N_lab = 0.3
# N_test = 0.2

# rep = 2
# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# import torchvision

# data_set_name = 'cifar_sample_2'
# n_clusters = 2

# class_labels_list = [0,1]
# num_samples = 250

# data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=None)
# print('CIFAR10 loaded dataset', data)

# X_full = torch.tensor(data.data.reshape(data.data.shape[0],-1))
# Y_full = torch.tensor(data.targets)

# #pick classes in class_labels_list, num_samples from each
# x_idx = torch.tensor([])
# for i in class_labels_list:
#     x_idx = torch.cat((x_idx,(Y_full==i).nonzero(as_tuple=True)[0][:num_samples]))
# print('Number of total samples ', x_idx.shape)

# shuffle_idx = torch.randperm(x_idx.shape[0])
# x_idx = x_idx[shuffle_idx].long()
# X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()
# Y = Y_full[x_idx].detach().cpu().numpy()

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)

# create_embedddings(X,Y,N_lab,N_test,rep,gamma_list,data_set_name,scale = True,embedding_size= 10)
# choose_best_gamma(rep,gamma_list,data_set_name,scale = True)
# linear_models(rep,data_set_name,scale = True)
# all_benchmark_list = run_benchmark(X,Y,N_test,N_lab,rep)

# run_NN(X,Y,N_test,N_lab,rep,scale=True,embedding_size = 10)


In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45


# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# import torchvision

# data_set_name = 'cifar_sample_5'
# data_set_name = 'SVHN_sample_5'
# n_clusters = 5

# class_labels_list = [0,1,2,3,4,5]
# num_samples = 50

# data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=None)
# print('CIFAR10 loaded dataset', data)

# X_full = torch.tensor(data.data.reshape(data.data.shape[0],-1))
# Y_full = torch.tensor(data.targets)

# #pick classes in class_labels_list, num_samples from each
# x_idx = torch.tensor([])
# for i in class_labels_list:
#     x_idx = torch.cat((x_idx,(Y_full==i).nonzero(as_tuple=True)[0][:num_samples]))
# print('Number of total samples ', x_idx.shape)

# shuffle_idx = torch.randperm(x_idx.shape[0])
# x_idx = x_idx[shuffle_idx].long()
# X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()
# Y = Y_full[x_idx].detach().cpu().numpy()


# _ = run_de_noising_experiment(X,Y,noise = 0.1,N_lab=N_lab,N_test=N_test,rep=5,gamma_list=gamma_list,data_set_name=data_set_name,epochs = 500)

## SVHN

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45


# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# import torchvision

# data_set_name = 'SVHN_sample_5'
# n_clusters = 5

# class_labels_list = [0,1,2,3,4,5]
# num_samples = 50

# data = torchvision.datasets.SVHN(root='./data',download=True)
# print('SVHN loaded dataset', data)

# X_full = torch.tensor(data.data.reshape(data.data.shape[0],-1))
# Y_full = torch.tensor(data.labels)

# #pick classes in class_labels_list, num_samples from each
# x_idx = torch.tensor([])
# for i in class_labels_list:
#     x_idx = torch.cat((x_idx,(Y_full==i).nonzero(as_tuple=True)[0][:num_samples]))
# print('Number of total samples ', x_idx.shape)

# shuffle_idx = torch.randperm(x_idx.shape[0])
# x_idx = x_idx[shuffle_idx].long()
# X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()
# Y = Y_full[x_idx].detach().cpu().numpy()

# all_mse_list = run_de_noising_experiment(X,Y,noise = 0.1,N_lab=N_lab,N_test=N_test,rep=5,gamma_list=gamma_list,data_set_name=data_set_name,epochs = 500)

In [None]:
# N_unlab = 0.5
# N_lab = 0.05
# N_test = 0.45


# ### Get data
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from collections import defaultdict
# import torchvision

# data_set_name = 'SVHN_sample_2'
# n_clusters = 2

# class_labels_list = [0,1]
# num_samples = 250

# data = torchvision.datasets.SVHN(root='./data',download=True)
# print('SVHN loaded dataset', data)

# X_full = torch.tensor(data.data.reshape(data.data.shape[0],-1))
# Y_full = torch.tensor(data.labels)

# #pick classes in class_labels_list, num_samples from each
# x_idx = torch.tensor([])
# for i in class_labels_list:
#     x_idx = torch.cat((x_idx,(Y_full==i).nonzero(as_tuple=True)[0][:num_samples]))
# print('Number of total samples ', x_idx.shape)

# shuffle_idx = torch.randperm(x_idx.shape[0])
# x_idx = x_idx[shuffle_idx].long()
# X = torch.tensor(X_full[x_idx], dtype=torch.long).detach().cpu().numpy()
# Y = Y_full[x_idx].detach().cpu().numpy()



# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)

# # create_embedddings(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)
# choose_best_gamma(rep,gamma_list,data_set_name,scale = True)
# linear_models(rep,data_set_name)
# all_benchmark_list = run_benchmark(X,Y,N_test,N_lab,rep)

# run_NN(X,Y,N_test,N_lab,rep,scale=True,embedding_size = 2)

## Parkinson’s Disease Classification

https://github.com/imadtoubal/Parkinson-s-Disease-Classification-from-Speech-Data

In [None]:
# N_unlab = 0.5
# N_lab = 0.1
# N_test = 0.4

# # ### Get data


# data_set_name = 'pd_speech'
# n_clusters = 2

# df = pd.read_csv('small_data_file/pd_speech_features.txt')
# df.drop(['id'], 1, inplace=True)
# X = np.array(df.drop(['class'], 1))

# print(X.shape)
# Y = np.array(df['class'])

# scaling = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1)).fit(X)
# X = scaling.transform(X)

# # run_experiments(X,Y,N_lab,N_test,rep,gamma_list,data_set_name)
