In [1]:
# Maths and data management
import math
from math import pi
import numpy as np
import scipy as sp
import pandas as pd
from numba import jit
import pickle
import time
from tqdm.auto import tqdm

# PGMC
from pgmc import KPGMC, PGMC

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sns
# Plotting configuration
%matplotlib inline
sns.set()
sns.set_context("poster")
sns.set_style("ticks")

# ML toolkit
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsOneClassifier,OneVsRestClassifier

# Datasets

In [2]:
def get_iris():
    """
    Load Iris with some restrictions.
    """
    iris = datasets.load_iris()
    X_ = iris.data[:,:2]
    Y_ = iris.target
    X = np.array([X_[i]+np.array([0,3.3]) for i in range(len(X_)) if Y_[i] != 2])
    y = np.array([1 if i==0 else 0 for i in Y_ if i != 2])
    size_max = max([np.sqrt(sum(x**2)) for x in X])
    X = np.array([[i/size_max for i in x] for x in X],dtype=complex)
    
    u = sum(X)/len(X)
    X = X-u + np.array([0.013,0.])
    return X,y

def get_mnist(features=5, nb_classes=2, path="mnist.pkl"):
    """
    Load MNIST, eliminate some classes and apply pca.
    """
    pca = PCA(n_components=features)
    (X,y) = pickle.load(open(path,"rb"))
    X = np.array([X[i] for i in range(len(X)) if y[i] in list(range(nb_classes))])
    y = np.array([y[i] for i in range(len(y)) if y[i] in list(range(nb_classes))])
    if len(X[0]) > features:
        X = pca.fit_transform(X)
    return X,y

# Tools

In [3]:
def metrics(y_true,y_pred,average="binary",silent=False):
    """
    Compute and return a lot of metrics given prediction and ground truths.
    """
    if len(list(set(y_pred))) > 2 and average == "binary":
        average = "micro"
    accuracy = sk.metrics.accuracy_score(y_true,y_pred)
    precision = sk.metrics.precision_score(y_true,y_pred,average=average,zero_division=0)
    recall = sk.metrics.recall_score(y_true,y_pred,average=average,zero_division=0)
    ba = sk.metrics.balanced_accuracy_score(y_true, y_pred)
    mse = sk.metrics.mean_squared_error(y_true, y_pred)
    confusion = sk.metrics.confusion_matrix(y_true,y_pred)
    fmeas = sk.metrics.f1_score(y_true,y_pred,average=average,zero_division=0)
    if not silent:
        print("Accuracy : ",accuracy)
        print("Precision : ",precision)
        print("Recall : ",recall)
        print("BA : ",ba)
        print("MSE : ",mse)
        print("F-measure : ",fmeas)
        print("Confusion matrix : \n",confusion)
    return [accuracy,precision,recall,ba,mse,fmeas,confusion]


def imbalance(X,y,ratio):
    """
    Articificially imbalance a dataset.
    """
    X0 = np.array([[X[i],y[i]] for i in range(len(y)) if y[i] == 0],dtype=object)
    X1 = np.array([[X[i],y[i]] for i in range(len(y)) if y[i] == 1],dtype=object)
    current = len(X1)/(len(X0)+len(X1))
    if current > ratio:
        desired_len = int(len(X0)*ratio/(1-ratio))
        c1 = np.random.choice(list(range(len(X1))),desired_len,replace=False)
        X1 = X1[c1]
    else:
        desired_len = int(len(X1)*(1-ratio)/ratio)
        c0 = np.random.choice(list(range(len(X0))),desired_len,replace=False)
        X0 = X0[c0]
    data = np.concatenate([X0,X1])
    X_ = np.array(list(data[:,0]))
    y_ = np.array(list(data[:,1]))
    return X_,y_

# Task manager
The task manager will run all the classifier provided on all the datasets provided with k-fold cross validation.

In [4]:
class Task:
    def __init__(self, repeat=5):
        self.datasets = []
        self.clf = []
        self.repeat=repeat

    def todo(self):
        ret = []
        for data_name,X,y in self.datasets:
            for name_clf,clf in self.clf:
                ret.append([data_name,X,y,name_clf,clf,self.repeat])
        return ret

    def add_data(self,name,X,y):
        self.datasets.append((name,X,y))

    def add_clf(self,name,clf):
        self.clf.append((name,clf))

    def run_aux(self,clf,X_train,y_train,X_test,y_test):
        T = time.time()
        clf.fit(X_train,y_train)
        T = time.time() - T
        y_pred = clf.predict(X_test)
        l = metrics(y_test,y_pred,silent=True,average="binary" if len(set(y_test))==2 else "micro")
        return [len(y_train),len(X_train[0])]+l[0:6]+[T]

    def run(self):
        res = []
        for name_data,X,y,name_clf,clf,repeat in tqdm(self.todo()):
                rs = sk.model_selection.ShuffleSplit(n_splits=repeat, test_size=0.3, random_state=0)
                for i, (train_index, test_index) in tqdm(enumerate(rs.split(X)),total=repeat,desc=f"{repeat}-fold CrossValidation of {name_clf} on {name_data}",leave=False):
                    res.append(self.run_aux(clf(),X[train_index],y[train_index],X[test_index],y[test_index])+[name_clf,name_data])
        return pd.DataFrame(res,columns=["size","features","acc","precision","recall","ba","mse","f1","time","clf","data"])

# Example 1 : Running KPGMC with RBF kernel on MNIST-1D
Here we make the classification on MNIST-1D with a rbf kernel. There is three variants : 
    + one that uses normalization embedding
    + one that uses orthogonal embedding
    + one that uses normalization embedding and a one vs one strategy for the multi-class classification

In [5]:
@jit
def rbf_kernel(x,y):
    return np.exp(-np.linalg.norm(x-y)**2)

task = Task(repeat=5) # 5-fold crossvalidation

## DATASETS
X,y = get_mnist(40,10,path="../mnist1d.pkl")
X,y = X[::10],y[::10]
task.add_data("MNIST-1D 0.5|0.5",X,y)

## CLASSIFIERS
task.add_clf("KPGMC rbf",lambda :KPGMC(kernel=rbf_kernel,class_weight_method="optimize"))
task.add_clf("KPGMC rbf one vs one",lambda :OneVsOneClassifier(KPGMC(kernel=rbf_kernel,class_weight_method="optimize")))
task.add_clf("KPGMC rbf orthogonal",lambda :KPGMC(kernel=rbf_kernel,embedding="orthogonal",class_weight_method="optimize"))

data_small = task.run()

  0%|          | 0/3 [00:00<?, ?it/s]

5-fold CrossValidation of KPGMC rbf on MNIST-1D 0.5|0.5:   0%|          | 0/5 [00:00<?, ?it/s]

5-fold CrossValidation of KPGMC rbf one vs one on MNIST-1D 0.5|0.5:   0%|          | 0/5 [00:00<?, ?it/s]

5-fold CrossValidation of KPGMC rbf orthogonal on MNIST-1D 0.5|0.5:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
data_small.groupby(["data","clf"]).mean(numeric_only=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,features,acc,precision,recall,ba,mse,f1,time
data,clf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MNIST-1D 0.5|0.5,KPGMC rbf,350.0,40.0,0.362667,0.362667,0.362667,0.358632,10.801333,0.362667,0.439384
MNIST-1D 0.5|0.5,KPGMC rbf one vs one,350.0,40.0,0.34,0.34,0.34,0.338835,12.753333,0.34,2.51376
MNIST-1D 0.5|0.5,KPGMC rbf orthogonal,350.0,40.0,0.346667,0.346667,0.346667,0.345198,11.088,0.346667,0.544744


In [7]:
plt.figure(figsize=(60,6))
plt.suptitle('MNIST-1D 0.5|0.5', fontsize=50)
sns.barplot(data=data[data["data"]=="MNIST-1D 0.5|0.5"], x="clf", y="acc")

NameError: name 'data' is not defined

<Figure size 4320x432 with 0 Axes>

# Full classification task

In [None]:
@jit
def rbf_kernel(x,y):
    return np.exp(-np.linalg.norm(x-y)**2)

@jit
def exp_kernel(x,y):
    return np.exp(((np.dot(x,y)+1)/2)*100)

@jit
def gauss_kernel(x,y):
    return np.exp(np.dot(x,y)**2)



task = Task(repeat=5)

## DATASETS
X,y = get_mnist(40,10,path="../mnist1d.pkl")
X,y = X[::7],y[::7]
task.add_data("MNIST-1D 0.5|0.5",X,y)

X,y = imbalance(*get_mnist(40,2,path="../mnist1d.pkl"),0.25) # i features 2 classes
task.add_data("MNIST-1D 0.75|0.25",X,y)

X,y = imbalance(*get_mnist(40,2,path="../mnist1d.pkl"),0.1) # i features 2 classes
task.add_data("MNIST-1D 0.9|0.1",X,y)

## CLASSIFIERS
task.add_clf("SVM linear",lambda :SVC(kernel="linear"))
task.add_clf("SVM linear balanced",lambda :SVC(kernel="linear",class_weight="balanced"))
task.add_clf("SVM rbf",lambda :SVC())
task.add_clf("SVM rbf balanced",lambda :SVC(class_weight="balanced"))
task.add_clf("Tree",lambda :DecisionTreeClassifier())
task.add_clf("KPGMC ortho",lambda :KPGMC(embedding="orthogonal",class_weight_method="optimize"))
task.add_clf("KPGMC ortho auto",lambda :KPGMC(embedding="orthogonal",class_weight_method="auto"))
task.add_clf("KPGMC ortho one vs one",lambda :OneVsOneClassifier(KPGMC(embedding="orthogonal",class_weight_method="optimize")))
task.add_clf("KPGMC exp",lambda :KPGMC(kernel=exp_kernel, class_weight_method="optimize"))
task.add_clf("KPGMC gauss",lambda :KPGMC(kernel=gauss_kernel,class_weight_method="optimize"))
task.add_clf("KPGMC rbf",lambda :KPGMC(kernel=rbf_kernel,class_weight_method="optimize"))
task.add_clf("KPGMC rbf auto",lambda :KPGMC(kernel=rbf_kernel,class_weight_method="auto"))
task.add_clf("PGMC normal",lambda :PGMC(embedding="normal",class_weight_method="auto"))
task.add_clf("PGMC ortho",lambda :PGMC(embedding="orthogonal",class_weight_method="auto", device="cpu"))
task.add_clf("PGMC stereo",lambda :PGMC(embedding="stereo",class_weight_method="auto"))
task.add_clf("PGMC ortho one vs one",lambda :OneVsOneClassifier(PGMC(embedding="orthogonal",class_weight_method="auto", device="cpu")))

data = task.run()

In [None]:
data.groupby(["data","clf"]).mean(numeric_only=False)

In [None]:
for dataset in list(set(list(data["data"]))):
    plt.figure(figsize=(60,6))
    plt.suptitle(f"{dataset}", fontsize=50)
    sns.barplot(data=data[data["data"]==dataset], x="clf", y="ba")