### Experiments for Prokhorov Distances

In [1]:
%matplotlib widget
import gudhi as gd
from gudhi import hera
from gudhi import representations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import sklearn
from sklearn import manifold
from sklearn_extra.cluster import KMedoids
from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit
from tqdm.notebook import tqdm
import scipy.optimize as spo 
from scipy.optimize import minimize
import kmedoids
from concurrent.futures import ProcessPoolExecutor
import itertools
from utils import run_single_fold


import os
import math
import time

Some functions to load data

In [2]:

def diag_to_array(data):
    dataset, num_diag = [], len(data["0"].keys())
    for dim in data.keys():
        X = []
        for diag in range(num_diag):
            pers_diag = np.array(data[dim][str(diag)])
            X.append(pers_diag)
        dataset.append(X)
    return dataset

def diag_to_dict(D):
    X = dict()
    for f in D.keys():
        df = diag_to_array(D[f])
        for dim in range(len(df)):
            X[str(dim) + "_" + f] = df[dim]
    return X 

def load_PI_data(dim = "1", noise = "1"):
    path = "../PersistenceImages/matlab_code/sixShapeClasses/ToyData_PD_TextFiles/"
    PDs = {}
    for entry in os.scandir(path):
        if(entry.name.endswith(".txt")):
            PDs[entry.name] = {"noise":entry.name.split(sep="_")[2],
                               "index":entry.name.split(sep="_")[3],
                               "shape_class":entry.name.split(sep="_")[4],
                               "dimension":entry.name.split(sep="_")[5],
                               "data":np.loadtxt(entry.path)}
    persistence_list = np.array([pd["data"] for pd in PDs.values() if(pd["noise"]=="n{}".format(noise) and pd["dimension"]=="{}.txt".format(dim))])
    labels = np.array([int(pd["shape_class"]) for pd in PDs.values() if(pd["noise"]=="n{}".format(noise) and pd["dimension"]=="{}.txt".format(dim))], dtype = int)
    return persistence_list, labels

def load_3d_data():
    path = "../sklearn-tda/example/3DSeg/"
    train_lab  = pd.read_csv(path+"train.csv")
    train_diag = diag_to_dict(h5py.File(path+"train_diag.hdf5", "r"))
    return np.array(train_diag["1_geodesic"]), np.array(train_lab["part"], dtype=int)

def load_cubical_data(filt="upper", d=1):
    f= h5py.File('cubical_data_{}_{}d.hdf5'.format(filt,d), "r")["list_of_lists"]
    X = []
    for i in range(0,len(f)):
        X.append(np.array(f[str(i)]))
    y = np.loadtxt('cubical_labels_{}_{}d.txt'.format(filt,d), dtype = int)
    return np.array(X),y

In [42]:
#X,y = load_PI_data(dim="0", noise="1")
#X,y = load_cubical_data(filt = "lower", d=0)
X,y = load_3d_data()

  return np.array(train_diag["1_geodesic"]), np.array(train_lab["part"], dtype=int)


In [43]:
X.shape

(5700,)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

## K-neighbors

In [45]:
knn = KNeighborsClassifier(n_neighbors=5, metric = "precomputed")

Perform a grid search with cross-validation among integer slopes for the Prokhorov parameter function 

In [46]:
list_of_params = range(10,100)
best_param = 0
best_score = 0

tic = time.time()
for p in tqdm(list_of_params):
    PD = gd.representations.ProkhorovDistance(n_jobs = -1, coefs = np.array([0,p]))
    PD.fit(X_train)
    P_train = PD.transform(X_train)
    cv_scores = cross_val_score(knn, P_train, y_train, cv=5)
    if np.mean(cv_scores)>best_score:
        best_param = p
        best_score = np.mean(cv_scores)
        
print("grid search completed in {}s, best parameter is {} with score {}".format(time.time()-tic, best_param, best_score)) 

  0%|          | 0/90 [00:00<?, ?it/s]

grid search completed in 40440.76908326149s, best parameter is 10 with score 0.9101754385964913


In [47]:
reps = [gd.representations.ProkhorovDistance(n_jobs=-1, coefs = np.array([0,best_param])),
        gd.representations.BottleneckDistance(n_jobs=-1),
        gd.representations.WassersteinDistance(n_jobs=-1, order=1, internal_p =1),
        gd.representations.WassersteinDistance(n_jobs=-1, order=2, internal_p =2)]

for r in reps:
    print("------------------"+str(r)+"------------------")
    tic = time.time()
    r.fit(X_train)
    D_matrix_train = r.transform(X_train)
    D_matrix_test = r.transform(X_test)
    print("distance computation done in ", time.time()-tic)
    cv_scores = cross_val_score(knn, D_matrix_train, y_train, cv=5)
    print("training accuracy: ",np.mean(cv_scores))
    knn.fit(D_matrix_train, y_train)
    score = knn.score(D_matrix_test, y_test)
    print("test accuracy: ",score)

------------------ProkhorovDistance(coefs=array([ 0, 10]), n_jobs=-1)------------------
distance computation done in  795.2427561283112
training accuracy:  0.9101754385964913
test accuracy:  0.9270175438596491
------------------BottleneckDistance(n_jobs=-1)------------------
distance computation done in  1252.453706741333
training accuracy:  0.9098245614035088
test accuracy:  0.9245614035087719
------------------WassersteinDistance(internal_p=1, n_jobs=-1)------------------
distance computation done in  838.5659482479095
training accuracy:  0.9042105263157895
test accuracy:  0.9312280701754386
------------------WassersteinDistance(internal_p=2, n_jobs=-1, order=2)------------------
distance computation done in  1740.7613213062286
training accuracy:  0.9059649122807019
test accuracy:  0.9298245614035088


## K-Medoids

In [48]:
km = kmedoids.KMedoids(n_clusters = len(np.unique(y)), metric="precomputed", init = "build", random_state = 42)

In [49]:
def retrieve_info(cluster_labels,y_train):
    # Initializing
    reference_labels = {}
    # For loop to run through each label of cluster label
    for i in range(len(np.unique(cluster_labels))):
        index = np.where(cluster_labels == i,1,0)
        num = np.bincount(y_train[index==1]).argmax()
        reference_labels[i] = num
    return reference_labels

def km_cv(km, rep, X, y, cv):
    rs = ShuffleSplit(n_splits=cv, test_size=1.0/cv, random_state=0)
    scores = []
    for train_index, test_index in rs.split(X):
        train_index = np.array(train_index)
        test_index = np.array(test_index)
        X_train_cv = X[train_index]
        y_train_cv = y[train_index]
        X_test_cv = X[test_index]
        y_test_cv = y[test_index]
        
        rep.fit(X_train_cv)
        km.fit(rep.transform(X_train_cv))
        label_dictionary = retrieve_info(km.labels_, y_train_cv)
        
        predicted_clusters = km.predict(rep.transform(X_test_cv))
        predicted_labels = [label_dictionary[predicted_clusters[i]] for i in range(0,len(predicted_clusters))]
            
        scores.append(accuracy_score(predicted_labels,y_test_cv))
    
    
    return scores

def km_cv_parallel(km, rep, X, y, cv):
    skf = StratifiedKFold(n_splits=cv, random_state=666, shuffle=True)
    rep.fit(X)
    D_matrix=rep.transform(X)
    
    NUM_WORKERS = None # if none uses all cores
    with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
        train_acc_log, valid_acc_log = zip(*executor.map(run_single_fold, 
                                                         itertools.repeat(D_matrix),
                                                         itertools.repeat(y),
                                                         [train_index for train_index, _ in skf.split(X, y)],
                                                         [valid_index for _, valid_index in skf.split(X, y)]))
        
    return valid_acc_log

In [50]:
list_of_params = range(10,100)
best_param = 0
best_score = 0

tic = time.time()
for p in tqdm(list_of_params):
    PD = gd.representations.ProkhorovDistance(n_jobs = -1, coefs = np.array([0,p]))

    cv_scores = km_cv_parallel(km, PD, X_train, y_train, 5)
    if np.mean(cv_scores)>best_score:
        best_param = p
        best_score = np.mean(cv_scores)
        
print("grid search completed in {}s, best parameter is {} with score {}".format(time.time()-tic, best_param, best_score)) 

  0%|          | 0/90 [00:00<?, ?it/s]

grid search completed in 41715.83107829094s, best parameter is 13 with score 0.46842105263157896


In [51]:
reps = [gd.representations.ProkhorovDistance(n_jobs=-1, coefs = np.array([0,best_param])),
        gd.representations.BottleneckDistance(n_jobs=-1),
        gd.representations.WassersteinDistance(n_jobs=-1, order=1, internal_p =1),
        gd.representations.WassersteinDistance(n_jobs=-1, order=2, internal_p =2)]

for r in reps:
    print("------------------"+str(r)+"------------------")
    tic = time.time()
    r.fit(X_train)
    D_matrix_train = r.transform(X_train)
    r.fit(np.concatenate((X_train,X_test),axis=0))
    D_matrix_test = r.transform(np.concatenate((X_train,X_test),axis=0))
    print("distance computation done in ", time.time()-tic)
    #cv_scores = km_cv_parallel(km, r, X_train, y_train, 5)
    #print("training accuracy: ",np.mean(cv_scores))
   

    km = kmedoids.KMedoids(n_clusters = len(np.unique(y)), metric="precomputed", init = "build", random_state = 42) # why init = 'build' ??
    km.fit(D_matrix_train)
    km.dict_ = {i: y_train[id] for i, id in enumerate(km.medoid_indices_)}

    predicted_train_labels = [km.dict_[id] for id in km.predict(D_matrix_train)]
    train_acc = accuracy_score(predicted_train_labels, y_train)
    print("training accuracy: ",train_acc)
    
    predicted_valid_labels = [km.dict_[id] for id in km.predict(D_matrix_test)][len(y_train):] # we only look at the labels for the new data
    valid_acc = accuracy_score(predicted_valid_labels, y_test) 
    print("test accuracy: ",valid_acc)

------------------ProkhorovDistance(coefs=array([ 0, 13]), n_jobs=-1)------------------
distance computation done in  1946.679748058319
training accuracy:  0.4792982456140351
test accuracy:  0.49859649122807015
------------------BottleneckDistance(n_jobs=-1)------------------
distance computation done in  3467.9827699661255
training accuracy:  0.4905263157894737
test accuracy:  0.48912280701754385
------------------WassersteinDistance(internal_p=1, n_jobs=-1)------------------
distance computation done in  2057.5558784008026
training accuracy:  0.40210526315789474
test accuracy:  0.4126315789473684
------------------WassersteinDistance(internal_p=2, n_jobs=-1, order=2)------------------
distance computation done in  5009.361773967743
training accuracy:  0.4592982456140351
test accuracy:  0.45649122807017545
