### Experiments for Prokhorov Distances

In [12]:
%matplotlib widget
import gudhi as gd
from gudhi import hera
from gudhi import representations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import sklearn
from sklearn import manifold
from sklearn_extra.cluster import KMedoids
from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors       import KNeighborsClassifier
from tqdm.notebook import tqdm
import scipy.optimize as spo 
from scipy.optimize import minimize
import kmedoids

import os
import math
import time

Helper functions for loading the datasets used in the experiments.

In [7]:

def diag_to_array(data):
    """Convert a nested mapping of diagrams into nested lists of numpy arrays.

    `data` is indexed first by a dimension key and then by the diagram index
    written as a string ("0", "1", ...). The number of diagrams is read from
    the "0" entry and reused for every dimension key.

    Returns a list with one entry per key of `data` (in iteration order);
    each entry is a list holding one `np.ndarray` per diagram index.
    """
    diagram_count = len(data["0"].keys())
    return [
        [np.array(data[dim][str(idx)]) for idx in range(diagram_count)]
        for dim in data.keys()
    ]

def diag_to_dict(D):
    """Flatten a two-level collection of diagrams into a single dictionary.

    For every key `f` of `D`, `D[f]` is converted with `diag_to_array`; the
    list found at position `dim` of that result is stored under the key
    "<dim>_<f>".
    """
    flat = {}
    for name in D.keys():
        for dim, diagrams in enumerate(diag_to_array(D[name])):
            flat[str(dim) + "_" + name] = diagrams
    return flat

def load_PI_data(dim = "1", noise = "1",
                 path = "../PersistenceImages/matlab_code/sixShapeClasses/ToyData_PD_TextFiles/"):
    """Load diagrams and shape labels from a directory of `.txt` files.

    File names are split on "_"; the fields at positions 2, 3, 4 and 5 are
    read as noise level ("n<noise>"), index, shape class and dimension
    ("<dim>.txt") respectively.

    Parameters
    ----------
    dim : str or int
        Dimension to select; a file matches when its last field is "<dim>.txt".
    noise : str or int
        Noise level to select; a file matches when its noise field is "n<noise>".
    path : str
        Directory to scan. Defaults to the location used originally.

    Returns
    -------
    (persistence_list, labels)
        Two aligned lists: the arrays read with `np.loadtxt` and the
        shape-class field of each selected file, ordered by file name.
    """
    wanted_noise = "n{}".format(noise)
    wanted_dim = "{}.txt".format(dim)

    # Close the directory iterator deterministically, and sort by name so the
    # returned order does not depend on the filesystem.
    with os.scandir(path) as entries:
        candidates = sorted((entry.name, entry.path) for entry in entries
                            if entry.name.endswith(".txt"))

    persistence_list, labels = [], []
    for name, file_path in candidates:
        fields = name.split(sep="_")
        if len(fields) < 6:
            # Not a diagram file following the naming scheme (e.g. a README).
            continue
        if fields[2] == wanted_noise and fields[5] == wanted_dim:
            # Parse only the selected files instead of every file in the folder.
            persistence_list.append(np.loadtxt(file_path))
            labels.append(fields[4])
    return persistence_list, labels

def load_3d_data(path = "../sklearn-tda/example/3DSeg/"):
    """Load the training labels and diagrams stored under `path`.

    Parameters
    ----------
    path : str
        Directory prefix (with trailing separator) containing "train.csv"
        and "train_diag.hdf5". Defaults to the location used originally.

    Returns
    -------
    (train_diag, train_lab)
        `train_diag` is the dictionary built by `diag_to_dict` from the HDF5
        file; `train_lab` is the DataFrame read from "train.csv".
    """
    train_lab = pd.read_csv(path + "train.csv")
    # diag_to_dict copies every dataset into numpy arrays, so the file handle
    # can be released as soon as the conversion is done.
    with h5py.File(path + "train_diag.hdf5", "r") as diag_file:
        train_diag = diag_to_dict(diag_file)
    return train_diag, train_lab

def load_cubical_data(filt="upper", d=1):
    """Load the cubical dataset and its labels from the working directory.

    Reads the "list_of_lists" group of 'cubical_data_<filt>_<d>d.hdf5', whose
    members are named "0", "1", ..., and the labels stored in
    'cubical_labels_<filt>_<d>d.txt'.

    Parameters
    ----------
    filt : str
        Value substituted for the first placeholder of both file names.
    d : int
        Value substituted for the second placeholder of both file names.

    Returns
    -------
    (X, y)
        `X` is a list of numpy arrays, one per group member in index order;
        `y` is the array returned by `np.loadtxt` for the labels file.
    """
    # Each dataset is copied into memory with np.array, so the HDF5 file can
    # be closed before returning instead of leaking the handle.
    with h5py.File('cubical_data_{}_{}d.hdf5'.format(filt,d), "r") as data_file:
        group = data_file["list_of_lists"]
        X = [np.array(group[str(i)]) for i in range(len(group))]
    y = np.loadtxt('cubical_labels_{}_{}d.txt'.format(filt,d))
    return X,y

In [8]:
# Load the cubical dataset with the default arguments (filt="upper", d=1):
# X is a list of arrays, y the array of labels.
X,y = load_cubical_data()

In [13]:
# Hold out half of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

## K-neighbors

In [14]:
# 5-nearest-neighbours classifier; metric="precomputed" means it is fed a
# distance matrix rather than raw feature vectors.
knn = KNeighborsClassifier(n_neighbors=5, metric = "precomputed")

Perform a grid search with 5-fold cross-validation over the integer slopes 10–99 of the Prokhorov parameter function.

In [16]:
# Candidate slopes: each value p is handed to ProkhorovDistance as coefs = [0, p].
list_of_params = range(10,100)
best_param, best_score = 0, 0

tic = time.time()
for p in tqdm(list_of_params):
    # Pairwise distance matrix of the training set for the current slope.
    PD = gd.representations.ProkhorovDistance(n_jobs = -1, coefs = np.array([0,p]))
    PD.fit(X_train)
    P_train = PD.transform(X_train)

    # Score the precomputed-metric k-NN classifier with 5-fold cross-validation.
    cv_scores = cross_val_score(knn, P_train, y_train, cv=5)
    mean_score = np.mean(cv_scores)

    # Strict comparison: on ties the earliest (smallest) slope is kept.
    if mean_score > best_score:
        best_param, best_score = p, mean_score

print("grid search completed in {}s, best parameter is {} with score {}".format(time.time()-tic, best_param, best_score))

  0%|          | 0/90 [00:00<?, ?it/s]

KeyboardInterrupt: 