In [1]:
%matplotlib widget
import gudhi as gd
from gudhi import hera
from gudhi import representations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import sklearn
from sklearn import manifold
from sklearn_extra.cluster import KMedoids
from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import scipy.optimize as spo 
from scipy.optimize import minimize
import kmedoids
import networkx as nx
from networkx import bipartite
import persim

import os
import math
import time

In [2]:

def diag_to_array(data):
    dataset, num_diag = [], len(data["0"].keys())
    for dim in data.keys():
        X = []
        for diag in range(num_diag):
            pers_diag = np.array(data[dim][str(diag)])
            X.append(pers_diag)
        dataset.append(X)
    return dataset

def diag_to_dict(D):
    X = dict()
    for f in D.keys():
        df = diag_to_array(D[f])
        for dim in range(len(df)):
            X[str(dim) + "_" + f] = df[dim]
    return X 

In [3]:
path = "../sklearn-tda/example/3DSeg/"
train_lab  = pd.read_csv(path+"train.csv")
train_diag = diag_to_dict(h5py.File(path+"train_diag.hdf5", "r"))

In [4]:
from sklearn.preprocessing import LabelEncoder

# Size of test set
test_size = 0.95

# Shuffle dataset and pick points for test set
train_num_pts        = train_lab.shape[0]    
perm                 = np.random.RandomState(seed=42).permutation(train_num_pts)
limit                = np.int(test_size * train_num_pts)
test_sub, train_sub  = perm[:limit], perm[limit:]

# Create train and test labels with LabelEncoder from scikit-learn
train_full_labels  = train_lab["part"]
le                 = LabelEncoder()
train_labels       = np.array(le.fit_transform(train_full_labels[train_sub]))
test_labels        = np.array(le.transform(train_full_labels[test_sub]))

# Create train and test sets of persistence diagrams
train_full_diag    = train_diag["1_geodesic"]
train_diag         = [train_full_diag[i] for i in train_sub]
test_diag          = [train_full_diag[i] for i in test_sub]

# Print sizes
train_num_pts, test_num_pts = len(train_sub), len(test_sub)
print("Number of train points = " + str(train_num_pts))
print("Number of test  points = " + str(test_num_pts))
print(perm)

Number of train points = 285
Number of test  points = 5415
[1436  748 4596 ... 5226 5390  860]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  limit                = np.int(test_size * train_num_pts)


In [5]:
train_labels

array([3, 0, 3, 1, 0, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 3, 1, 0, 1, 0, 0, 0,
       0, 0, 3, 0, 1, 0, 0, 0, 1, 3, 1, 0, 0, 1, 1, 1, 2, 1, 0, 1, 0, 1,
       0, 0, 0, 2, 3, 1, 0, 1, 1, 0, 1, 0, 0, 0, 2, 0, 1, 3, 1, 3, 3, 0,
       0, 0, 0, 1, 3, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 1, 0, 3, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 3, 0, 1, 1, 3, 0, 2, 1, 0, 3, 0,
       1, 3, 0, 0, 0, 3, 0, 1, 2, 1, 3, 0, 0, 1, 0, 0, 2, 1, 1, 1, 1, 3,
       0, 1, 2, 3, 0, 1, 0, 2, 0, 0, 2, 0, 3, 1, 1, 0, 1, 0, 0, 2, 1, 3,
       1, 2, 2, 0, 0, 0, 0, 0, 1, 3, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 2, 0,
       1, 0, 3, 2, 2, 1, 3, 1, 0, 0, 1, 1, 1, 0, 0, 3, 3, 0, 1, 3, 3, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 0, 2, 3, 3, 0, 0, 3, 1, 1,
       0, 1, 0, 0, 0, 1, 3, 1, 1, 1, 3, 0, 1, 0, 1, 3, 1, 1, 0, 3, 1, 0,
       1, 0, 1, 0, 1, 1, 2, 1, 0, 2, 1, 1, 3, 3, 1, 1, 0, 3, 1, 0, 1, 0,
       0, 0, 1, 3, 0, 0, 1, 0, 3, 3, 3, 3, 1, 1, 1, 1, 0, 1, 3, 0, 1])

In [6]:
D0=[(0,tuple(pt)) for pt in train_diag[0]]
D1=[(1,tuple(pt)) for pt in train_diag[2]]
D01 = D0+D1

In [7]:
f, (ax1,ax2) = plt.subplots(1,2, figsize = (15,10))
gd.plot_persistence_barcode(D01, axes=ax1)
#gd.plot_persistence_diagram(D01, axes=ax2)
db, (m, D) = persim.bottleneck(train_diag[0],train_diag[2], matching=True)
persim.wasserstein_matching(train_diag[0],train_diag[2], m, ax=ax2)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [8]:
def cospan_from_matching(X, Y, m):
    D1 = X
    D2 = Y
    for y in Y:
        D1 = np.append(D1, np.array([[(y[0]+y[1])/2,(y[0]+y[1])/2]]), axis=0)
    for x in X:
        D2 = np.append(D2, np.array([[(x[0]+x[1])/2,(x[0]+x[1])/2]]), axis=0)
    Clist = []
    for i,j in m:
        Clist.append([min(D1[i][0],D2[j][0]),min(D1[i][1],D2[j][1])])
    C = np.array(Clist)
    return C

In [9]:
f,ax = plt.subplots(1,1)
A = train_diag[0]
B = train_diag[1]
db, (m, D) = persim.bottleneck(A,B, matching=True)
C=cospan_from_matching(A, B, m)
persim.plot_diagrams([A, B, C])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
def D(r_vals, t):
    return np.min(np.where(np.array(r_vals)<=t))

In [13]:
#r_vals = []
n_points=len(train_diag[0])+len(train_diag[1])
#compute the D function
d_vals = np.linspace(0,n_points,n_points+1)
r_vals = [gd.prokhorov_distance(train_diag[0], train_diag[1], np.array([r])) for r in d_vals]
f,ax = plt.subplots(1,1)
for i in range(1,n_points+1):
    ax.plot((r_vals[i],r_vals[i-1]),(i, i), c='tab:orange', ls='-', linewidth = 1.0, label = '$D_{X,Y}$')
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
print(len(train_diag[0]))
print(len(train_diag[1]))

10
7


In [15]:
def prokhorov(I1, I2, matching=False):
    g = nx.Graph()
    #add nodes for points
    g.add_nodes_from(list(zip(range(0,len(I1)),[0]*len(I1))), bipartite=0)
    g.add_nodes_from(list(zip(range(0,len(I2)),[1]*len(I2))), bipartite=1)
    #add all edges with cost = max.norm
    for v in [n for n, d in g.nodes(data=True) if d['bipartite']==0]:
        for w in [n for n, d in g.nodes(data=True) if d['bipartite']==1]:
            cost = max(abs(I1[v[0],0]-I2[w[0],0]),abs(I1[v[0],1]-I2[w[0],1]))#/2#*(1-I1[v[0],1]+I1[v[0],0])*(1-I2[w[0],1]+I2[w[0],0])
            #print(cost)
            g.add_edge(v,w, weight=cost)#nudge towards matching off-diagonal points?

    #add points on the diagonal
    g.add_nodes_from(list(zip(range(len(I1),len(I1)+len(I2)),[0]*len(I2))), bipartite=0)
    g.add_nodes_from(list(zip(range(len(I2),len(I2)+len(I1)),[1]*len(I1))), bipartite=1)
    #add edges to the diagonal with cost = max.norm
    for v in range(0,len(I1)):
        cost = (I1[v,1]-I1[v,0])/2
        g.add_edge((v,0),(len(I2)+v,1), weight=cost)
    for v in range(0,len(I2)):
        cost = (I2[v,1]-I2[v,0])/2
        g.add_edge((v,1),(len(I1)+v,0), weight=cost)
    for w in range(0,len(I1)):
        for v in range(0,len(I2)):
            g.add_edge((len(I1)+v,0), (len(I2)+w,1),weight = 0)

    #Get weights and sort them
    weights = nx.get_edge_attributes(g, "weight")
    #sorted_weights = np.sort(np.fromiter(weights.values(),dtype=float))

    #perform binary search
    L=0
    R=max(weights.values())#len(sorted_weights)-1
    print(max(weights.values()))
    eps = 0
    match = None
    while(R-L>0.00001):
        #m = 
        old_eps=eps
        eps = (L+R)/2
        geps=nx.create_empty_copy(g)
        for v in [n for n, d in g.nodes(data=True) if d['bipartite']==0]:
            for w in [n for n, d in g.nodes(data=True) if d['bipartite']==1]:
                if (v,w) in g.edges():
                    if (v,w) in weights:
                        if weights[(v,w)]<=eps:
                            geps.add_edge(v,w)
                    else:
                        if weights[(w,v)]<=eps:
                            geps.add_edge(v,w)
        #Compute matching on subgraph geps
        old_match = match
        match = nx.bipartite.hopcroft_karp_matching(geps,[n for n, d in g.nodes(data=True) if d['bipartite']==0])
        #Compute the amount of unmatched points
        match = [(x[0],y[0]) for x,y in match.items() if x[1]==0]
        number_of_unmatched_points = len(I1)+len(I2)-len(match)
        
        #print("for t = {} the matching has size {}".format(eps, len(match)))
        
        if number_of_unmatched_points > eps:
            L = eps
        elif number_of_unmatched_points < eps:
            R = eps
        
        #if R-L <=1:
         #   if R==L:
          #      break
           # else:
            #    if number_of_unmatched_points > eps:
             #       L=R
              #  elif number_of_unmatched_points < eps:
               #     R=L
        #else:
         #   if number_of_unmatched_points > eps:
          #      L = m
           # elif number_of_unmatched_points < eps:
            #    R = m
            #else:
             #   break
    
    dist = old_eps#min(amount, sorted_weights[m+1]) if m > 0 else eps
    
    if matching:
        return dist, old_match
    else:
        return dist

In [22]:
prokhorov(train_diag[0],train_diag[2])

1.483767


0.12913002143096924