# Approximate Bottleneck Distance

In [1]:
import numpy as np
import scipy.linalg
import scipy.stats
import sklearn.metrics
from sklearn.cluster import KMeans
import numba
import matplotlib.pyplot as plt
import ot
import umap
import seaborn as sns

## Adding more example diagrams


In [2]:
from teaspoon.MakeData.PointCloud import testSetManifolds 
from teaspoon.TDA.Distance import dgmDist_Hera

Were are going to generate 50 small examples from each of the 6 classes

In [3]:
%%time
manifoldData = testSetManifolds(numDgms = 30, numPts = 300, permute = False, seed = 0)

Generating torus clouds...
Generating annuli clouds...
Generating cube clouds...
Generating three cluster clouds...
Generating three clusters of three clusters clouds...
Generating sphere clouds...
Finished generating clouds and computing persistence.

CPU times: user 59.4 s, sys: 503 ms, total: 59.9 s
Wall time: 15.1 s


In [4]:
# We are just going to consider the 1-dimension persistence 

# In birth-death
JustDgms_death = list(manifoldData['Dgm1'])
# In birth-lifetime
JustDgms_lifetime = [np.concatenate([[X[:,0]],[X[:,1]-X[:,0]]], axis = 0).T for X in JustDgms_death]

Here is the wasserstein code... there is a hack in here than needs to be fixed where we use sklean all-pairs distance function which is only doing L_2 distance at the moment and so we are not correctly doing L_p distance in our calculations... we need to fix this in the main code base.  Unfortunely, there is no sklearn L_p norms for other $p \ne 1,2,\infty$ so we'll need to write something if we want to do that correctly. 


In [5]:
def wasserstein_diagram_distance(p, pts0, pts1, y_axis='death'):
    '''
    Compute the Persistant p-Wasserstein distance between the diagrams pts0, pts1
    
    y_axis = 'death' (default), or 'lifetime'
    
    '''
    
    if y_axis == 'lifetime':
        extra_dist0 = pts0[:, 1]
        extra_dist1 = pts1[:, 1]
    elif y_axis == 'death':    
        extra_dist0 = (pts0[:, 1]-pts0[:, 0])/np.sqrt(2)
        extra_dist1 = (pts1[:, 1]-pts1[:, 0])/np.sqrt(2)
    else:
        raise ValueError('y_axis must be \'death\' or \'lifetime\'')
        
    pairwise_dist = sklearn.metrics.pairwise_distances(pts0, pts1)
    
    all_pairs_ground_distance_a = np.hstack([pairwise_dist, extra_dist0[:, np.newaxis]])
    extra_row = np.zeros(all_pairs_ground_distance_a.shape[1])
    extra_row[:pairwise_dist.shape[1]] = extra_dist1
    all_pairs_ground_distance_a = np.vstack([all_pairs_ground_distance_a, extra_row])
  
    all_pairs_ground_distance_a = all_pairs_ground_distance_a**p
    
    n0 = pts0.shape[0]
    n1 = pts1.shape[0]
    a = np.ones(n0+1)
    a[n0]=n1
    a = a/a.sum()
    b = np.ones(n1+1)
    b[n1]=n0
    b = b/b.sum()
    
    return np.power((n0+n1)*ot.emd2(a, b, all_pairs_ground_distance_a),1.0/p)


### Modifications to do approximate bottleneck

Here we swich the metric to 'chebychev' which is $L_\infty$.  Also, the transport cost we want to return is the max cost to move any element given the transport plan - not the total cost.  Ideally if this was always a matching this would be the highest cost of an entry in this matrix but there could be mass splitting in the retruned solution in theory so we sum up the total cost to move each element and then take the max of that to fix that issue.   

Now, the optimal transport code is going to minimize total transport cost not the maximal transport cost, but in theory we can now take advantage of the limit and just raise all of the transport costs to the p-th power and find the optimal transport of that, which will basically be forced to minimize the maximal cost as a result.  Using this plan we compute the max cost under the original $L_\infty$ cost matrix without the p-th powers and take the max row sum / col sum of that. 

Lasty, because we are grouping all of the points at infinity together we actually only want to find the maximal cost of moving one of the real points in one of the diagrams (which will be equal if they move to eachother), so we have to take some care to remove the infinite points (the last row/columns) when we are looking for the most costly move. 

In [6]:
def bottleneck_diagram_distance(p, pts0, pts1, y_axis='death'):
    '''
    Compute the Persistant p-Wasserstein distance between the diagrams pts0, pts1
    
    y_axis = 'death' (default), or 'lifetime'
    
    '''
    
    if y_axis == 'lifetime':
        extra_dist0 = pts0[:, 1]
        extra_dist1 = pts1[:, 1]
    elif y_axis == 'death':    
        extra_dist0 = (pts0[:, 1]-pts0[:, 0])/np.sqrt(2)
        extra_dist1 = (pts1[:, 1]-pts1[:, 0])/np.sqrt(2)
    else:
        raise ValueError('y_axis must be \'death\' or \'lifetime\'')
        
    pairwise_dist = sklearn.metrics.pairwise_distances(pts0, pts1, metric='chebyshev')
    
    all_pairs_ground_distance_a = np.hstack([pairwise_dist, extra_dist0[:, np.newaxis]])
    extra_row = np.zeros(all_pairs_ground_distance_a.shape[1])
    extra_row[:pairwise_dist.shape[1]] = extra_dist1
    all_pairs_ground_distance_a = np.vstack([all_pairs_ground_distance_a, extra_row])
  
    all_pairs_ground_distance_ap = np.power(all_pairs_ground_distance_a,p)
    
    n0 = pts0.shape[0]
    n1 = pts1.shape[0]
    a = np.ones(n0+1)
    a[n0]=n1
    a = a/a.sum()
    b = np.ones(n1+1)
    b[n1]=n0
    b = b/b.sum()
    
    T=ot.emd(a, b, all_pairs_ground_distance_ap)
    
    return (n0+n1)*np.max([np.max(np.sum(T[:-1,:]*all_pairs_ground_distance_a[:-1,:],axis=1)),
                            np.max(np.sum(T[:,:-1]*all_pairs_ground_distance_a[:,:-1],axis=0))])



## Now lets see how this converges as we vary p 


In [7]:
def allP(n,p):
    bott_all_pairs_dist = np.zeros((n,n))
    for i in range(n):
        for j in range(i,n):
            bott_all_pairs_dist[i,j] = bottleneck_diagram_distance(p, 
                                                JustDgms_death[i], JustDgms_death[j], y_axis='death')
            bott_all_pairs_dist[j,i] = bott_all_pairs_dist[i,j]
    return bott_all_pairs_dist



In [16]:
%%time
b=[allP(180,i) for i in range(1,20)]

CPU times: user 5min 21s, sys: 1.7 s, total: 5min 22s
Wall time: 5min 23s


In [17]:
[np.min(b[i+1]-b[i]) for i in range(0,18)]

[-0.218944013118744,
 -0.12068563699722856,
 -0.10629329827326195,
 -0.10629329827326195,
 -0.0731831196676862,
 -0.029865720470156232,
 -0.04857542302055895,
 -0.05577355836384701,
 -0.04126838174736036,
 -0.030069558905940652,
 -0.03743430404542425,
 -0.026462428271770477,
 -0.02610932290554016,
 -0.06569533050060225,
 -0.07910273969173434,
 -0.06042257696390152,
 -0.11070293188095093,
 -0.082641676068306]

In [18]:
[np.max(b[i+1]-b[i]) for i in range(0,18)]

[0.039795751986053784,
 0.013971745967861549,
 0.02799259124559783,
 0.001862168312072754,
 0.00010000169277191336,
 0.006368778645992281,
 0.01115706562995911,
 0.02517920732498169,
 0.035367563366889954,
 0.05232001841068257,
 0.06030437350273134,
 0.07400456070899983,
 0.08038553595542905,
 0.09151804447174089,
 0.1251210868358606,
 0.1353657320141792,
 0.15817624330520588,
 0.20555180311203006]

In [19]:
[np.mean(b[i+1]-b[i]) for i in range(0,18)]

[-0.00783353960211826,
 -0.000804872932239556,
 -0.0002584156440280042,
 -0.00011767282682853872,
 -9.995642654779577e-05,
 -1.861798903010159e-05,
 5.171477596679754e-05,
 0.000331783872959272,
 0.0010364353571947415,
 0.0019747717956783545,
 0.0027850399078617845,
 0.003604564659027123,
 0.004625878979598216,
 0.004768521178867411,
 0.005073627228826866,
 0.005163460898828659,
 0.005393664313846954,
 0.00528265475136577]

After about p=7 it seems that numerical precision starts to be an issue since it appears to stop converging around then.  

#### Let's see how it compares to Hera to see if it's converging to the right thing. 

In [20]:
dgmDist_Hera(JustDgms_death[1],JustDgms_death[2])

ValueError: could not convert string to float: '/bin/sh: bottleneck_dist: command not found'

##### I'm not sure how to fix my call to Hera 