In [1]:
_reduce = lambda c: np.mean(c, axis=0)

def correlate(a, b, mode="valid"):
    """
    Return the normalized cross-correlation of two 1-D sequences.

    The shorter sequence is used as the template and slid along the
    longer one. Both sequences are de-meaned, cross-correlated with
    scipy.signal.correlate, and each lag is divided by
    n * std(template) * std(window of the longer sequence), where n is
    the template length.

    Arguments:
    - a: 1-D array-like
        First sequence.
    - b: 1-D array-like
        Second sequence.

    Keyword arguments:
    - mode: str="valid"
        "valid" or "same"; forwarded to scipy.signal.correlate.

    Returns:
    - numpy.ndarray
        Correlation coefficient at every lag.

    Raises:
    - ValueError if mode is neither "valid" nor "same".
    """
    # Fail early with a clear message; previously an unsupported mode
    # surfaced later as a NameError on the undefined normalization.
    if mode not in ("valid", "same"):
        raise ValueError(f'mode must be "valid" or "same", got {mode!r}')

    # Make a the template (shorter sequence).
    if len(a) > len(b):
        a, b = b, a

    a = pd.Series(a)
    b = pd.Series(b)
    n = len(a)

    a = a - np.mean(a)
    b = b - np.mean(b)

    c = scipy.signal.correlate(b.to_numpy(), a.to_numpy(), mode=mode)

    # Both standard deviations must be population estimates (ddof=0).
    # pandas' rolling std defaults to ddof=1, which scaled every
    # coefficient by sqrt((n - 1) / n) so that a perfect match never
    # reached 1.
    if mode == "valid":
        window_std = b.rolling(n).std(ddof=0).dropna().values
    else:
        # Edge windows are truncated; they are still scaled by the full
        # template length n, so coefficients near the edges are approximate.
        window_std = b.rolling(n, min_periods=0, center=True).std(ddof=0).values

    c /= n * a.std(ddof=0) * window_std

    return (c)


def distance(
    obj_a, 
    obj_b, 
    mode="valid", 
    reduce=_reduce, 
    force_triangle_ineq=False
):
    """
    Return the distance between object obj_a and object obj_b.

    The distance is one minus the largest absolute value of the
    (reduced) normalized cross-correlation returned by ndcorrelate.

    Arguments:
    - obj_a: object
        First object to consider.
    - obj_b: object
        Second object to consider.

    Keyword arguments:
    - mode: str="valid"
        Correlation mode forwarded to ndcorrelate.
    - reduce: callable=_reduce
        Reduction forwarded to ndcorrelate.
    - force_triangle_ineq: bool=False
        If truthy, non-zero distances are remapped to (dist + 1) / 2.
        Provided the raw distance lies in [0, 1], the remapped value is
        at least 0.5, so any two remapped distances sum to at least a
        third one. A raw distance of exactly zero is returned as zero.

    Returns:
    - float
        The (optionally remapped) distance.
    """
    peak = np.max(np.abs(ndcorrelate(obj_a, obj_b, mode=mode, reduce=reduce)))
    dist = 1 - peak

    # Truthiness instead of `is True`: numpy booleans and other truthy
    # flags were previously ignored without any warning.
    if not force_triangle_ineq:
        return (dist)

    if dist == 0:
        return (0.0)

    return ((dist + 1) / 2)


def ndcorrelate(a, b, mode="valid", reduce=_reduce):
    """
    Normalized cross-correlation of two N-D arrays along their last axis.

    1-D inputs are passed straight to correlate. For higher-dimensional
    inputs, all leading axes are flattened, each pair of rows is
    correlated along the last axis, and the stacked result of shape
    (n_rows, n_lags) is passed to reduce.

    Arguments:
    - a: array-like
        First array.
    - b: array-like
        Second array; must have the same number of dimensions as a and
        the same shape on every axis but the last.

    Keyword arguments:
    - mode: str="valid"
        "valid" or "same"; forwarded to correlate.
    - reduce: callable=_reduce
        Applied to the (n_rows, n_lags) array of row-wise correlations.

    Returns:
    - The output of correlate for 1-D input, otherwise the output of
      reduce.

    Raises:
    - ValueError if mode is neither "valid" nor "same".
    - AssertionError if the shapes of a and b are incompatible.
    """
    # Validate up front; an unsupported mode used to fail later with a
    # NameError because the output buffer was never allocated.
    if mode not in ("valid", "same"):
        raise ValueError(f'mode must be "valid" or "same", got {mode!r}')

    # Accept any array-like (lists, tuples, ...), not only ndarrays.
    a = np.asarray(a)
    b = np.asarray(b)

    assert a.ndim == b.ndim, "a and b must have the same number of dimensions"

    if a.ndim == 1:
        return (correlate(a, b, mode=mode))

    assert a.shape[:-1] == b.shape[:-1], \
        "a and b must have the same shape on all but the last axis"

    na, nb = a.shape[-1], b.shape[-1]

    # Keep a as the array with the shorter last axis (the template).
    if na > nb:
        a, b = b, a
        na, nb = nb, na

    a = a.reshape(-1, na)
    b = b.reshape(-1, nb)
    n = a.shape[0]

    # "valid" yields one coefficient per full overlap; "same" yields one
    # per sample of the longer row.
    nlags = nb - na + 1 if mode == "valid" else nb
    c = np.zeros((n, nlags))
    for i in range(n):
        c[i] = correlate(a[i], b[i], mode=mode)

    return (reduce(c))

In [4]:
import h5py
import itertools
import multiprocessing as mp
import numpy as np
import pandas as pd
import pathlib
import scipy.signal

class FastMap(object):
    """
    FastMap embedding (Faloutsos and Lin, 1995) with an HDF5 backend.

    Objects are read from Xy["X"], their labels from Xy["y"]. Pivot
    objects, pivot indices and the images of the database objects are
    stored in an HDF5 file at model_path.

    The instance can be used as a context manager; leaving the context
    closes the HDF5 backend.
    """

    def __init__(self, Xy, distance, ndim, model_path):
        """
        Arguments:
        - Xy: mapping
            Must provide the objects under key "X" and their labels
            under key "y" (e.g. an open h5py.File).
        - distance: callable
            distance(obj_a, obj_b) returning the distance between two
            objects of X.
        - ndim: int
            Dimensionality of the embedding.
        - model_path: str or pathlib.Path
            Location of the HDF5 model backend.
        """
        self._X = Xy["X"]
        self._y = Xy["y"]
        self._distance = distance
        # Index of the hyperplane currently being processed; it controls
        # how many coordinates self.distance projects out.
        self._ihyprpln = 0
        self._ndim = ndim
        self._init_hdf5(pathlib.Path(model_path))

    def __enter__(self):
        return (self)

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return (False)

    def close(self):
        """
        Close the HDF5 model backend.
        """
        self._hdf5.close()

    @property
    def hdf5(self):
        """
        HDF5 model backend.
        """

        return (self._hdf5)

    @property
    def ndim(self):
        """
        Dimensionality of embedding.
        """
        return (self._ndim)

    @property
    def pivots(self):
        """
        Pivot objects, shape (ndim, 2, *object_shape). Created on first
        access if missing from the backend.
        """

        if "pivots" not in self.hdf5:
            self.hdf5.create_dataset(
                "pivots",
                (self.ndim, 2, *self.X.shape[1:]),
                self.X.dtype,
                fillvalue=np.nan
            )

        return (self.hdf5["pivots"])

    @property
    def pivot_ids(self):
        """
        Indices of pivot objects, shape (ndim, 2). Created on first
        access if missing from the backend; unset entries are -1.
        """

        if "pivot_ids" not in self.hdf5:
            # A signed 64-bit type holds any row index and allows -1 as
            # the "unset" marker. The former uint16/NaN combination
            # could represent neither NaN nor indices above 65535.
            self.hdf5.create_dataset(
                "pivot_ids",
                (self.ndim, 2),
                np.int64,
                fillvalue=-1
            )

        return (self.hdf5["pivot_ids"])

    @property
    def pivot_labels(self):
        """
        Labels of pivot objects, shape (ndim, 2). Created on first
        access if missing from the backend; unset entries are -1.
        """
        if "pivot_labels" not in self.hdf5:
            self.hdf5.create_dataset(
                "pivot_labels",
                (self.ndim, 2),
                np.int16,
                fillvalue=-1
            )

        return (self.hdf5["pivot_labels"])

    @property
    def W(self):
        """
        Images of the database objects, shape (n_objects, ndim). Created
        on first access if missing from the backend; unset entries are
        NaN.
        """

        if "W" not in self.hdf5:
            self.hdf5.require_dataset(
                "W",
                (self.X.shape[0], self.ndim),
                np.float32,
                exact=True,
                fillvalue=np.nan
            )

        return (self.hdf5["W"])

    @property
    def X(self):
        """
        Database objects.
        """

        return (self._X)

    @property
    def y(self):
        """
        Labels of the database objects.
        """

        return (self._y)

    def _choose_pivots(self):
        """
        A heuristic algorithm to choose distant pivot objects
        (Faloutsos and Lin, 1995).

        A random object with label 1 that has not served as a pivot yet
        seeds the search. The first pivot is the label-0 object furthest
        from the seed; the second pivot is the label-1 object furthest
        from the first pivot.

        Returns:
        - (iobj, jobj): indices of the two pivot objects.

        Raises:
        - ValueError if every object with label 1 is already a pivot.
        """

        positives = np.argwhere(self.y[:] == 1).flatten()
        used = np.asarray(self.pivot_ids[:self._ihyprpln]).flatten()

        # Draw directly from the unused candidates. Rejection sampling
        # never terminated once every label-1 object had been used.
        candidates = np.setdiff1d(positives, used)
        if candidates.size == 0:
            raise ValueError(
                "no unused object with label 1 is left to seed pivot selection"
            )

        jobj = np.random.choice(candidates)
        iobj = self.furthest(jobj, label=0)
        jobj = self.furthest(iobj, label=1)

        return (iobj, jobj)

    def _init_hdf5(self, path):
        """
        Initialize the HDF5 backend to store pivot objects and images
        of training data.

        Arguments:
        - path: pathlib.Path
            The path to the backend. Open as read-only if it already
            exists; as read/write otherwise.
        """
        if path.exists():
            self._hdf5 = h5py.File(path, mode="r")
        else:
            self._hdf5 = h5py.File(path, mode="w")
            # Labels are stored only in a freshly created file; writing
            # to a backend that was opened read-only would fail.
            self._hdf5.create_dataset("y", data=self.y)

        return (True)

    def distance(self, iobj, jobj):
        """
        Return the distance between object at index iobj and object at
        index jobj on the current hyperplane (self._ihyprpln).

        The coordinates already stored in W for the preceding
        hyperplanes are projected out of the base distance.

        Arguments:
        - iobj: int
            Index of first object to consider.
        - jobj: int
            Index of second object to consider.
        """

        dist = self._distance(self.X[iobj], self.X[jobj])

        nproj = self._ihyprpln
        if nproj == 0:
            return (dist)

        delta = self.W[iobj, :nproj] - self.W[jobj, :nproj]
        residual = dist ** 2 - np.sum(np.square(delta, dtype=np.float64))

        # A non-Euclidean base distance or rounding can push the squared
        # residual below zero; clamp it so the result is 0 rather than
        # NaN (NaN would poison argmax in furthest).
        return (np.sqrt(np.maximum(residual, 0)))

    def pembed(self, X):
        """
        Return the embedding (image) of the given objects.

        Requires that embed_database has populated pivot_ids and W.

        Arguments:
        - X: array-like
            Objects to embed; must provide .shape and integer indexing
            along the first axis.

        Returns:
        - numpy.ndarray of shape (X.shape[0], ndim), dtype float32.
        """

        nobj = X.shape[0]
        image = np.zeros((nobj, self.ndim), dtype=np.float32)

        for ihyprpln in range(self.ndim):
            # self.distance reads the current hyperplane from the instance.
            self._ihyprpln = ihyprpln

            ipiv, jpiv = (int(idx) for idx in self.pivot_ids[ihyprpln])
            d_ij = self.distance(ipiv, jpiv)

            # Degenerate hyperplane (coincident pivots): every coordinate
            # is zero, as in the original FastMap algorithm.
            if d_ij == 0:
                continue

            pivot_i = self.X[ipiv]
            pivot_j = self.X[jpiv]

            # Squared base distances from both pivots to each new object.
            d_ik = np.empty(nobj, dtype=np.float64)
            d_jk = np.empty(nobj, dtype=np.float64)
            for k in range(nobj):
                obj = X[k]
                d_ik[k] = self._distance(pivot_i, obj) ** 2
                d_jk[k] = self._distance(pivot_j, obj) ** 2

            # Project out the coordinates found on preceding hyperplanes,
            # using the stored images (W) of the pivots.
            for i in range(ihyprpln):
                d_ik -= np.square(image[:, i] - self.W[ipiv, i])
                d_jk -= np.square(image[:, i] - self.W[jpiv, i])

            d_ik = np.maximum(d_ik, 0)
            d_jk = np.maximum(d_jk, 0)

            image[:, ihyprpln] = (d_ik + d_ij ** 2 - d_jk) / (2 * d_ij)

        return (image)

    def embed_database(self):
        """
        Compute and store the image of every object in the database.

        For every hyperplane, a pair of pivots is chosen and recorded
        (indices, labels and the objects themselves), and the coordinate
        of every database object along the pivot axis is written to the
        corresponding column of W.
        """

        n = self.X.shape[0]
        all_ids = np.arange(n)

        for ihyprpln in range(self.ndim):
            # self.distance reads the current hyperplane from the instance.
            self._ihyprpln = ihyprpln

            ipiv, jpiv = self._choose_pivots()
            self.pivot_ids[ihyprpln] = [ipiv, jpiv]
            self.pivot_labels[ihyprpln] = [self.y[ipiv], self.y[jpiv]]
            self.pivots[ihyprpln, 0] = self.X[ipiv]
            self.pivots[ihyprpln, 1] = self.X[jpiv]
            d_ij = self.distance(ipiv, jpiv)

            # Coincident pivots: the projection is undefined, so the
            # coordinate is set to zero instead of dividing by zero.
            if d_ij == 0:
                self.W[:, ihyprpln] = np.zeros(n, dtype=np.float32)
                continue

            d  = np.square(self.pdist(all_ids, ipiv))
            d -= np.square(self.pdist(all_ids, jpiv))
            d += d_ij ** 2
            d /= (2 * d_ij)
            self.W[:, ihyprpln] = d

        return (True)

    def furthest(self, iobj, label=None):
        """
        Return the index of the object furthest from object with index
        *iobj*.

        Keyword arguments:
        - label: =None
            If given, only objects whose label equals *label* are
            considered.
        """

        if label is None:
            idxs = np.arange(self.y.shape[0])
        else:
            idxs = np.argwhere(self.y[:] == label).flatten()

        return (idxs[np.argmax(self.pdist(iobj, idxs))])

    def pdist(self, iobj, jobj):
        """
        Return self.distance for every pair in the Cartesian product of
        the indices iobj and jobj, computed by a multiprocessing pool.

        Arguments:
        - iobj: int or 1-D array-like of int
        - jobj: int or 1-D array-like of int

        Returns:
        - 1-D numpy.ndarray ordered like itertools.product(iobj, jobj).

        NOTE(review): the instance is handed to the workers as a pool
        initializer argument. h5py objects cannot be pickled, so this
        presumably only works with the "fork" start method, where the
        arguments are inherited rather than pickled - confirm on the
        target platform.
        """

        iobj = np.atleast_1d(iobj)
        jobj = np.atleast_1d(jobj)

        with mp.Pool(initializer=_init_pdist_idx, initargs=(self,)) as pool:
            iterator = itertools.product(iobj, jobj)

            return (np.array(pool.map(_pdist_idx, iterator)))
            

def _init_pdist_idx(fastmap):

    global self
    
    self = fastmap

    
def _pdist_idx(args):
    
    global self
    
    return (self.distance(*args))


# def _init_pdist2(fastmap, _X):

#     global self, X
    
#     self = fastmap
#     X = _X

    
# def _pdist2(args):
    
#     global self, X

#     iobj, jobj = args
#     dist = self._distance(X[iobj], X[jobj])

# #     for i in range(self._ihyprpln):
# #         dist = np.sqrt(dist**2 - (self.W[iobj, i] - self.W[jobj, i])**2)

# #     return (dist)

In [5]:
%%time
path = pathlib.Path("fastmap.hdf5")
# Start from a fresh model file on every run: FastMap opens an existing
# file read-only and could not store a new embedding in it.
path.unlink(missing_ok=True)
with h5py.File("../data/training.hdf5", mode="r") as f5:
# with h5py.File("test_data.hdf5", mode="r") as f5:
    fastmap = FastMap(
        f5,
        lambda a, b: distance(a, b, mode="same"),
        6,
        path
    )
    try:
        fastmap.embed_database()
        with h5py.File("../data/test.hdf5", mode="r") as f5test:
            # FastMap has no `embed` method; `pembed` is the method that
            # embeds new objects.
            W = fastmap.pembed(f5test["X"])
    finally:
        # Flush and release the model file so that later cells can
        # re-open "fastmap.hdf5" for reading.
        fastmap.hdf5.close()

[[-1.6051118  -1.1754749  -0.1552806  ... -2.792238   -1.8458971
  -0.33222544]
 [ 0.6623622   0.09316956 -1.9061961  ... -2.6752884  -3.687672
  -3.2921653 ]
 [-0.29494035  0.08362778 -0.14362009 ...  1.4218868   0.12919605
  -1.7596445 ]]
[[ -0.39624056   0.6508803    3.0646873  ... -20.766155    -6.6349654
    7.6461926 ]
 [ 22.035116    14.49796     -4.7831364  ...   6.4207683    8.497943
    4.1987743 ]
 [  4.5506625   -3.4072106  -10.726917   ...  12.793018    -1.4162449
  -12.795116  ]]


TypeError: h5py objects cannot be pickled

In [None]:
# Read the stored images (W) and labels (y) back from the model file
# into memory.
with h5py.File("fastmap.hdf5", mode="r") as f5:
    W, y = f5["W"][:], f5["y"][:]

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set_theme()

# One column per embedding dimension plus the label, which is used to
# colour the scatter matrix.
dataf = pd.DataFrame(W).assign(target=y)

sns.pairplot(dataf, hue="target")

In [None]:
# Copy the first N objects and labels of the source dataset into a small
# file for quick experiments.
N = 2 ** 12
# The source file is only read, so it is opened read-only ("r") rather
# than "r+": this cannot modify the original data and also works when
# the file is not writable.
with h5py.File("/home/malcolmw/Downloads/scsn_p_2000_2017_6sec_0.5r_fm_test (1).hdf5", mode="r") as f5:
    with h5py.File("test_data.hdf5", mode="w") as f5out:
        f5out.create_dataset("X", data=f5["X"][:N])
        f5out.create_dataset("y", data=f5["y"][:N])