# Application Classification 
https://gunrock.github.io/docs/#/hive/hive_application_classification
```bash
git clone git@github.com:owensgroup/application_classification.git cub
git clone git@gitlab.hiveprogram.com:pnnl/ApplicationClassification.git pnnl
git clone git@gitlab.hiveprogram.com:wcude/ApplicationClassification.git wcude  # same as pnnl
git clone git@gitlab.hiveprogram.com:jcromano/applicationClassification.git spark
```
This notebook is based on `test.py` from `cub`.  We have both NumPy and GraphBLAS implementations side-by-side.

## Init

In [None]:
import grblas as gb
from grblas import *
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

# Load the vertex and edge tables of the data graph and the pattern graph.
# Every file is space-separated; its first line is skipped and the columns are
# left unnamed, so they carry the integer labels 0..k-1.
data_vertex_df    = pd.read_csv('./data/georgiyData.Vertex.csv', skiprows=1, sep=' ', header=None)
pattern_vertex_df = pd.read_csv('./data/georgiyPattern.Vertex.csv', skiprows=1, sep=' ', header=None)
data_edges_df     = pd.read_csv('./data/georgiyData.Edges.csv', skiprows=1, sep=' ', header=None)
pattern_edges_df  = pd.read_csv('./data/georgiyPattern.Edges.csv', skiprows=1, sep=' ', header=None)

# Column 0 of each vertex table must equal the row position, so that a vertex
# id can be used directly as a row index once that column is dropped.
assert (data_vertex_df[0] == data_vertex_df.index).all()
assert (pattern_vertex_df[0] == pattern_vertex_df.index).all()

# Vertex features: everything after column 0.
data_vertex    = data_vertex_df.values[:, 1:]
pattern_vertex = pattern_vertex_df.values[:, 1:]

# Edge tables: columns 0 and 1 are the edge endpoints, the remaining columns
# are the per-edge features.
data_edges_table    = data_edges_df.iloc[:, 2:].to_numpy()
data_edges          = data_edges_df.iloc[:, :2].to_numpy()
pattern_edges_table = pattern_edges_df.iloc[:, 2:].to_numpy()
pattern_edges       = pattern_edges_df.iloc[:, :2].to_numpy()

# Graph sizes: number of data/pattern vertices and data/pattern edges.
num_dv, num_pv = data_vertex.shape[0], pattern_vertex.shape[0]
num_de, num_pe = data_edges.shape[0], pattern_edges.shape[0]

# NOTE(review): `pattern_edges` now holds only the two endpoint columns, so
# this is always 2. If the edge-feature width was intended it would be
# `pattern_edges_table.shape[1]` -- confirm. `edge_dim` is not used in the
# cells visible here.
edge_dim = pattern_edges.shape[1]

In [None]:
# Sort the data edges lexicographically by (column 0, column 1) and apply the
# SAME permutation to the per-edge feature rows, so that row i of
# `data_edges_table` still describes edge i of `data_edges`.
# The GraphBLAS cells identify a data edge by its position in
# `data_gb.to_values()`. NOTE(review): that presumably relies on to_values()
# returning entries in this same row-major order -- confirm.
edge_order = np.lexsort((data_edges[:, 1], data_edges[:, 0]))  # the last key is the primary key
data_edges_table = data_edges_table[edge_order]
data_edges = data_edges[edge_order]

In [None]:
def isclose(gb_x, np_y):
    """Compare a GraphBLAS object with a NumPy reference value.

    When ``gb_x`` is a ``Vector`` the reference is first imported with
    ``Vector.ss.import_full``; when it is a ``Matrix`` it is imported with
    ``Matrix.ss.import_fullr``. Any other ``gb_x`` (e.g. a Scalar) is compared
    against ``np_y`` as given. Returns the result of ``gb_x.isclose(...)``.
    """
    if isinstance(gb_x, Vector):
        expected = Vector.ss.import_full(np_y)
    elif isinstance(gb_x, Matrix):
        expected = Matrix.ss.import_fullr(np_y)
    else:  # Scalar
        expected = np_y
    return gb_x.isclose(expected)

In [None]:
# GraphBLAS counterparts of the NumPy inputs, each imported as a full matrix
# through `Matrix.ss.import_fullr`.
data_vertex_gb = Matrix.ss.import_fullr(data_vertex)
data_edges_table_gb = Matrix.ss.import_fullr(data_edges_table)
data_edges_gb = Matrix.ss.import_fullr(data_edges)
pattern_vertex_gb = Matrix.ss.import_fullr(pattern_vertex)
pattern_edges_table_gb = Matrix.ss.import_fullr(pattern_edges_table)
pattern_edges_gb = Matrix.ss.import_fullr(pattern_edges)

In [None]:
# Data graph as a GraphBLAS matrix: value 1 at (row = column 0, col = column 1)
# of every row of `data_edges`. The bare name below displays it.
data_gb = Matrix.from_values(data_edges[:, 0], data_edges[:, 1], 1)
data_gb

In [None]:
# Pattern graph as a GraphBLAS matrix: value 1 at (row = column 0, col = column 1)
# of every row of `pattern_edges`. The bare name below displays it.
pattern_gb = Matrix.from_values(pattern_edges[:, 0], pattern_edges[:, 1], 1)
pattern_gb

In [None]:
def normprob(x):
    """Column-wise log-softmax.

    Parameters
    ----------
    x : np.ndarray
        Scores. Normalisation runs along axis 0, so each column is handled
        independently.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``x`` whose exponentials sum to 1 along
        axis 0.
    """
    # Shift by the column maximum so that exp() cannot overflow.
    shifted = x - x.max(axis=0, keepdims=True)
    # Subtract the log-normaliser instead of taking log(exp(.) / sum): the
    # latter returns -inf for entries whose exp() underflows to zero.
    return shifted - np.log(np.exp(shifted).sum(axis=0, keepdims=True))

def l2_norm(x):
    """Return the square root of the sum of all squared entries of ``x``."""
    squared = x * x
    return np.sqrt(squared.sum())

In [None]:
def normprob_gb(x):
    """GraphBLAS counterpart of ``normprob``.

    First subtracts the column-wise ``op.max`` reduction from ``x``, then
    subtracts the column-wise ``agg.logaddexp`` reduction of that shifted
    matrix. Both subtractions are written as an ``any_minus`` product with a
    diagonal matrix built by ``ss.diag``. Returns the result of ``.new()``.
    """
    column_max = x.reduce_columnwise(op.max)
    shifted = op.any_minus(x @ ss.diag(column_max))
    log_normaliser = shifted.reduce_columnwise(agg.logaddexp)
    return op.any_minus(shifted @ ss.diag(log_normaliser)).new()

## Vertex similarity

In [None]:
# Sanity check: the GraphBLAS column normalisation agrees with the NumPy one.
assert isclose(normprob_gb(data_vertex_gb), normprob(data_vertex))

In [None]:
# Sanity check: the `agg.L2norm` scalar reduction agrees with the NumPy `l2_norm`.
assert isclose(data_vertex_gb.reduce_scalar(agg.L2norm).new(), l2_norm(data_vertex))

```C
Init_CV_MU(Data_Graph, Pattern_Graph, WA.CV, WA.MU);  // NodePairwiseNorm
```

In [None]:
# Pairwise distances (scipy `cdist`, default metric) between every data-vertex
# feature row and every pattern-vertex feature row; one row per data vertex,
# one column per pattern vertex. The bare name below displays it.
cv = cdist(data_vertex, pattern_vertex)
cv

In [None]:
def cdist_gb(X, Y):
    """GraphBLAS counterpart of ``cdist(X, Y)`` between the rows of ``X`` and ``Y``.

    Expands the squared distance as ``|x|^2 - 2 x.y + |y|^2`` and takes the
    square root at the end. The row norms are added through ``any_plus``
    products with diagonal matrices built by ``ss.diag``.

    This is not the most numerically stable algorithm!
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html#sklearn.metrics.pairwise.euclidean_distances
    """
    x_sq_norms = X.reduce_rows(agg.sum_of_squares)
    y_sq_norms = Y.reduce_rows(agg.sum_of_squares)
    # Y is smallest in this notebook, so the `-2` factor is applied to it.
    scaled_y = -2 * Y
    cross = X @ scaled_y.T
    with_x_norms = op.any_plus(ss.diag(x_sq_norms) @ cross)
    squared_dist = op.any_plus(with_x_norms @ ss.diag(y_sq_norms))
    return op.sqrt(squared_dist).new()

In [None]:
# GraphBLAS vertex-distance matrix, checked against the scipy result `cv`.
cv_gb = cdist_gb(data_vertex_gb, pattern_vertex_gb)
assert isclose(cv_gb, cv)

```C
NormProb(DV, PV, WA.CV);  // ColumnSoftmax
NormProb(DV, PV, WA.MU);  // ColumnSoftmax
```

In [None]:
# mu: column-normalised negated distances; cv: column-normalised distances.
# NOTE: `cv` is rebound to its normalised form here, so re-running this cell on
# its own normalises an already-normalised matrix and changes both results.
mu = normprob(-cv)
cv = normprob(cv)

In [None]:
# GraphBLAS counterparts of `mu` and the normalised `cv`.
# NOTE: `cv_gb` is rebound here, so this cell is not idempotent when re-run alone.
mu_gb = normprob_gb(-cv_gb)
cv_gb = normprob_gb(cv_gb)

In [None]:
# The GraphBLAS and NumPy normalised matrices must agree.
assert isclose(mu_gb, mu)
assert isclose(cv_gb, cv)

Maybe?
```C
Init_VR_VF(Data_Graph, Pattern_Graph, WA.MU, WA.VR, WA.VF);  // RepeatColumnsByPatternEdges
```

Maybe?
```C
VFmax_VRmax(Data_Graph, Pattern_Graph, WA.VF, WA.VR, WA.VFmax, WA.VRmax);  // ColumnMax
```

In [None]:
# Column maximum of `mu`: one value per pattern vertex, taken over all data vertices.
mu_max = mu.max(axis=0)

In [None]:
# GraphBLAS column maximum of `mu_gb`, checked against the NumPy `mu_max`.
mu_max_gb = mu_gb.reduce_columnwise(op.max).new()
assert isclose(mu_max_gb, mu_max)

In [None]:
# Gather `mu_max` onto the pattern edges: for edge i = (src, dst),
# v_bak_max[i] is the value at the source vertex and v_fwd_max[i] is the value
# at the destination vertex.
v_bak_max = mu_max[pattern_edges[:, 0]]
v_fwd_max = mu_max[pattern_edges[:, 1]]

In [None]:
# Place `mu_max` of the source vertex on every pattern edge (same sparsity as
# `pattern_gb`). The assert compares it with a matrix holding the NumPy
# `v_bak_max` values at the same (src, dst) positions.
v_bak_max_graph_gb = op.any_first(ss.diag(mu_max_gb) @ pattern_gb).new()
assert v_bak_max_graph_gb.isclose(Matrix.from_values(pattern_edges[:, 0], pattern_edges[:, 1], v_bak_max))

In [None]:
# Flatten the per-edge values into a full vector with one entry per pattern edge.
# NOTE(review): assumes to_values() yields the edges in the same order as the
# rows of `pattern_edges`; the assert below checks the outcome.
v_bak_max_graph_gb.wait()
_, _, v_bak_max_gb = v_bak_max_graph_gb.to_values()
v_bak_max_gb = Vector.ss.import_full(v_bak_max_gb)
assert isclose(v_bak_max_gb, v_bak_max)

In [None]:
# Place `mu_max` of the destination vertex on every pattern edge (same sparsity
# as `pattern_gb`), then verify it against the NumPy `v_fwd_max` values laid out
# at the same (src, dst) positions. The comparison is asserted, like the
# backward counterpart, so that a mismatch fails the run instead of merely
# being displayed.
v_fwd_max_graph_gb = op.any_second(pattern_gb @ ss.diag(mu_max_gb)).new()
assert v_fwd_max_graph_gb.isclose(Matrix.from_values(pattern_edges[:, 0], pattern_edges[:, 1], v_fwd_max))

In [None]:
# Flatten the per-edge values into a full vector with one entry per pattern edge.
# NOTE(review): assumes to_values() yields the edges in the same order as the
# rows of `pattern_edges`; the assert below checks the outcome.
v_fwd_max_graph_gb.wait()
_, _, v_fwd_max_gb = v_fwd_max_graph_gb.to_values()
v_fwd_max_gb = Vector.ss.import_full(v_fwd_max_gb)
assert isclose(v_fwd_max_gb, v_fwd_max)

## Edge similarity

```C
Init_CE_RE_FE(Data_Graph, Pattern_Graph, WA.CE, WA.RE, WA.FE);  // EdgePairwiseNorm
NormProb(DE, PE, WA.CE);  // ColumnSoftmax
NormProb(DE, PE, WA.RE);  // ColumnSoftmax
NormProb(DE, PE, WA.FE);  // ColumnSoftmax
```

In [None]:
# Pairwise distances between data-edge and pattern-edge feature rows
# (one row per data edge, one column per pattern edge).
# xe: column-normalised negated distances; ce: column-normalised distances.
# NOTE: `ce` is rebound to its normalised form on the last line.
ce = cdist(data_edges_table, pattern_edges_table)
xe = normprob(-ce)
ce = normprob(ce)

In [None]:
# GraphBLAS counterparts of `ce` and `xe`, built with the same three steps.
ce_gb = cdist_gb(data_edges_table_gb, pattern_edges_table_gb)
xe_gb = normprob_gb(-ce_gb)
ce_gb = normprob_gb(ce_gb)

In [None]:
# The GraphBLAS and NumPy edge-similarity matrices must agree.
assert isclose(ce_gb, ce)
assert isclose(xe_gb, xe)

## Combine

```C
Init_Cnull(Data_Graph, Pattern_Graph, WA.CE, WA.Cnull);  // ???
NormProb(1, PE, WA.Cnull);  // ColumnSoftmax
```

In [None]:
# >>
# cnull = np.sqrt((pattern_edges_table ** 2).sum(axis=-1))
# cnull = np.maximum(cnull, ce.max(axis=0))
# cnull = normprob(cnull)
# --
cnull = np.zeros(num_pe) # bug in code?
# <<

In [None]:
# GraphBLAS counterpart of `cnull`: a float vector of length num_pe assigned the value 0.
cnull_gb = Vector.new(float, size=num_pe)
cnull_gb << 0

```C
FMax(Data_Graph, Pattern_Graph, WA.Cnull, WA.VRmax, WA.FE, WA.FMax);  // EdgeMaxReduce
RMax(Data_Graph, Pattern_Graph, WA.Cnull, WA.VFmax, WA.RE, WA.RMax);  // EdgeMaxReduce
```

In [None]:
# Per (data vertex, pattern edge) maxima of the edge scores `xe`.
#   fwd_max[v]: max over data edges whose destination is v, floored by v_bak_max
#   bak_max[v]: max over data edges whose source is v, floored by v_fwd_max
# Rows of vertices that never occur as the relevant endpoint stay at zero.
# (An if-free alternative would start every row at the floor instead, i.e. tile
# the floor vector once per data vertex -- num_dv rows.)
# To run the min-based sanity check in the disabled cell further down, swap
# np.maximum for np.minimum in this loop.
fwd_max = np.zeros((num_dv, num_pe))
bak_max = np.zeros((num_dv, num_pe))

fwd_touched = set()
bak_touched = set()
for edge_idx, (src, dst) in enumerate(data_edges):
    edge_scores = xe[edge_idx]

    # First edge into `dst` is compared with the floor, later ones with the running row.
    fwd_base = fwd_max[dst] if dst in fwd_touched else v_bak_max
    fwd_max[dst] = np.maximum(fwd_base, edge_scores)
    fwd_touched.add(dst)

    # Same scheme for edges leaving `src`.
    bak_base = bak_max[src] if src in bak_touched else v_fwd_max
    bak_max[src] = np.maximum(bak_base, edge_scores)
    bak_touched.add(src)

In [None]:
# Row (source) and column (destination) indices of the stored data edges; the values are discarded.
dr, dc, _ = data_gb.to_values()

In [None]:
# data_fwd_graph has a 1 at (dc[k], k): it links each destination vertex to the
# positions k of the data edges that end there.
# Step 1 reduces the `xe_gb` rows of those edges per vertex (max_second semiring);
# step 2 combines each column with the matching entry of `v_bak_max_gb` via any_max.
# The assert checks the result against the NumPy loop.
data_fwd_graph = Matrix.from_values(dc, np.arange(data_gb.nvals), 1)
fwd_max_gb = op.max_second(data_fwd_graph @ xe_gb).new()
fwd_max_gb = op.any_max(fwd_max_gb @ ss.diag(v_bak_max_gb)).new()
assert isclose(fwd_max_gb, fwd_max)

In [None]:
# data_bak_graph has a 1 at (dr[k], k): it links each source vertex to the
# positions k of the data edges that start there.
# Step 1 reduces the `xe_gb` rows of those edges per vertex (max_second semiring);
# step 2 combines each column with the matching entry of `v_fwd_max_gb` via any_max.
# The assert checks the result against the NumPy loop.
data_bak_graph = Matrix.from_values(dr, np.arange(data_gb.nvals), 1)
bak_max_gb = op.max_second(data_bak_graph @ xe_gb).new()
bak_max_gb = op.any_max(bak_max_gb @ ss.diag(v_fwd_max_gb)).new()
assert isclose(bak_max_gb, bak_max)

In [None]:
if False:
    # Disabled cross-check: only meaningful after the NumPy loop that fills
    # fwd_max / bak_max has been run with np.minimum instead of np.maximum.
    # It verifies that the min-based GraphBLAS formulation visits the same
    # (data edge, pattern edge) pairs as that loop.
    fwd_max_gb = op.min_second(data_fwd_graph @ xe_gb).new()
    fwd_max_gb = op.any_min(fwd_max_gb @ ss.diag(v_bak_max_gb)).new()
    bak_max_gb = op.min_second(data_bak_graph @ xe_gb).new()
    bak_max_gb = op.any_min(bak_max_gb @ ss.diag(v_fwd_max_gb)).new()
    assert isclose(fwd_max_gb, fwd_max)
    assert isclose(bak_max_gb, bak_max)

### Loop

```C
VF_VR(Data_Graph, Pattern_Graph, WA.MU, WA.FMax, WA.RMax, WA.VF, WA.VR);  // RepeatColumnsByPatternEdgesSubtract
VFmax_VRmax(Data_Graph, Pattern_Graph, WA.VF, WA.VR, WA.VFmax, WA.VRmax); // ColumnMax
FE_RE(Data_Graph, Pattern_Graph, WA.CE, WA.VF, WA.VR, WA.FE, WA.RE);      // RepeatColumnsByDataEdges
NormProb(DE, PE, WA.FE);                                                  // ColumnSoftmax
NormProb(DE, PE, WA.RE);                                                  // ColumnSoftmax
FMax(Data_Graph, Pattern_Graph, WA.Cnull, WA.VRmax, WA.FE, WA.FMax);      // EdgeMaxReduce
RMax(Data_Graph, Pattern_Graph, WA.Cnull, WA.VFmax, WA.RE, WA.RMax);      // EdgeMaxReduce
MU(Data_Graph, Pattern_Graph, WA.CV, WA.FMax, WA.RMax, WA.MU);            // ComputeMU
NormProb(DV, PV, WA.MU);                                                  // ColumnSoftmax
```

In [None]:
# Work arrays with one row per data vertex and one column per pattern edge;
# they are filled column by column in the cell that loops over `pattern_edges`.
v_fwd = np.zeros((num_dv, num_pe))
v_bak = np.zeros((num_dv, num_pe))

In [None]:
# Row (source) and column (destination) indices of the stored pattern edges; the values are discarded.
pattern_gb.wait()
pr, pc, _ = pattern_gb.to_values()

In [None]:
# for _ in range(num_pv):

In [None]:
# For pattern edge j = (src, dst):
#   v_fwd[:, j] = mu[:, dst] - fwd_max[:, j]
#   v_bak[:, j] = mu[:, src] - bak_max[:, j]
# All columns are gathered at once and written into the existing arrays in place.
pattern_src = pattern_edges[:, 0]
pattern_dst = pattern_edges[:, 1]
v_fwd[:] = mu[:, pattern_dst] - fwd_max
v_bak[:] = mu[:, pattern_src] - bak_max

In [None]:
# pattern_fwd_graph has a 1 at (pc[j], j): it selects, for pattern edge j, the
# `mu_gb` column of its destination vertex. `fwd_max_gb` is then subtracted
# element-wise. The assert checks the result against the NumPy `v_fwd`.
pattern_fwd_graph = Matrix.from_values(pc, np.arange(pattern_gb.nvals), 1)
v_fwd_gb = op.any_first(mu_gb @ pattern_fwd_graph).new()
v_fwd_gb = op.minus(v_fwd_gb & fwd_max_gb).new()
assert isclose(v_fwd_gb, v_fwd)

# Same construction with the source vertex (pr) and `bak_max_gb`.
pattern_bak_graph = Matrix.from_values(pr, np.arange(pattern_gb.nvals), 1)
v_bak_gb = op.any_first(mu_gb @ pattern_bak_graph).new()
v_bak_gb = op.minus(v_bak_gb & bak_max_gb).new()
assert isclose(v_bak_gb, v_bak)

Maybe?
```C
VFmax_VRmax(Data_Graph, Pattern_Graph, WA.VF, WA.VR, WA.VFmax, WA.VRmax); // ColumnMax
```

In [None]:
# Column maxima: one value per pattern edge, taken over all data vertices.
# These rebind the `v_fwd_max` / `v_bak_max` computed before the loop section.
v_fwd_max = v_fwd.max(axis=0)
v_bak_max = v_bak.max(axis=0)

In [None]:
# GraphBLAS column maxima, each checked against its NumPy counterpart.
v_fwd_max_gb = v_fwd_gb.reduce_columnwise(op.max).new()
assert isclose(v_fwd_max_gb, v_fwd_max)

v_bak_max_gb = v_bak_gb.reduce_columnwise(op.max).new()
assert isclose(v_bak_max_gb, v_bak_max)

In [None]:
# One row per data edge: gather the vertex-level rows by the edge's column-0
# (source) vertex and subtract the normalised edge distances `ce`.
# NOTE(review): both lines index by the source column; confirm that `e_fwd` is
# not meant to gather by the destination column (data_edges[:,1]). The
# GraphBLAS cell uses `data_bak_graph` for both as well.
e_bak = v_fwd[data_edges[:,0]] - ce
e_fwd = v_bak[data_edges[:,0]] - ce

In [None]:
# GraphBLAS counterpart: `data_bak_graph.T` selects, for each data edge, the
# row of its source vertex; `ce_gb` is then subtracted element-wise.
# Each result is checked against the NumPy array of the same name.
e_bak_gb = op.any_second(data_bak_graph.T @ v_fwd_gb).new()
e_bak_gb = op.minus(e_bak_gb & ce_gb).new()
assert isclose(e_bak_gb, e_bak)

e_fwd_gb = op.any_second(data_bak_graph.T @ v_bak_gb).new()
e_fwd_gb = op.minus(e_fwd_gb & ce_gb).new()
assert isclose(e_fwd_gb, e_fwd)

In [None]:
# Column-wise log-sum-exp over the data edges, kept as a 1 x num_pe row
# (keepdims) so it broadcasts against num_dv x num_pe arrays later.
# Unlike `normprob`, no column maximum is subtracted before exponentiating.
e_bak_norm = np.log(np.exp(e_bak).sum(axis=0, keepdims=True))
e_fwd_norm = np.log(np.exp(e_fwd).sum(axis=0, keepdims=True))

In [None]:
# Column-wise logaddexp reduction of the edge scores, compared with the single
# row of the NumPy log-sum-exp result.
# Open question from the original author: why doesn't this use normprob_gb?
e_bak_norm_gb = e_bak_gb.reduce_columnwise(op.numpy.logaddexp).new()
assert isclose(e_bak_norm_gb, e_bak_norm[0, :])

e_fwd_norm_gb = e_fwd_gb.reduce_columnwise(op.numpy.logaddexp).new()
assert isclose(e_fwd_norm_gb, e_fwd_norm[0, :])

In [None]:
# Per-vertex running maxima of the edge scores, starting from -inf
# (shape num_dv x num_pe).
fwd_max = np.full((num_dv, num_pe), -np.inf)
bak_max = np.full((num_dv, num_pe), -np.inf)

# Stable ordering of the edges by source vertex.
# NOTE(review): row k of e_bak / e_fwd is paired with the k-th edge of this
# ordering, which equals the edges' own row order only when `data_edges` is
# already sorted by source -- confirm that this is guaranteed upstream.
sel = np.argsort(data_edges[:, 0], kind='mergesort')
edges_by_src = data_edges[sel]

# Unbuffered element-wise maximum: bak_max is reduced per source vertex,
# fwd_max per destination vertex.
np.maximum.at(bak_max, edges_by_src[:, 0], e_bak)
np.maximum.at(fwd_max, edges_by_src[:, 1], e_fwd)

In [None]:
# GraphBLAS counterpart: per-vertex maximum of the edge scores, grouped by
# destination (data_fwd_graph) and by source (data_bak_graph).
# Each result is checked against the NumPy array of the same name.
fwd_max_gb = op.max_second(data_fwd_graph @ e_fwd_gb).new()
assert isclose(fwd_max_gb, fwd_max)

bak_max_gb = op.max_second(data_bak_graph @ e_bak_gb).new()
assert isclose(bak_max_gb, bak_max)

In [None]:
# Subtract the per-pattern-edge log-normalisers (broadcast over the rows).
# NOTE: this mutates the arrays in place, so re-running this cell alone
# subtracts the normalisers a second time.
fwd_max -= e_fwd_norm
bak_max -= e_bak_norm

In [None]:
# GraphBLAS counterpart: subtract the normaliser of each column, written as an
# any_minus product with a diagonal matrix. Checked against the NumPy arrays.
fwd_max_gb = op.any_minus(fwd_max_gb @ ss.diag(e_fwd_norm_gb)).new()
assert isclose(fwd_max_gb, fwd_max)

bak_max_gb = op.any_minus(bak_max_gb @ ss.diag(e_bak_norm_gb)).new()
assert isclose(bak_max_gb, bak_max)

In [None]:
# Floor every column at (vertex-level maximum - cnull) of its pattern edge;
# the 1 x num_pe floor row is broadcast over all data vertices.
# fwd_max is floored with v_bak_max and bak_max with v_fwd_max.
fwd_max = np.maximum(fwd_max, (v_bak_max - cnull).reshape(1, -1))
bak_max = np.maximum(bak_max, (v_fwd_max - cnull).reshape(1, -1))

In [None]:
# GraphBLAS counterpart: the floor vector (maximum minus cnull) is put on a
# diagonal and combined column-wise via the any_max semiring.
# Each result is checked against the NumPy array of the same name.
fwd_max_gb = op.any_max(fwd_max_gb @ ss.diag(op.minus(v_bak_max_gb & cnull_gb))).new()
assert isclose(fwd_max_gb, fwd_max)

bak_max_gb = op.any_max(bak_max_gb @ ss.diag(op.minus(v_fwd_max_gb & cnull_gb))).new()
assert isclose(bak_max_gb, bak_max)

```C
MU(Data_Graph, Pattern_Graph, WA.CV, WA.FMax, WA.RMax, WA.MU);  // ComputeMU
NormProb(DV, PV, WA.MU);  // ColumnSoftmax
```

In [None]:
# Recompute mu: start from the negated (normalised) vertex distances. `-cv`
# creates a new array, so `cv` itself is not modified by the += below.
mu = -cv
# Every pattern edge adds its fwd_max column to the column of its destination
# vertex and its bak_max column to the column of its source vertex.
for p_edge_idx, (src, dst) in enumerate(pattern_edges):
    mu[:,dst] += fwd_max[:,p_edge_idx]
    mu[:,src] += bak_max[:,p_edge_idx]
# Column-normalise the accumulated scores.
mu = normprob(mu)

In [None]:
mu_gb = op.ainv(cv_gb).new()
mu_gb(op.plus) << op.plus_plus(fwd_max_gb @ pattern_fwd_graph.T)
mu_gb(op.plus) << op.plus_plus(bak_max_gb @ pattern_bak_graph.T)
mu_gb = normprob_gb(mu_gb)
assert isclose(mu_gb, mu)