In [1]:
import torch
from gnn_tracking.training.tc import TCModule
from pathlib import Path
from torch.profiler import profile, record_function, ProfilerActivity

# from object_condensation.pytorch.losses import condensation_loss

In [2]:
# --- Checkpoint location -------------------------------------------------
# Root directory of the Lightning runs; fail early if it is not reachable.
chkpt_home = Path(
    "/home/kl5675/Documents/23/git_sync/hyperparameter_optimization2/scripts/pixel/lightning_logs/"
)
assert chkpt_home.is_dir()
# Specific run and persisted checkpoint that is loaded further down.
chkpt_path = chkpt_home.joinpath(
    "vagabond-tasteful-hyrax/checkpoints_persist/epoch=451-step=406800.ckpt"
)
assert chkpt_path.is_file()

# --- Input data location -------------------------------------------------
# Root directory of the point-cloud files; again checked before use.
data_home = Path(
    "/scratch/gpfs/IOJALVO/gnn-tracking/object_condensation/point_clouds_v8/"
)
assert data_home.is_dir()
# Single file used as input for the memory measurements in this notebook.
data_path = data_home.joinpath("part_1", "data21000_s0.pt")
assert data_path.is_file()

In [3]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Rebuild the TCModule from the checkpoint file selected above; `map_location`
# places the restored tensors on `device`.
# NOTE(review): presumably this is the Lightning `load_from_checkpoint`
# classmethod inherited by TCModule — confirm against gnn_tracking.
lmodel = TCModule.load_from_checkpoint(chkpt_path, map_location=device)

[36m[10:07:07] DEBUG: Getting class PreTrainedECGraphTCN from module gnn_tracking.models.track_condensation_networks[0m
  rank_zero_warn(
[36m[10:07:07] DEBUG: Getting class MLGraphConstruction from module gnn_tracking.models.graph_construction[0m
[36m[10:07:07] DEBUG: Getting class GraphConstructionFCNN from module gnn_tracking.models.graph_construction[0m
[36m[10:07:07] DEBUG: Getting class PotentialLoss from module gnn_tracking.metrics.losses[0m
[36m[10:07:07] DEBUG: Getting class DBSCANHyperParamScanner from module gnn_tracking.postprocessing.dbscanscanner[0m


In [5]:
# NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only point `data_path` at files from a trusted source.
data = torch.load(data_path)
# Rebind the result of `.to()`: it returns the object living on `device`, and
# not every type moves itself in place (plain tensors, for instance, do not).
data = data.to(device)
assert data
# Bare network wrapped by the loaded module; used directly for the forward pass.
model = lmodel.model

In [6]:
# Pass the loaded event through the module's `preproc` step; the result `dp`
# is what gets fed to `model(...)` in the profiling cell further down.
# NOTE(review): presumably this is the graph-construction stage mentioned in
# the checkpoint load log — confirm against TCModule.
dp = lmodel.preproc(data)

In [7]:
class MemLogger:
    """Report the change in allocated CUDA memory between successive calls.

    Each call to :meth:`log` reads ``torch.cuda.memory_allocated()``, prints
    the difference to the previous reading together with the new total (both
    in units of 1e9 bytes), and remembers the new total for the next call.
    """

    def __init__(self):
        # Reading (in GB) taken at the previous `log` call; 0 before the first.
        self.mem = 0

    def log(self, desc=""):
        """Print one line with the memory added since the last call.

        Args:
            desc: Label for this checkpoint, left-aligned in a 30 character
                wide column.
        """
        template = "{:<30} added {:>8.2f} GB, total {:>8.2f} GB"
        total_gb = torch.cuda.memory_allocated() / 1e9
        delta_gb = total_gb - self.mem
        print(template.format(desc, delta_gb, total_gb))
        self.mem = total_gb

In [8]:
from torch import Tensor as T
from torch.nn.functional import relu


def condensation_loss(
    *,
    beta: T,
    x: T,
    object_id: T,
    weights: T,
    q_min: float,
    noise_threshold: int,
) -> dict[str, T]:
    """Evaluate the four object-condensation loss terms for one set of hits.

    Args:
        beta: One condensation likelihood per hit (1D). It is passed through
            ``arctanh``, so values are presumably in ``[0, 1)`` — TODO confirm.
        x: Hit coordinates in the learned space, ``n_nodes x n_outdim``.
        object_id: One integer object label per hit (1D).
        weights: One weight per hit (1D), multiplied into both potentials.
        q_min: Offset added to ``arctanh(beta) ** 2`` to obtain the charge.
        noise_threshold: Hits with ``object_id <= noise_threshold`` count as
            noise and are not treated as objects.

    Returns:
        Dictionary of scalar tensors with the keys ``"attractive"``,
        ``"repulsive"``, ``"coward"`` and ``"noise"``. The ``"noise"`` term is
        zero when the input contains no noise hits.

    Raises:
        AssertionError: If no hit is above the noise threshold, or if the
            computed charges contain NaNs.
    """
    # To protect against nan in divisions
    eps = 1e-9

    # x: n_nodes x n_outdim
    not_noise = object_id > noise_threshold
    unique_oids = torch.unique(object_id[not_noise])
    assert len(unique_oids) > 0, "No particles found, cannot evaluate loss"
    n_objects = len(unique_oids)
    # n_nodes x n_pids
    # The nodes in every column correspond to the hits of a single particle and
    # should attract each other
    attractive_mask = object_id.view(-1, 1) == unique_oids.view(1, -1)

    q = torch.arctanh(beta) ** 2 + q_min
    assert not torch.isnan(q).any(), "q contains NaNs"
    # n_objs: index of the highest-charge hit of every object (condensation point)
    alphas = torch.argmax(q.view(-1, 1) * attractive_mask, dim=0)

    # _j means indexed by hits
    # _k means indexed by objects

    # n_objs x n_outdim
    x_k = x[alphas]
    # 1 x n_objs
    q_k = q[alphas].view(1, -1)

    # n_nodes x n_objs: distance of every hit to every condensation point
    dist_j_k = torch.cdist(x, x_k)

    qw_j_k = weights.view(-1, 1) * q.view(-1, 1) * q_k

    # Only foreign hits closer than 1 to a condensation point are repelled
    repulsive_mask = (~attractive_mask) & (dist_j_k < 1)
    # We have to include the hits-per-object normalization factor here, because
    # after applying the mask we only have a 1D tensor anymore
    qw_att_j_k = (qw_j_k / (attractive_mask.sum(dim=0) + eps))[attractive_mask]
    qw_rep_j_k = (qw_j_k / ((~attractive_mask).sum(dim=0) + eps))[repulsive_mask]

    # Attractive potential/loss
    # The masked tensors are 1D, so one sum reduces them directly; reducing
    # right away avoids keeping the large intermediates alive longer than needed.
    v_att_j_k = qw_att_j_k * torch.square(dist_j_k)[attractive_mask]
    v_att = torch.sum(v_att_j_k) / n_objects

    # Repulsive potential/loss
    v_rep_j_k = qw_rep_j_k * (1 - dist_j_k[repulsive_mask])
    v_rep = torch.sum(v_rep_j_k) / n_objects

    l_coward = torch.mean(1 - beta[alphas])

    # The mean of an empty selection is NaN, which would poison the summed
    # loss; an input without noise hits contributes a zero noise term instead.
    noise_beta = beta[~not_noise]
    if noise_beta.numel() > 0:
        l_noise = torch.mean(noise_beta)
    else:
        l_noise = beta.new_zeros(())

    return {
        "attractive": v_att,
        "repulsive": v_rep,
        "coward": l_coward,
        "noise": l_noise,
    }

In [11]:
import sys

# Make the local object_condensation checkout importable so its test helpers
# can be reused. Guarded so that re-running this cell does not keep adding
# duplicate entries to sys.path.
oc_repo = "/home/kl5675/Documents/23/git_sync/object_condensation"
if oc_repo not in sys.path:
    sys.path.append(oc_repo)
from tests.loss_test_cases import generate_test_data
from tests.test_losses_torch import TorchCondensationMockData

# Bind `td` once, to the torch version of the generated mock data, instead of
# reusing the name for two different kinds of object.
td = TorchCondensationMockData.from_numpy(generate_test_data())

# Cross-check the local loss implementation on the mock data.
cl = condensation_loss(
    beta=td.beta.squeeze(),
    x=td.x,
    object_id=td.object_id.squeeze(),
    weights=td.weights.squeeze(),
    q_min=td.q_min,
    noise_threshold=0,
)
cl

{'attractive': tensor(1.7951, dtype=torch.float64),
 'repulsive': tensor(1.9509, dtype=torch.float64),
 'coward': tensor(0.2157, dtype=torch.float64),
 'noise': tensor(0.7748, dtype=torch.float64)}

In [12]:
# Plain-text dump of the loss dictionary; `flush=True` pushes it out of the
# stdout buffer immediately.
print(cl, flush=True)

{'attractive': tensor(1.7951, dtype=torch.float64), 'repulsive': tensor(1.9509, dtype=torch.float64), 'coward': tensor(0.2157, dtype=torch.float64), 'noise': tensor(0.7748, dtype=torch.float64)}


In [14]:
# One full training step (forward, loss, backward, optimizer step) with a
# memory reading after every stage. Statement order matters here: each
# `ml.log` call reports what was allocated since the previous one.
ml = MemLogger()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Baseline reading before any forward pass of this cell has run.
ml.log("empty")
out = model(dp)
ml.log("model evaluated")
# NOTE(review): presumably out["B"] holds the per-hit beta and out["H"] the
# learned coordinates, matching the `beta`/`x` arguments — confirm against the model.
loss = condensation_loss(
    beta=out["B"],
    x=out["H"],
    object_id=data.particle_id,
    q_min=0.1,
    noise_threshold=0,
    # Uniform weights: every hit contributes equally.
    weights=torch.ones_like(data.particle_id),
)
# Unweighted sum of the four loss terms.
total_loss = loss["attractive"] + loss["repulsive"] + loss["noise"] + loss["coward"]
ml.log("loss evaluated")
optimizer.zero_grad()
total_loss.backward()
ml.log("backward done evaluated")
optimizer.step()
ml.log("step done")

empty                          added     0.17 GB, total     0.17 GB
model evaluated                added    14.75 GB, total    14.92 GB
loss evaluated                 added     3.99 GB, total    18.91 GB
backward done evaluated        added   -18.74 GB, total     0.17 GB
step done                      added     0.02 GB, total     0.18 GB


empty                          added     0.14 GB, total     0.14 GB
model evaluated                added    14.76 GB, total    14.90 GB
loss evaluated                 added     3.87 GB, total    18.77 GB
backward done evaluated        added   -18.60 GB, total     0.17 GB
step done                      added     0.02 GB, total     0.18 GB


empty                          added    25.88 GB, total    25.88 GB
model evaluated                added    14.76 GB, total    40.63 GB
loss evaluated                 added    14.18 GB, total    54.82 GB
backward done evaluated        added   -28.93 GB, total    25.88 GB
step done                      added     0.02 GB, total    25.90 GB