### Set autoreloading
This extension will automatically update with any changes to packages in real time

In [1]:
%load_ext autoreload
%autoreload 2

### Import packages
We'll need the `pytorch_lightning` and `nugraph` packages imported in order to train

In [2]:
import os
import sys
sys.path.append(f"{os.environ['HOME']}/nugraph/pynuml")
sys.path.append(f"{os.environ['HOME']}/nugraph/nugraph")
os.environ["NUGRAPH_DIR"] = f"{os.environ['HOME']}/nugraph"
os.environ["NUGRAPH_LOG"] = f"{os.environ['HOME']}/logs"
os.environ["NUGRAPH_DATA"] = f"{os.environ['HOME']}/data"
from pathlib import Path
import nugraph as ng
import pytorch_lightning as pl

### Configure data module
Declare a data module. Depending on where you're working, you should edit the data path below to point to a valid data location.

In [3]:
#nudata = ng.data.H5DataModule()
#nudata = ng.data.H5DataModule(data_path="/exp/dune/data/users/hrazafin/iceberg/merged_run9_hdf5/processed/run9_sg_bg.h5")
nudata = ng.data.H5DataModule(data_path="/exp/dune/data/users/hrazafin/iceberg/run9_hdf5/run9_signal_extra_Sep2125/v5_merge_run9.h5")
#nudata = ng.data.H5DataModule(data_path="/exp/dune/data/users/hrazafin/iceberg/merged_run9_hdf5/sg_backg_inputs/merged/run9_signal_proc.h5")

### Configure network
Declare a model. You can edit the arguments below to change the network configuration.

In [4]:
nugraph = ng.models.NuGraph3(
    in_features=6, #default is 5
    hit_features=128,
    nexus_features=32,
    instance_features=32,
    interaction_features=32,
    semantic_classes=nudata.semantic_classes,
    event_classes=nudata.event_classes,
    num_iters=5,
    event_head=False,
    semantic_head=True,
    filter_head=True,
    vertex_head=False,
    instance_head=True,
    use_checkpointing=True,
    lr=0.001)

### Configure logger and callbacks
Declare a tensorboard logger and define the output directory, so we can monitor network training. Also define a callback so we can monitor learning rate evolution.

In [5]:
name = "test"
logdir = Path(os.environ["NUGRAPH_LOG"])/name
logdir.mkdir(parents=True, exist_ok=True)
logger = pl.loggers.WandbLogger(save_dir=logdir, project="nugraph3", name="test1",
                                log_model="all")
callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval="step"),
    pl.callbacks.ModelCheckpoint(monitor="loss/val", mode="min"),
]

### Declare trainer and run training
First we set the training device. To train with a GPU, pass an integer  otherwise, it defaults to CPU training. We then instantiate a PyTorch Lightning trainer that we'll use for training, and then run the training stage, which iterates over all batches in the train and validation datasets to optimise model parameters, writing output metrics to tensorboard.

In [None]:
accelerator, devices = ng.util.configure_device()
trainer = pl.Trainer(accelerator="gpu",
                     devices=1,
                     max_epochs=80,
                     logger=logger,
                     callbacks=callbacks,
                     )
trainer.fit(nugraph, datamodule=nudata)
trainer.test(datamodule=nudata)

In [6]:
accelerator, devices = ng.util.configure_device()
trainer = pl.Trainer(accelerator=accelerator,
                     devices=devices,
                     max_epochs=1,
                     logger=logger,
                     callbacks=callbacks)
trainer.fit(nugraph, datamodule=nudata)
trainer.test(datamodule=nudata)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mherilala-razafi[0m ([33mherilala-razafi-university-of-cincinnati[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading `train_dataloader` to estimate number of stepping batches.

  | Name             | Type            | Params | Mode 
-------------------------------------------------------------
0 | encoder          | Encoder         | 909    | train
1 | core_net         | NuGraphCore     | 99.7 K | train
2 | semantic_decoder | SemanticDecoder | 1.8 K  | train
3 | filter_decoder   | FilterDecoder   | 130    | train
4 | instance_decoder | InstanceDecoder | 4.3 K  | train
-------------------------------------------------------------
106 K     Trainable params
13        Non-trainable params
106 K     Total params
0.427     Total estimated model params size (MB)
73        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

TypeError: Caught TypeError in DataLoader worker process 1.
Original Traceback (most recent call last):
  File "/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/torch_geometric/data/dataset.py", line 291, in __getitem__
    data = self.get(self.indices()[idx])
  File "/home/hrazafin/nugraph/nugraph/nugraph/data/dataset.py", line 29, in get
    return NuGraphData.load(self.file[key])
  File "/home/hrazafin/.conda/envs/numl0725/lib/python3.10/site-packages/pynuml/data/nugraph_data.py", line 91, in load
    for dataset in group.dtype.names:
TypeError: 'NoneType' object is not iterable


