In [2]:
from __future__ import annotations

from pathlib import Path

import jmp.config as JC
import nshconfig_extra as CE
import nshtrainer as nt
import nshutils as nu

# Shared directory used both as the project root for the training config
# (see `with_project_root_` below) and as the runner's working directory in
# the launch cells.
cwd = Path("/net/csefiles/coc-fung-cluster/nima/shared/experiment-data/")
# Environment variables handed to the runner for the launched jobs.
# HF_HOME points at a shared Hugging Face directory — presumably so dataset
# caches live on shared storage instead of per-user home dirs; confirm.
env = {
    "HF_HOME": "/net/csefiles/coc-fung-cluster/nima/shared/cache/huggingface",
}

# Start from a draft config; fields are filled in below and the draft is
# converted with `finalize()` at the end of this block.
config = JC.Config.draft()

# Run name and project used to identify this experiment.
config.name = "jmp-s"
config.project = "mptrj-alex-omat24"

# Pretrained checkpoint to start from (path on the shared filesystem).
config.pretrained_ckpt = CE.CachedPath(
    uri="/net/csefiles/coc-fung-cluster/nima/shared/checkpoints/jmp-s.pt"
)
# Graph construction settings: one constant cutoff for every interaction type
# and a separate neighbor cap per interaction type (main/aeaint/aint/qint).
# NOTE(review): the run log reports "[RadialBasis] Using absolute cutoff of
# 12.0 Angstroms" while the graph cutoff here is 8.0 — confirm this mismatch
# is intended (e.g. the basis cutoff coming from the pretrained backbone).
config.graph_computer = JC.GraphComputerConfig(
    cutoffs=JC.CutoffsConfig.from_constant(8.0),
    max_neighbors=JC.MaxNeighborsConfig(main=20, aeaint=20, aint=1000, qint=8),
    pbc=True,
    per_graph_radius_graph=True,
)

# Optimization and learning rate scheduling
config.optimization = JC.OptimizationConfig.draft()
# AdamW with a base learning rate of 8e-5 and weight decay of 1e-3.
config.optimization.optimizer = nt.config.AdamWConfig(lr=8.0e-5, weight_decay=0.001)
# Different LR multipliers for the backbone (0.25) and everything else (1.0)
# — presumably applied on top of the base lr above; verify in jmp.
config.optimization.separate_lr_multiplier = JC.SeparateLRMultiplierConfig(
    backbone_multiplier=0.25, rest_multiplier=1.0
)
# Linear warmup followed by cosine decay, with both durations given in steps:
# 5,000 warmup steps and 1,000,000 steps in total.
# The *_factor values are presumably fractions of the base lr — TODO confirm.
config.optimization.lr_scheduler = nt.config.LinearWarmupCosineDecayLRSchedulerConfig(
    warmup_duration=nt.config.StepsConfig(value=5000),
    warmup_start_lr_factor=0.001,
    max_duration=nt.config.StepsConfig(value=1_000_000),
    min_lr_factor=0.1,
)

# Heads
# One target config each for energy, forces and stress, plus the coefficient
# attached to each target's loss (1 : 10 : 100).
config.targets = JC.TargetsConfig.draft()
config.targets.energy = JC.EnergyTargetConfig(max_atomic_number=120)
config.targets.force = JC.ForceTargetConfig()
config.targets.stress = JC.StressTargetConfig(num_layers=5)
config.targets.energy_loss_coefficient = 1.0
config.targets.force_loss_coefficient = 10.0
config.targets.stress_loss_coefficient = 100.0

# Trainer settings: mixed-precision mode, float32 matmul precision
# (presumably forwarded to torch.set_float32_matmul_precision — confirm),
# gradient-norm logging, and gradient clipping by norm at 100.0.
config.trainer.precision = "16-mixed-auto"
config.trainer.set_float32_matmul_precision = "medium"
config.trainer.optimizer.log_grad_norm = True
config.trainer.optimizer.gradient_clipping = nt.config.GradientClippingConfig(
    value=100.0, algorithm="norm"
)

# Point the config at the shared experiment directory, then turn the draft
# into the final config object and render it for inspection.
config.with_project_root_(cwd)
config = config.finalize()
nu.display(config)

# Data module configuration, built as a draft and finalized at the end of
# this block.
data_config = JC.MPTrjAlexOMAT24DataModuleConfig.draft()
# Dataloader settings.
data_config.batch_size = 120
data_config.num_workers = 8
# On-disk locations of the sAlex and OMat24 datasets.
# NOTE(review): these live under /storage, unlike the /net paths used for the
# checkpoint and experiment data — presumably node-local disk; confirm the
# data exists on every machine the jobs are submitted to.
data_config.salex.local_path = Path("/storage/nima/salex-ocp/hf/")
data_config.omat24.local_path = Path("/storage/nima/omat24/hf/")
# Select the "mptrj-salex" linear reference (method mutates the draft in
# place, per the trailing-underscore naming used elsewhere in this notebook).
data_config.with_linear_reference_("mptrj-salex")
# Finalize the draft and render the resulting config for inspection.
data_config = data_config.finalize()
nu.display(data_config)

In [3]:
from jmp.lightning_datamodule import MPTrjAlexOMAT24DataModule
from jmp.lightning_module import Module


def run(config: JC.Config, data_config: JC.MPTrjAlexOMAT24DataModuleConfig):
    """Fit a JMP model on the MPTrj/Alex/OMat24 data module.

    This is the entry point passed to ``nr.Runner`` in the launch cells; it is
    called once per ``(config, data_config)`` pair.

    Args:
        config: Finalized model/trainer configuration; used to build both the
            lightning module and the trainer.
        data_config: Finalized data module configuration.
    """
    # Construction order is kept as model -> data -> trainer.
    lightning_module = Module(config)
    data = MPTrjAlexOMAT24DataModule(data_config)
    nt.Trainer(config).fit(lightning_module, data)

In [7]:
import nshrunner as nr

# Smoke test: derive a fast-dev-run variant of the config and execute it
# locally through the runner before submitting the full job.
# (The 256 is presumably the number of batches to run — confirm in nshtrainer.)
smoke_config = config.fast_dev_run(256)
configs = [(smoke_config, data_config)]

runner_config = nr.RunnerConfig(working_dir=cwd, env=env)
runner = nr.Runner(run, runner_config)
runner.local(configs)

  0%|          | 0/1 [00:00<?, ?it/s]

Seed set to 0
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.


Unrecognized arguments:  dict_keys(['ln', 'dropout', 'replace_scale_factors_with_ln', 'learnable_rbf', 'learnable_rbf_stds', 'unique_basis_per_layer', 'old_gaussian_implementation', 'edge_dropout'])


INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
INFO:jmp.models.gemnet.layers.radial_basis_dynamic_cutoff:[RadialBasis] Using absolute cutoff of 12.0 Angstroms.
CRITICAL:root:Found the following scale factors: [('int_blocks.0.trip_interaction.scale_rbf', 'int_blocks.0.trip_interaction.scale_rbf'), ('int_blocks.0.trip_interaction.scale_cbf_sum', 'int_blocks.0.trip_interaction.scale_cbf_sum'), ('int_blocks.0.quad_interaction.scale_rbf', 'int_blocks.0.quad_interaction.scale_rbf'), ('int_blocks.0.quad_interaction.scale_cbf', 'int_blocks.0.quad_interaction.scale_cbf'), ('int_blocks.0.quad_interaction.scale_sbf_sum', 'int_blocks.0.quad_interaction.scale_sbf_sum'), ('int_blocks.0.atom_edge_interaction.scale_rbf', 'int_blocks.0.atom_edge_interaction.scale_rbf'), ('int_blocks.0.atom_edge_interaction.scale_cbf_sum', 'int

Loading dataset from disk:   0%|          | 0/212 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/212 [00:00<?, ?it/s]

CRITICAL:nshtrainer.callbacks.debug_flag:Fast dev run detected, setting debug flag to True.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
Loading `train_dataloader` to estimate number of stepping batches.


Loading dataset from disk:   0%|          | 0/212 [00:00<?, ?it/s]


  | Name           | Type              | Params | Mode 
-------------------------------------------------------------
0 | backbone       | GemNetOCBackbone  | 38.9 M | train
1 | energy_head    | EnergyOutputHead  | 263 K  | train
2 | force_head     | ForceOutputHead   | 1.1 M  | train
3 | stress_head    | StressOutputHead  | 2.1 M  | train
4 | graph_computer | GraphComputer     | 0      | train
5 | train_metrics  | ForceFieldMetrics | 0      | train
6 | val_metrics    | ForceFieldMetrics | 0      | train
7 | test_metrics   | ForceFieldMetrics | 0      | train
-------------------------------------------------------------
42.3 M    Trainable params
0         Non-trainable params
42.3 M    Total params
169.161   Total estimated model params size (MB)
INFO:nshtrainer.trainer.signal_connector:No auto-requeue signals found. Reverting to default Lightning behavior.


Loading dataset from disk:   0%|          | 0/212 [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/net/csefiles/coc-fung-cluster/nima/miniforge3/envs/jmp-peft/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [4]:
import nshrunner as nr

# Full run: hand the unmodified config to the runner and create a session
# submission. With snapshot=True the resolved modules (here: jmp) are
# snapshotted under the working directory, and the cell output shows the
# `screen` command that launches the jobs.
configs = [(config, data_config)]

runner_config = nr.RunnerConfig(working_dir=cwd, env=env)
runner = nr.Runner(run, runner_config)
runner.session(configs, snapshot=True, env=env)

Resolved the following modules from the provided values: ['jmp']
Snapshotting modules=['jmp'] to /net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/nshsnap
Run the following command to submit the jobs:




screen -S nshrunner -L -Logfile /net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/submit/logs/session.log -U bash /net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/submit/submit.sh




Submission(command_parts=['screen', '-S', 'nshrunner', '-L', '-Logfile', '/net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/submit/logs/session.log', '-U', 'bash', '/net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/submit/submit.sh'], script_path=PosixPath('/net/csefiles/coc-fung-cluster/nima/shared/experiment-data/nshrunner/d3cb659c-513c-40f5-aa8e-d34c36e8e84e/submit/submit.sh'))