In [1]:
import main_fair_chem
import argparse
from fairchem.core.common.utils import (
    new_trainer_context,
)
from OptunaTrained import OptunaTrained
from fairchem.core.tasks.task import BaseTask
import optuna
from fairchem.core.common.registry import registry
from optuna.trial import TrialState


@registry.register_task("validate")
class OptunaTasks(BaseTask):
    """Task that hands an Optuna trial to the trainer's training loop.

    The class is registered under the task name ``"validate"``, which is the
    value passed as ``--mode`` when the objective in ``main`` builds its
    arguments.
    """

    def run(self, trial) -> None:
        """Train with ``self.trainer``, forwarding ``trial`` to ``train``.

        Args:
            trial: Object passed straight through to ``self.trainer.train``;
                the caller in this file supplies the Optuna trial.

        Raises:
            RuntimeError: Re-raised unchanged after ``self._process_error``
                (not defined in this class, presumably inherited from
                ``BaseTask``) has been called with it.
        """
        try:
            self.trainer.train(trial)
        except RuntimeError as err:
            # Let the task react to the failure first, then propagate it.
            self._process_error(err)
            raise err


def model_hyperparameters(trial, model):
    """Ask ``trial`` for three hyperparameters and store each on ``model``.

    The values are requested and assigned one at a time, in this order:
    ``num_layers``, ``max_neighbors``, ``attn_hidden_channels``.

    Args:
        trial: Object providing ``suggest_categorical(name, choices)`` and
            ``suggest_int(name, low, high)``.
        model: Object whose ``num_layers``, ``max_neighbors`` and
            ``attn_hidden_channels`` attributes are overwritten.

    Returns:
        None. ``model`` is modified in place.

    NOTE(review): the caller passes a model that has already been built;
    confirm that changing these attributes afterwards actually affects the
    network that gets trained.
    """
    # Categorical choice among four fixed layer counts.
    layer_count = trial.suggest_categorical("num_layers", [6, 8, 10, 12])
    model.num_layers = layer_count

    # Integer suggestion with bounds 15 and 25.
    neighbor_limit = trial.suggest_int("max_neighbors", 15, 25)
    model.max_neighbors = neighbor_limit

    # Categorical choice between two hidden-channel widths.
    attn_width = trial.suggest_categorical("attn_hidden_channels", [64, 96])
    model.attn_hidden_channels = attn_width


def main(
    study_name="equiformer_v2",
    config_yml="config_files/equiformer_v2/equiformer_v2_N@12_L@6_M@2.yml",
    n_trials=2,
    timeout=600,
    storage="sqlite:///optuna.db",
):
    """Create or resume an Optuna study and print a summary of its trials.

    Every trial builds a fresh config, trainer and task from ``config_yml``,
    applies the sampled hyperparameters, trains, and reports the trainer's
    ``best_val_metric`` as the value to minimise.

    Args:
        study_name: Name of the study inside ``storage``. An existing study
            with this name is reused (``load_if_exists=True``).
        config_yml: Path passed as ``--config-yml`` to the argument parser.
        n_trials: Forwarded to ``study.optimize`` as ``n_trials``.
        timeout: Forwarded to ``study.optimize`` as ``timeout``.
        storage: Storage URL handed to ``optuna.create_study``.

    Returns:
        None. Results are printed.
    """

    def evaluation_function(trial):
        """Objective: train once for ``trial`` and return ``best_val_metric``."""
        main_fair_chem.setup_logging()
        parser: argparse.ArgumentParser = main_fair_chem.flags.get_parser()
        args = parser.parse_args(
            ["--mode", "validate", "--config-yml", config_yml]
        )

        config = main_fair_chem.build_config(args, {})
        # Keep setup and training inside the context so that whatever the
        # context does on exit happens after training, not before it.
        with new_trainer_context(config=config) as ctx:
            task = ctx.task
            trainer = ctx.trainer
            task.setup(trainer)
            # NOTE(review): the trainer's model already exists at this point;
            # confirm the attributes set here are picked up by training.
            model_hyperparameters(trial, task.trainer.model)
            task.run(trial)
            return task.trainer.best_val_metric

    print("running")
    study = optuna.create_study(
        direction="minimize",
        storage=storage,
        study_name=study_name,
        load_if_exists=True,
    )
    study.optimize(evaluation_function, n_trials=n_trials, timeout=timeout)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))
    print("Best trial:")
    try:
        best = study.best_trial
    except ValueError:
        # Raised when the study has no completed trial to pick a best from,
        # e.g. every trial failed or was pruned.
        print("  No completed trials.")
        return
    print("  Value: ", best.value)
    print("  Params: ")
    for key, value in best.params.items():
        print("    {}: {}".format(key, value))


# Entry-point guard: start the study only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-09-14 00:05:12,570] Using an existing study with name 'equiformer_v2' instead of creating a new one.


running




2024-09-14 00:05:12 (INFO): Setting env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
2024-09-14 00:05:12 (INFO): Project root: /media/mohammed/Work/anaconda3/envs/fair-chem/lib/python3.11/site-packages/fairchem
2024-09-14 00:05:13 (INFO): amp: false
cmd:
  checkpoint_dir: ./checkpoints/2024-09-14-00-04-48
  commit: null
  identifier: ''
  logs_dir: ./logs/wandb/2024-09-14-00-04-48
  print_every: 10
  results_dir: ./results/2024-09-14-00-04-48
  seed: 0
  timestamp_id: 2024-09-14-00-04-48
  version: 1.1.0
dataset:
  a2g_args:
    r_data_keys:
    - S1_exc
    r_energy: false
    r_forces: false
  format: ase_db
  include_relaxed_energy: false
  keep_in_memory: false
  key_mapping:
    S1_exc: energy
  src: /media/mohammed/Work/FORMED_ML/train/formed_train.db
evaluation_metrics:
  metrics:
    energy:
    - mae
    - mse
  primary_metric: energy_mae
gp_gpus: null
gpus: 1
logger: wandb
loss_functions:
- energy:
    fn: mae
model:
  alpha_drop: 0.1
  attn_activation: silu
  attn_alpha_

2024-09-14 00:05:13 (ERROR): Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohammed-azzouzi15[0m ([33mazzouzi_lab[0m). Use [1m`wandb login --relogin`[0m to force relogin


2024-09-14 00:05:14 (INFO): Loading dataset: ase_db




2024-09-14 00:05:20 (INFO): rank: 0: Sampler created...
2024-09-14 00:05:20 (INFO): Created BalancedBatchSampler with sampler=<fairchem.core.common.data_parallel.StatefulDistributedSampler object at 0x7b5056601cd0>, batch_size=4, drop_last=False




2024-09-14 00:05:20 (INFO): rank: 0: Sampler created...
2024-09-14 00:05:20 (INFO): Created BalancedBatchSampler with sampler=<fairchem.core.common.data_parallel.StatefulDistributedSampler object at 0x7b50567ea290>, batch_size=4, drop_last=False




2024-09-14 00:05:21 (INFO): rank: 0: Sampler created...
2024-09-14 00:05:21 (INFO): Created BalancedBatchSampler with sampler=<fairchem.core.common.data_parallel.StatefulDistributedSampler object at 0x7b504ef34110>, batch_size=4, drop_last=False
2024-09-14 00:05:21 (INFO): Loading model: equiformer_v2
2024-09-14 00:05:22 (INFO): Loaded EquiformerV2 with 77182465 parameters.
{'energy': {'fn': 'mae'}}
2024-09-14 00:05:22 (INFO): Parameters without weight decay:
2024-09-14 00:05:22 (INFO): ['edge_degree_embedding.rad_func.net.0.bias', 'edge_degree_embedding.rad_func.net.1.weight', 'edge_degree_embedding.rad_func.net.1.bias', 'edge_degree_embedding.rad_func.net.3.bias', 'edge_degree_embedding.rad_func.net.4.weight', 'edge_degree_embedding.rad_func.net.4.bias', 'edge_degree_embedding.rad_func.net.6.bias', 'blocks.0.norm_1.affine_weight', 'blocks.0.norm_1.norm_l0.weight', 'blocks.0.norm_1.norm_l0.bias', 'blocks.0.ga.so2_conv_1.fc_m0.bias', 'blocks.0.ga.so2_conv_1.rad_func.net.0.bias', 'block



2024-09-14 00:05:31 (INFO): energy_mae: 3.03e+00, energy_mse: 1.09e+01, loss: 3.03e+00, lr: 8.53e-05, epoch: 1.69e-03, step: 4.00e+01
2024-09-14 00:05:34 (INFO): energy_mae: 2.28e+00, energy_mse: 6.27e+00, loss: 2.28e+00, lr: 8.66e-05, epoch: 2.12e-03, step: 5.00e+01
2024-09-14 00:05:36 (INFO): energy_mae: 2.53e+00, energy_mse: 7.73e+00, loss: 2.53e+00, lr: 8.80e-05, epoch: 2.54e-03, step: 6.00e+01
2024-09-14 00:05:38 (INFO): energy_mae: 1.98e+00, energy_mse: 5.34e+00, loss: 1.98e+00, lr: 8.94e-05, epoch: 2.96e-03, step: 7.00e+01
2024-09-14 00:05:40 (INFO): energy_mae: 2.11e+00, energy_mse: 5.80e+00, loss: 2.11e+00, lr: 9.07e-05, epoch: 3.39e-03, step: 8.00e+01
2024-09-14 00:05:42 (INFO): energy_mae: 1.89e+00, energy_mse: 5.41e+00, loss: 1.89e+00, lr: 9.21e-05, epoch: 3.81e-03, step: 9.00e+01
2024-09-14 00:05:44 (INFO): energy_mae: 2.07e+00, energy_mse: 5.66e+00, loss: 2.07e+00, lr: 9.34e-05, epoch: 4.23e-03, step: 1.00e+02
2024-09-14 00:05:46 (INFO): energy_mae: 1.88e+00, energy_mse: 

[W 2024-09-14 00:09:22,009] Trial 7 failed with parameters: {'num_layers': 10, 'max_neighbors': 17, 'attn_hidden_channels': 96} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/media/mohammed/Work/anaconda3/envs/fair-chem/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_157041/4069694961.py", line 49, in evaluation_function
    task.run(trial)
  File "/tmp/ipykernel_157041/4069694961.py", line 17, in run
    self.trainer.train(
  File "/media/mohammed/Work/anaconda3/envs/fair-chem/lib/python3.11/site-packages/fairchem/core/trainers/ocp_trainer.py", line 157, in train
    out = self._forward(batch)
          ^^^^^^^^^^^^^^^^^^^^
  File "/media/mohammed/Work/anaconda3/envs/fair-chem/lib/python3.11/site-packages/fairchem/core/trainers/ocp_trainer.py", line 246, in _forward
    out = self.model(batch.to(self.device))
    

KeyboardInterrupt: 