In [6]:
import sys
import os

# Define the absolute working directory
base_path = '/kaggle/working'
project_name = 'Double-Pendulum-Simulation'
project_root = os.path.join(base_path, project_name)

# 1. Clone or Pull
if not os.path.exists(project_root):
    os.chdir(base_path)
    !git clone https://github.com/nthday-jpg/Double-Pendulum-Simulation.git
    print("Repository cloned successfully!")
else:
    os.chdir(project_root)
    !git pull
    print("Repository updated successfully!")

# 2. Correctly set the Python Path
# It is vital to add the absolute path to the project root
if project_root not in sys.path:
    sys.path.insert(0, project_root)

os.chdir(base_path)

# 4. Imports
try:
    import torch
    from models.pinn import PINN
    from training.trainer import Trainer
    from data.dataset import get_dataloader
    from utils.config import Config
    print("All modules imported successfully!")
except ImportError as e:
    print(f"Import failed: {e}")
    print(f"Current sys.path: {sys.path}")

Already up to date.
Repository updated successfully!
All modules imported successfully!


In [8]:
cfg = Config(
    hidden_dims=[64, 64],  
    input_dim=1,
    output_dim=2,
    residual_type="lagrangian",
    t_max=5, t_min=0,
    epochs=1000,
    early_stopping_patience = 50,
    batch_size = 128,
    batch_size_collocation = 512,
    use_compile=False
)

input_path = "/kaggle/input/double-pendulum"

In [9]:
from accelerate import notebook_launcher
def train_func(config):  
    model = PINN(cfg)

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    data_loader, colloc_loader, val_loader = get_dataloader(
        data_path=f"{input_path}/trajectory_000.npz",
        parameters_path=f"{input_path}/parameters_000.json",
        config=cfg
    )
    trainer = Trainer(
        model=model,
        config=cfg,
        data_loader=data_loader,
        collocation_loader=colloc_loader,
        val_loader=val_loader,
        optimizer=optimizer
    )

    trainer.train()

In [10]:
notebook_launcher(train_func, args=(cfg,), num_processes=2)

Launching training on 2 CUDAs.
DataLoaders: data_bs=128, colloc_bs=512, workers=4DataLoaders: data_bs=128, colloc_bs=512, workers=4

Starting training for 1000 epochs...
Device: cuda:0
Data batch: 128, Collocation batch: 512


grad.sizes() = [64, 1], strides() = [1, 64]
bucket_view.sizes() = [64, 1], strides() = [1, 1] (Triggered internally at /pytorch/torch/csrc/distributed/c10d/reducer.cpp:334.)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
grad.sizes() = [64, 1], strides() = [1, 64]
bucket_view.sizes() = [64, 1], strides() = [1, 1] (Triggered internally at /pytorch/torch/csrc/distributed/c10d/reducer.cpp:334.)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


  Processing data batch 10...

ERROR at epoch 1
Process rank: 0
Error type: ValueError
Error: too many values to unpack (expected 2)
Traceback:
Traceback (most recent call last):
  File "/kaggle/working/Double-Pendulum-Simulation/training/trainer.py", line 124, in train
    avg_val_loss = self.evaluate(self.val_loader)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/kaggle/working/Double-Pendulum-Simulation/training/trainer.py", line 262, in evaluate
    t, state, point_type = batch  # Changed from t, state = batch
    ^^^^^^^^
ValueError: too many values to unpack (expected 2)


ERROR at epoch 1
Process rank: 1
Error type: ValueError
Error: too many values to unpack (expected 2)
Traceback:
Traceback (most recent call last):
  File "/kaggle/working/Double-Pendulum-Simulation/training/trainer.py", line 124, in train
    avg_val_loss = self.evaluate(self.val_loader)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/kaggle/working/Double-Pendulum-Simulation/training/tr

W0108 16:55:11.919000 55 torch/multiprocessing/spawn.py:169] Terminating process 337 via signal SIGTERM
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737] failed (exitcode: 1) local_rank: 1 (pid: 338) of fn: train_func (start_method: fork)
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737] Traceback (most recent call last):
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737]   File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 692, in _poll
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737]     self._pc.join(-1)
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737]   File "/usr/local/lib/python3.12/dist-packages/torch/multiprocessing/spawn.py", line 215, in join
E0108 16:55:12.053000 55 torch/distributed/elastic/multiprocessing/api.py:737]     raise ProcessRaisedException(msg, error_index, failed_proce

ChildFailedError: 
============================================================
train_func FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2026-01-08_16:55:11
  host      : cf75323b4399
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 338)
  error_file: /tmp/torchelastic_alemeqyu/none_tyycu9nj/attempt_0/1/error.json
  traceback : Traceback (most recent call last):
    File "/usr/local/lib/python3.12/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
      return f(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^
    File "/tmp/ipykernel_55/2445852004.py", line 20, in train_func
      trainer.train()
    File "/kaggle/working/Double-Pendulum-Simulation/training/trainer.py", line 124, in train
      avg_val_loss = self.evaluate(self.val_loader)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/kaggle/working/Double-Pendulum-Simulation/training/trainer.py", line 262, in evaluate
      t, state, point_type = batch  # Changed from t, state = batch
      ^^^^^^^^
  ValueError: too many values to unpack (expected 2)
  
============================================================

In [None]:
!zip -r /kaggle/working/runs 