In [1]:
import GPTNeoXColab
import os
from pathlib import Path
# Locate the project root using the package's Colab helper.
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()

# The same root, expressed relative to the current working directory.
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, start=Path.cwd())

In [2]:
# Option of the Hydra `experiments` config group to compose (passed below as the
# `experiments=<name>` override). Change this to load a different experiment.
experiment_name = "experiment1"

In [None]:
def get_or_create_experiment_id(name):
    """Return the ID of the MLflow experiment called ``name``, creating it if needed.

    Args:
        name: Name of the MLflow experiment to look up.

    Returns:
        The ``experiment_id`` of the existing experiment, or the ID returned by
        ``mlflow.create_experiment`` when no experiment with that name was found.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id

    # Nothing registered under this name yet: create it and hand back the new ID.
    return mlflow.create_experiment(name)

In [5]:
import os
import dagshub
import mlflow
from omegaconf import DictConfig, OmegaConf
from hydra.core.global_hydra import GlobalHydra
from hydra import initialize, compose

# Clear Hydra's global state if it is already initialized
# (presumably so this cell can be re-run in the same kernel -- confirm).
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

# Point Hydra at the project's `configs` directory. RELATIVE_ROOT_DIR is the
# project root expressed relative to the current working directory.
initialize(config_path=f"{RELATIVE_ROOT_DIR}/configs", version_base="1.1")

# Compose the `hydra` config, selecting the `experiments` option named by
# `experiment_name`.
# NOTE(review): the recorded output of this cell shows the selected values nested
# under an `experiments:` key while the top-level `experiment_name` and `seed`
# keep their base values -- confirm this is the intended merge behaviour.
cfg = compose(config_name="hydra", overrides=[f"experiments={experiment_name}"])

# Build the DagsHub-hosted MLflow tracking URI from the repo owner and name in the config
tracking_uri = f"https://dagshub.com/{cfg.dagshub.repo_owner}/{cfg.dagshub.repo_name}.mlflow"
mlflow.set_tracking_uri(tracking_uri)

# Initialize DagsHub logging. This is best-effort: any failure is printed and
# the rest of the cell still runs.
try:
    dagshub.init(repo_owner=cfg.dagshub.repo_owner, repo_name=cfg.dagshub.repo_name, mlflow=True)
except Exception as e:
    print(f"Failed to initialize DagsHub logging: {e}")

def train_model(cfg: DictConfig):
    """Run a placeholder training loop and record it with MLflow.

    Prints the experiment name and the full config as YAML, logs the resolved
    config as MLflow parameters, then logs synthetic ``train_loss`` and
    ``train_accuracy`` metrics for ten epochs. No model is actually trained.

    Args:
        cfg: Composed configuration; ``cfg.experiment_name`` is read for the
            banner line and the whole config is logged.
    """
    print("Running experiment:", cfg.experiment_name)
    print(OmegaConf.to_yaml(cfg))

    # Convert to plain Python containers (interpolations resolved) before logging.
    resolved_params = OmegaConf.to_container(cfg, resolve=True)
    mlflow.log_params(resolved_params)

    # Synthetic metrics: loss drops by 0.01 per epoch, accuracy rises by 0.1.
    for step in range(10):
        dummy_metrics = {
            "train_loss": 0.4 - step * 0.01,
            "train_accuracy": step * 0.1,
        }
        for metric_name, metric_value in dummy_metrics.items():
            mlflow.log_metric(metric_name, metric_value, step=step)

    print("Training complete.")

# Look up the "tutorial" MLflow experiment, creating it on first use.
exp_id = get_or_create_experiment_id("tutorial")

# Open a run in that experiment and execute the training routine inside it.
with mlflow.start_run(experiment_id=exp_id):
    train_model(cfg)


Running experiment: base_experiment
dagshub:
  repo_owner: MarkNZed
  repo_name: GPT-NeoX-Colab
output_dir: ../outputs
experiment_name: base_experiment
seed: 42
model:
  hidden_dim: 64
training:
  batch_size: 32
  learning_rate: 0.001
  epochs: 2
data: null
experiments:
  experiment_name: experiment_1
  seed: 123

Training complete.


2024/11/10 13:46:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run flawless-eel-650 at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/2/runs/901b2fb3c2c44f8a8d77f4657dd7ff21.
2024/11/10 13:46:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/2.


In [6]:
# List the kernel's environment variables with the %env line magic. The explicit
# `%` prefix means the cell does not depend on IPython automagic being enabled.
# NOTE(review): this looks like a leftover debugging cell -- its recorded output
# exposes host paths and the names of credential variables; consider removing
# the cell or clearing its output before sharing the notebook.
%env

{'HOSTNAME': '619e00883352',
 'HOME': '/home/vscode',
 'PYTHONUNBUFFERED': '1',
 'GPG_KEY': '<hidden>',
 'BB_ACCESS_KEY_RW_ID': '<hidden>',
 'PYTHON_SHA256': 'aab0950817735172601879872d937c1e4928a57c409ae02369ec3d91dccebe79',
 'BB_BUCKET': 'GPT-NeoX-Colab',
 'PYTHONDONTWRITEBYTECODE': '1',
 'BB_ENDPOINT': 'https://s3.eu-central-003.backblazeb2.com',
 'PATH': '/usr/local/bin:/home/vscode/.local/bin:/vscode/vscode-server/bin/linux-x64/384ff7382de624fb94dbaf6da11977bba1ecd427/bin/remote-cli:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/vscode/vscode-server/bin/linux-x64/384ff7382de624fb94dbaf6da11977bba1ecd427/bin/remote-cli:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
 'LANG': 'C.UTF-8',
 'SHELL': '/bin/sh',
 'AWS_ACCESS_KEY_ID': '<hidden>',
 'PYTHON_VERSION': '3.10.15',
 'AWS_SECRET_ACCESS_KEY': '<hidden>',
 'PWD': '/vscode/vscode-server/bin/linux-x64/384ff7382de624fb94dbaf6da11977bba1ecd427',
 'VSCODE_CWD': '/vscode/vscode