# Experiment
This notebook demonstrates how to run and track experiments using DagsHub and MLflow.
We will train three different versions of a tiny LLM using different batch sizes and compare the results.

In [None]:
# We could modify these paths to "stub" behavior for test/dev
# Use functions so if the Notebook is restarted we do not lose the values
def workspaceDir():
    """Return the absolute path of the workspace root used by this notebook."""
    workspace_root = "/content"
    return workspace_root
def GPTNeoXDirName():
    """Return the directory name (without parent path) of the GPT-NeoX checkout."""
    checkout_name = "gpt-neox"
    return checkout_name
def GPTNeoXDir():
    """Return the full path of the GPT-NeoX directory inside the workspace."""
    return "/".join((workspaceDir(), GPTNeoXDirName()))
def GPTNeoXColabDirName():
    """Return the directory name (without parent path) of the GPT-NeoX-Colab checkout."""
    checkout_name = "GPT-NeoX-Colab"
    return checkout_name
def GPTNeoXColabDir():
    """Return the full path of the GPT-NeoX-Colab directory inside the workspace."""
    return "/".join((workspaceDir(), GPTNeoXColabDirName()))

In [1]:
import GPTNeoXColab
import os
from pathlib import Path
# Locate the project root through the GPTNeoXColab helper, then express it relative to
# the current working directory. The training cell passes the relative form to Hydra's
# initialize(config_path=...).
# NOTE(review): RELATIVE_ROOT_DIR is computed against the working directory at the time
# this cell runs, while later cells `%cd` elsewhere - re-run this cell if the working
# directory has changed before composing the Hydra config.
# NOTE(review): this cell imports GPTNeoXColab before the clone cell below pip-installs
# it; on a fresh runtime confirm the package is already importable at this point.
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, Path.cwd())

In [None]:
#@title Clone GPT-NeoX-Colab
%%time
%cd {workspaceDir()}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir()}
%pip install -q -r requirements_colab.txt
%pip install -q .
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir()}/.env")
import GPTNeoXColab
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")

In [None]:
#@title Clone GPT-NeoX
%%time
%cd {workspaceDir()}
!git clone --depth 1 https://github.com/EleutherAI/gpt-neox

In [None]:
#@title Load prebuilt Python environment for Colab
%%time
%cd {workspaceDir()}
GPTNeoXColab.utils.colab.download_my_env()

In [2]:
# Name of the experiment config to run; the training cell passes it to Hydra as the
# override `experiments=<experiment_name>`.
experiment_name = "experiment1"  # Change this to dynamically load different experiments

In [5]:
#@title Run the training in a detached background process
import subprocess
import os
import dagshub
import mlflow
from omegaconf import DictConfig, OmegaConf
from hydra.core.global_hydra import GlobalHydra
from hydra import initialize, compose

# Clear Hydra's global state if it is already initialized, so this cell can be re-run
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

initialize(config_path=f"{RELATIVE_ROOT_DIR}/configs", version_base="1.1")

# Compose the "hydra" config, selecting the experiment through the `experiments` group.
# NOTE(review): the recorded output of this notebook shows the selected experiment's
# values nested under an `experiments:` key while the top-level experiment_name stays
# `base_experiment` - confirm the experiment config is meant to merge at the root.
cfg = compose(config_name="hydra", overrides=[f"experiments={experiment_name}"])

# Set MLflow tracking URI for DagsHub
#MLFLOW_TRACKING_URI = f"https://dagshub.com/{cfg.dagshub.repo_owner}/{cfg.dagshub.repo_name}.mlflow"

os.environ["DAGSHUB_USER"] = cfg.dagshub.repo_owner
# Keep a token that is already present in the environment (e.g. loaded from .env by an
# earlier cell) instead of overwriting it with an empty string; only fall back to ""
# when nothing is set. Never paste a real token into the notebook.
os.environ.setdefault("DAGSHUB_TOKEN", "")

# Initialize DagsHub logging
try:
    # Will setup MLFLOW_TRACKING_URI MLFLOW_TRACKING_USERNAME MLFLOW_TRACKING_PASSWORD
    dagshub.init(repo_owner=cfg.dagshub.repo_owner, repo_name=cfg.dagshub.repo_name, mlflow=True)
except Exception as e:
    # Best effort: report the failure and let the rest of the cell continue
    print(f"Failed to initialize DagsHub logging: {e}")

def train_model(cfg: DictConfig):
    """Log the run configuration to MLflow and launch GPT-NeoX training in the background.

    Args:
        cfg: Composed Hydra/OmegaConf configuration. ``cfg.experiment_name`` is printed
            and the whole (resolved) config is logged as MLflow parameters.

    Returns:
        None. Training continues in a detached process after this function returns;
        the PID of the backgrounded command is written to ``train_process.pid`` in the
        current working directory.
    """
    print("Running experiment:", cfg.experiment_name)
    print(OmegaConf.to_yaml(cfg))

    # Log parameters
    mlflow.log_params(OmegaConf.to_container(cfg, resolve=True))

    # The deepy.py script assumes it is running in the root of the GPT-NeoX repo, so
    # change into that directory inside the detached shell instead of relying on
    # whatever working directory the notebook happens to be in.
    train_cmd = (
        f"source {workspaceDir()}/my_env/bin/activate"
        f" && cd {GPTNeoXDir()}"
        f" && python ./deepy.py train.py --conf_dir {GPTNeoXColabDir()}/configs shakespeare shakespeare_deepy"
    )

    # Start a detached background process
    launcher = subprocess.Popen(
        f"nohup bash -c \"{train_cmd}\" & echo $! > train_process.pid",
        shell=True,
        executable='/bin/bash',
        start_new_session=True  # New session, so interrupting the Notebook does not kill the training
    )
    # The launcher shell exits right after backgrounding the job and writing the PID
    # file; waiting here reaps it without waiting for the training itself.
    launcher.wait()

    print("Training started in the background; PID written to train_process.pid")

# Obtain the MLflow experiment id for "tutorial" through the project helper
# (presumably looks the experiment up and creates it when missing - confirm).
exp_id = GPTNeoXColab.utils.ml.get_or_create_experiment_id("tutorial")

# Set the experiment name and start the run
# mlflow.set_experiment(experiment_name)
# The run is opened as a context manager, so it is closed as soon as train_model
# returns - the detached training process keeps running after that.
with mlflow.start_run(experiment_id=exp_id):
    train_model(cfg)


Running experiment: base_experiment
dagshub:
  repo_owner: MarkNZed
  repo_name: GPT-NeoX-Colab
output_dir: ../outputs
experiment_name: base_experiment
seed: 42
model:
  hidden_dim: 64
training:
  batch_size: 32
  learning_rate: 0.001
  epochs: 2
data: null
experiments:
  experiment_name: experiment_1
  seed: 123

Training complete.


2024/11/10 14:11:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run auspicious-cat-512 at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/3/runs/0f5ed2de66614337ac7abfcc92199a0d.
2024/11/10 14:11:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/3.


In [None]:
# Here we could disconnect from the GPU resource
#from google.colab import runtime
#runtime.unassign()

In [6]:
# An example of creating a Data Pipeline diagram for DagsHub repo
# https://dagshub.com/MarkNZed/GPT-NeoX-Colab/src/dvc
# The string below is an illustrative dvc.yaml with three chained stages
# (preprocess -> train -> evaluate); each stage depends on the previous stage's output.

dvc_yaml_content = """
stages:
  preprocess:
    cmd: python preprocess_script.py
    deps:
      - data/raw_data.csv
      - preprocess_script.py
    outs:
      - data/processed_data.csv

  train:
    cmd: python train_script.py
    deps:
      - data/processed_data.csv
      - train_script.py
    outs:
      - models/model.pkl
    params:
      - training.learning_rate

  evaluate:
    cmd: python evaluate_script.py
    deps:
      - models/model.pkl
      - evaluate_script.py
    outs:
      - reports/metrics.json
"""

# Write the YAML content to dvc.yaml
# (left commented out, so running this cell only defines dvc_yaml_content)
#with open(f"{ROOT_DIR}/dvc.yaml", "w") as f:
#    f.write(dvc_yaml_content)
