In [12]:
import os

from google.colab import userdata

# Resolve the MLflow/DagsHub password before touching os.environ, so the
# environment variable is only ever assigned a real, non-empty string.
try:
    mlflow_password = userdata.get("MLFLOW_TRACKING_PASSWORD")
except userdata.SecretNotFoundError:
    # The secret is not defined for this notebook; use the fallback below.
    mlflow_password = None

if not mlflow_password:
    # No usable Colab secret: fall back to DagsHub's own token lookup/login.
    # Imported here because no earlier cell imports dagshub, and the returned
    # token is kept instead of being thrown away.
    import dagshub.auth

    mlflow_password = dagshub.auth.get_token()

os.environ["MLFLOW_TRACKING_PASSWORD"] = mlflow_password


# Experiment
This is a demonstration of how experiments can be run using DagsHub and MLflow.
We will train three different versions of the tiny LLM using different batch sizes and compare the results.

In [1]:
# Path helpers for the Colab workspace.
# These could be modified to "stub" behavior for test/dev. They are written as
# functions so that the values are not lost if the Notebook is restarted.
def workspaceDir():
    """Return the directory that everything is cloned and unpacked into."""
    return "/content"


def GPTNeoXDirName():
    """Return the directory name of the GPT-NeoX checkout."""
    return "gpt-neox"


def GPTNeoXDir():
    """Return the path of the GPT-NeoX checkout inside the workspace."""
    return "/".join((workspaceDir(), GPTNeoXDirName()))


def GPTNeoXColabDirName():
    """Return the directory name of the GPT-NeoX-Colab checkout."""
    return "GPT-NeoX-Colab"


def GPTNeoXColabDir():
    """Return the path of the GPT-NeoX-Colab checkout inside the workspace."""
    return "/".join((workspaceDir(), GPTNeoXColabDirName()))

In [2]:
#@title Clone GPT-NeoX-Colab
%%time
# Clone from inside the workspace directory so the checkout ends up at GPTNeoXColabDir().
%cd {workspaceDir()}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
# Install the Colab requirements and then the project package itself from the checkout.
%cd {GPTNeoXColabDir()}
%pip install -q -r requirements_colab.txt
%pip install -q .
# Load environment variables from the repository's .env file.
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir()}/.env")
# Import the freshly installed package and fetch the Shakespeare dataset
# .bin/.idx files through its helper (presumably a download; see fetch_data).
import GPTNeoXColab
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")

/content
Cloning into 'GPT-NeoX-Colab'...
remote: Enumerating objects: 1116, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 1116 (delta 27), reused 34 (delta 14), pack-reused 1052 (from 1)[K
Receiving objects: 100% (1116/1116), 13.77 MiB | 19.59 MiB/s, done.
Resolving deltas: 100% (604/604), done.
/content/GPT-NeoX-Colab
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86

In [3]:
#@title Clone GPT-NeoX
%%time
# Shallow-clone (--depth 1) the upstream GPT-NeoX repository into the
# workspace directory, producing the checkout that GPTNeoXDir() points at.
%cd {workspaceDir()}
!git clone --depth 1 https://github.com/EleutherAI/gpt-neox

/content
Cloning into 'gpt-neox'...
remote: Enumerating objects: 296, done.[K
remote: Counting objects: 100% (296/296), done.[K
remote: Compressing objects: 100% (231/231), done.[K
remote: Total 296 (delta 74), reused 136 (delta 43), pack-reused 0 (from 0)[K
Receiving objects: 100% (296/296), 2.50 MiB | 6.27 MiB/s, done.
Resolving deltas: 100% (74/74), done.
CPU times: user 17.3 ms, sys: 657 µs, total: 18 ms
Wall time: 1.02 s


In [4]:
#@title Load prebuilt Python environment for Colab
%%time
%cd {workspaceDir()}
# Per the recorded output, this downloads, unzips and untars my_env.tar.gz
# (about 2.5 minutes wall time). The training cell later activates the
# environment from {workspaceDir()}/my_env/bin/activate.
GPTNeoXColab.utils.colab.download_my_env()

/content
Downloading my_env.tar.gz
Unzipping my_env.tar.gz
Untarring my_env.tar.gz
CPU times: user 10.5 s, sys: 8.51 s, total: 19 s
Wall time: 2min 30s


In [5]:
import GPTNeoXColab
import os
from pathlib import Path
# Locate the project root through the GPTNeoXColab helper, then express it
# relative to the current working directory. The relative form is what the
# Hydra config_path is built from later in the notebook.
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, start=Path.cwd())

In [6]:
# Name of the experiment configuration to run. It is passed to Hydra as the
# `experiments=<name>` override when the config is composed.
experiment_name = "experiment1"  # Change this to dynamically load different experiments

In [13]:
#@title Run the training in a detached background process
import subprocess
import os
import dagshub
import mlflow
from omegaconf import DictConfig, OmegaConf
from hydra.core.global_hydra import GlobalHydra
from hydra import initialize, compose

# Reset Hydra's global state when it is already initialized, so this cell can
# be re-run within the same kernel.
hydra_singleton = GlobalHydra.instance()
if hydra_singleton.is_initialized():
    hydra_singleton.clear()

# Point Hydra at the project's configs directory and compose the config for
# the selected experiment.
initialize(config_path=f"{RELATIVE_ROOT_DIR}/configs", version_base="1.1")
cfg = compose(
    config_name="hydra",
    overrides=[f"experiments={experiment_name}"],
)

# Set MLflow tracking URI for DagsHub
#MLFLOW_TRACKING_URI = f"https://dagshub.com/{cfg.dagshub.repo_owner}/{cfg.dagshub.repo_name}.mlflow"

# DagsHub settings from the composed config; the token variable is set to an
# empty string.
dagshub_cfg = cfg.dagshub
os.environ["DAGSHUB_USER"] = dagshub_cfg.repo_owner
os.environ["DAGSHUB_TOKEN"] = ""

# Initialize DagsHub logging (best effort: a failure is reported, not raised)
try:
    # Will setup MLFLOW_TRACKING_URI MLFLOW_TRACKING_USERNAME MLFLOW_TRACKING_PASSWORD
    dagshub.init(
        repo_owner=dagshub_cfg.repo_owner,
        repo_name=dagshub_cfg.repo_name,
        mlflow=True,
    )
except Exception as init_error:
    print(f"Failed to initialize DagsHub logging: {init_error}")

def train_model(cfg: DictConfig):
    """Log the experiment config to MLflow and launch GPT-NeoX training in the background.

    Training is started as a detached background process, so this function
    returns once the launcher shell has started it; it does not wait for
    training to finish. The PID of the background job is written to
    ``train_process.pid`` in the notebook's current working directory.

    Args:
        cfg: Composed Hydra configuration. ``cfg.experiment_name`` is printed
            and the fully resolved config is logged as MLflow parameters.
    """
    print("Running experiment:", cfg.experiment_name)
    print(OmegaConf.to_yaml(cfg))

    # Log parameters
    mlflow.log_params(OmegaConf.to_container(cfg, resolve=True))

    # The deepy.py script assumes it is running in the root of the GPT-NeoX
    # repo, so the inner shell changes into that directory first. Only the
    # inner `bash -c` command changes directory: train_process.pid is still
    # written relative to the notebook's working directory.
    train_command = (
        f"cd {GPTNeoXDir()} && "
        f"source {workspaceDir()}/my_env/bin/activate && "
        f"python ./deepy.py train.py --conf_dir {GPTNeoXColabDir()}/configs shakespeare shakespeare_deepy"
    )

    # Start a detached background process
    launcher = subprocess.Popen(
        f"nohup bash -c \"{train_command}\" & echo $! > train_process.pid",
        shell=True,
        executable='/bin/bash',
        # New session (setsid) so interrupting the Notebook does not kill the training
        start_new_session=True,
    )
    # The launcher shell exits right after backgrounding the job and writing
    # the PID file; waiting for it guarantees the file exists on return.
    launcher.wait()

    print("Training started in the background (PID saved in train_process.pid).")

# Resolve the MLflow experiment id for the experiment named "tutorial" via the
# project helper.
# NOTE(review): the name is hard-coded here rather than taken from
# experiment_name — confirm this is intended.
exp_id = GPTNeoXColab.utils.ml.get_or_create_experiment_id("tutorial")

# Set the experiment name and start the run
# mlflow.set_experiment(experiment_name)
# The run is only open for the duration of the with-block, i.e. while
# train_model() logs the parameters and launches the background process.
with mlflow.start_run(experiment_id=exp_id):
    train_model(cfg)


NameError: name 'mlflow' is not defined

In [None]:
# Here we could disconnect from the GPU resource
#from google.colab import runtime
#runtime.unassign()

In [None]:
# An example of creating a Data Pipeline diagram for DagsHub repo
# https://dagshub.com/MarkNZed/GPT-NeoX-Colab/src/dvc

# Example dvc.yaml content with three stages (preprocess -> train -> evaluate).
# Each stage lists the command it runs (cmd), its inputs (deps) and its
# outputs (outs); the train stage also declares a tracked parameter.
# The script and data paths are illustrative placeholders.
dvc_yaml_content = """
stages:
  preprocess:
    cmd: python preprocess_script.py
    deps:
      - data/raw_data.csv
      - preprocess_script.py
    outs:
      - data/processed_data.csv

  train:
    cmd: python train_script.py
    deps:
      - data/processed_data.csv
      - train_script.py
    outs:
      - models/model.pkl
    params:
      - training.learning_rate

  evaluate:
    cmd: python evaluate_script.py
    deps:
      - models/model.pkl
      - evaluate_script.py
    outs:
      - reports/metrics.json
"""

# Write the YAML content to dvc.yaml
#with open(f"{ROOT_DIR}/dvc.yaml", "w") as f:
#    f.write(dvc_yaml_content)
