<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment
This is a demonstration of how experiments can be run using DagsHub and MLflow.
We will train three different versions of the tiny LLM using different batch sizes and compare the results.

## ToDo
- Shorten the training time for testing
- Run tests in parallel
- Extract functions

## Login to Dagshub
Log in up front to avoid authentication requests in the middle of the experiment.

In [None]:
import os
%pip install -q dagshub
import dagshub
try:
  from google.colab import userdata
  os.environ["DAGSHUB_USER_TOKEN"] = userdata.get("DAGSHUB_USER_TOKEN")
except:
  pass
try:
  if os.environ["DAGSHUB_USER_TOKEN"]:
    pass
except:
  os.environ["DAGSHUB_USER_TOKEN"] = dagshub.auth.get_token()
dagshub.auth.add_app_token(token=os.environ["DAGSHUB_USER_TOKEN"])

In [None]:
#@title Setup paths
# These locations can be redirected to "stub" behavior for test/dev;
# a file like .ipython/profile_default/startup/10-test.py could restore them.
workspaceDir = "/content"
GPTNeoXDirName = "gpt-neox"
GPTNeoXColabDirName = "GPT-NeoX-Colab"
# Checkout locations of the two repositories inside the workspace
GPTNeoXDir = "/".join([workspaceDir, GPTNeoXDirName])
GPTNeoXColabDir = "/".join([workspaceDir, GPTNeoXColabDirName])

In [None]:
%%time
#@title Clone GPT-NeoX-Colab
# Clone the companion repository into the workspace, install it, load its .env
# file and fetch the two Shakespeare dataset files (.bin and .idx).
%cd {workspaceDir}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir}
# Install the repository's Colab requirements, then the package from the checkout
%pip install -q -r requirements_colab.txt
%pip install -q .
from dotenv import load_dotenv
import os
# Load environment variables from the repository's .env file
load_dotenv(f"{GPTNeoXColabDir}/.env")
import GPTNeoXColab
# fetch_data is a project helper; presumably it retrieves the annexed file
# contents for the given repository-relative path — TODO confirm
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")


In [None]:
%%time
#@title Clone GPT-NeoX
# Shallow-clone the pipe_parallel_size_1 branch of the markNZed fork into the workspace.
%cd {workspaceDir}
# Upstream alternative, currently disabled:
#!git clone --depth 1 https://github.com/EleutherAI/gpt-neox
!git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git

In [None]:
# Copy the Shakespeare dataset files (shakespeare_text_document.*) from the
# GPT-NeoX-Colab checkout into GPT-NeoX's processed_data directory.
!mkdir -p {GPTNeoXDir}/processed_data
!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.* {GPTNeoXDir}/processed_data

In [None]:
%%time
#@title Load prebuilt Python environment for Colab
import GPTNeoXColab
%cd {workspaceDir}
try:
    from google.colab import userdata
    GPTNeoXColab.utils.colab.download_my_env()
except:
    pass

# Run Experiment

In [None]:
!pip install psutil
# Install this for GPU metric logging
!pip install pynvml

In [None]:
import GPTNeoXColab
import os
from pathlib import Path
# Project root as reported by the GPTNeoXColab helper, plus the same location
# expressed relative to the current working directory.
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, start=Path.cwd())

In [None]:
import os
import re
import glob
import time

# Position (as returned by file.tell()) up to which the log has already been
# consumed by read_new_iterations(). This is an in-memory module global: it
# survives between calls and cell runs, but not a kernel restart.
file_position = 0
# Regular expression to match "iteration <number> / <total>"; group 1 captures <number>
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")

def get_latest_file(dir, pattern = "*_stdout.txt", retry_interval = 10, max_retries = None):
  """Return the most recently modified file in `dir` matching `pattern`.

  While no file matches, the function sleeps and looks again. Polling is done
  in a loop (not by recursion) so a long wait cannot exhaust the call stack.

  Args:
      dir: Directory to search.
      pattern: Glob pattern, relative to `dir`.
      retry_interval: Seconds to sleep between attempts.
      max_retries: Maximum number of retries before giving up; None (the
          default) waits indefinitely, as before.

  Returns:
      Path of the newest matching file, or None if `max_retries` was exhausted.
  """
  glob_pattern = os.path.join(dir, pattern)
  retries = 0
  while True:
      files = glob.glob(glob_pattern)
      if files:
          # Newest by modification time
          file = max(files, key=os.path.getmtime)
          print("Latest file:", file)
          return file
      if max_retries is not None and retries >= max_retries:
          print("No files found. Giving up.")
          return None
      print("No files found. Waiting and retrying.")
      retries += 1
      time.sleep(retry_interval)  # Check every X seconds

def read_new_iterations(latest_log):
    """Print the newest iteration count appended to `latest_log` since the last call.

    The read position is kept in the module-level `file_position`. It is reset
    to the start whenever a different log file is passed in; otherwise the
    offset reached in one experiment's log would be reused for the next
    experiment's (shorter) log and its progress lines would be skipped.

    Args:
        latest_log: Path of the log file to scan.

    Returns:
        The last iteration number found in the newly read lines, or None when
        no new line matched `iteration_pattern`.
    """
    global file_position
    # Remember which file the stored offset belongs to (kept on the function
    # object so no additional module-level name is required).
    if getattr(read_new_iterations, "_tracked_log", None) != latest_log:
        read_new_iterations._tracked_log = latest_log
        file_position = 0
    # Open the log file and continue from the last position
    with open(latest_log, "r") as file:
        file.seek(file_position)
        new_lines = file.readlines()
        file_position = file.tell()
    # Only the most recent match is of interest
    last_match = None
    for line in new_lines:
        match = iteration_pattern.search(line)
        if match:
            last_match = match
    if last_match is None:
        return None
    iteration_count = int(last_match.group(1))
    print(f"{iteration_count} iterations")
    return iteration_count

# Function to check if the process is running
def is_process_running(pid):
    """Return True if a process with the given PID exists.

    Signal 0 performs the existence/permission check without delivering a
    signal. A PermissionError means the process exists but belongs to another
    user, so it is reported as running.

    Args:
        pid: Process ID to probe.

    Returns:
        True if the process exists, False otherwise.
    """
    try:
        os.kill(pid, 0)  # Sending signal 0 to check if the process exists
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process is there; we are just not allowed to signal it.
        return True
    except OSError:
        return False
    return True

In [None]:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import os

def get_scalar_from_tensorboard(file, key):
    """Return the last logged value of scalar `key` from TensorBoard event data.

    Args:
        file: Path handed to `EventAccumulator`.
        key: Scalar tag to look up, e.g. "test/lm_loss".

    Returns:
        The value of the final event recorded for `key`, or None (after
        printing a notice) when the tag is not among the scalar tags.
    """
    accumulator = EventAccumulator(file)
    accumulator.Reload()
    # Show every available tag to ease debugging of a missing key
    print(accumulator.Tags())
    scalar_tags = accumulator.Tags().get('scalars', [])
    if key not in scalar_tags:
        print(f"{key} not found in TensorBoard logs.")
        return None
    history = accumulator.Scalars(key)
    return history[-1].value

In [None]:
# GitPython is used to read the repository's commit, branch and remote URL;
# ipynbname is used to look up this notebook's path for the MLflow source tag.
%pip install GitPython
%pip install ipynbname

In [None]:
from git import Repo

# Record which revision of GPT-NeoX-Colab is checked out; these values are
# attached to every MLflow run as source tags.
repo = Repo(GPTNeoXColabDir)
commit_id = repo.head.commit.hexsha
try:
    branch_name = repo.active_branch.name
except TypeError:
    # GitPython raises TypeError when HEAD is detached (a commit or tag is
    # checked out instead of a branch); the commit id still identifies the code.
    branch_name = "detached-HEAD"
repo_url = next(repo.remotes.origin.urls)
print(f"Commit ID: {commit_id}")

In [None]:
import tempfile
import subprocess
import os
from omegaconf import OmegaConf
from hydra import initialize_config_dir, compose
from hydra.core.global_hydra import GlobalHydra
import mlflow
import time
import dagshub
import ipynbname

# run_experiment() uses paths relative to the working directory
# (train_process.pid), so work from the GPT-NeoX checkout.
%cd {GPTNeoXDir}

# Initialise dagshub for the MarkNZed/GPT-NeoX-Colab repository with its MLflow integration enabled
dagshub.init(repo_owner='MarkNZed', repo_name='GPT-NeoX-Colab', mlflow=True)
# All runs started by this notebook are grouped under this MLflow experiment
experiment_group = "Log only experiment parameters"
mlflow.set_experiment(experiment_group)
# Record system metrics alongside each run
mlflow.enable_system_metrics_logging()

def load_and_merge_configs(base_conf_dir, experiment_name):
    """Compose the training config for one experiment and log its parameters.

    Three configs are composed from `base_conf_dir`: ``shakespeare.yml``,
    ``shakespeare_deepy.yml`` and ``hydra`` with the override
    ``experiments=<experiment_name>``. The composed experiment config is logged
    to the active MLflow run, and its ``experiments`` node is merged on top of
    the two base configs.

    Args:
        base_conf_dir: Absolute path of the Hydra config directory.
        experiment_name: Name of the option to select in the ``experiments`` group.

    Returns:
        The merged OmegaConf config (base -> deepy -> experiment overrides).
    """
    # Hydra refuses to be initialised twice; clear any previous global state so
    # the function can be called once per experiment without caller-side setup.
    if GlobalHydra.instance().is_initialized():
        GlobalHydra.instance().clear()
    initialize_config_dir(config_dir=base_conf_dir, version_base="1.1")

    # Struct mode is switched off on every part so the merge may add keys that
    # the other parts do not define.
    base_cfg = compose(config_name="shakespeare.yml")
    OmegaConf.set_struct(base_cfg, False)
    deepy_cfg = compose(config_name="shakespeare_deepy.yml")
    OmegaConf.set_struct(deepy_cfg, False)
    experiment_cfg = compose(config_name="hydra", overrides=[f"experiments={experiment_name}"])
    OmegaConf.set_struct(experiment_cfg, False)

    # Only the experiment-specific config is logged as MLflow parameters
    mlflow.log_params(OmegaConf.to_container(experiment_cfg, resolve=True))

    experiment_overrides = experiment_cfg.get("experiments")
    if experiment_overrides is None:
        # Use an empty config node (not a plain dict) so set_struct and merge still work
        experiment_overrides = OmegaConf.create()
    OmegaConf.set_struct(experiment_overrides, False)

    print(experiment_overrides)

    # Merge the configurations: base -> deepy -> experiment
    cfg = OmegaConf.merge(base_cfg, deepy_cfg, experiment_overrides)

    return cfg

def run_experiment(cfg, experiment_name):
    print("Running experiment:", experiment_name)
    experimentDir = f"{GPTNeoXDir}/experiments/{experiment_name}"
    !sudo rm -rf {experimentDir}
    !mkdir -p {experimentDir}
    !rm -f train_process.pid
    #print(OmegaConf.to_yaml(cfg))

    # Create a temporary directory for configs
    temp_config_dir = tempfile.mkdtemp()
    temp_config_file = os.path.join(temp_config_dir, 'temp_config.yml')

    # Save the modified config to the temporary file in JSON-like structure within a YAML file
    with open(temp_config_file, 'w') as f:
        # Dump the config as JSON but save it with a .yml extension
        OmegaConf.save(OmegaConf.create(OmegaConf.to_container(cfg, resolve=True)), f)

    # Start a detached background process using the temp config
    cmd = f"""nohup bash -c "source {workspaceDir}/my_env/bin/activate && \
        cd {GPTNeoXDir} && \
        python ./deepy.py train.py --conf_dir {temp_config_dir} \
        temp_config" & echo $! > train_process.pid"""
    print("Running command:", cmd)
    process = subprocess.Popen(
        cmd,
        shell=True,
        executable='/bin/bash',
        preexec_fn=os.setsid  # Starts the process in a new session
    )

    print("Training initiated.")

    while not os.path.exists("train_process.pid"):
        print("Waiting for train_process.pid to be created...")
        time.sleep(10)  # Check every X seconds

    # Read the PID from the file
    with open("train_process.pid", "r") as f:
        pid = int(f.read().strip())
        print("Found train_process.pid ", pid)

    while not os.path.exists(f"{experimentDir}/logs"):
        print("Waiting for logs to be created...")
        time.sleep(10)  # Check every X seconds

    latest_log = get_latest_file(f"{experimentDir}/logs", "*_stdout.txt")

    # Monitor the training process
    while is_process_running(pid):
        read_new_iterations(latest_log)
        print("Training is still running...")
        time.sleep(30)  # Check every X seconds

    print("Training has finished.")

    latest_events_file = get_latest_file(f"{experimentDir}/tensorboard", "events.out.tfevents.*")
    loss_key = "test/lm_loss"
    loss = get_scalar_from_tensorboard(latest_events_file, loss_key)
    print(f"Logging metric {loss_key} {loss}")
    mlflow.log_metric(loss_key, loss)

    # Clean up the temporary directory after training
    # (Optional: You might want to keep it for debugging)
    # shutil.rmtree(temp_config_dir)

try:
    notebook_path = str(ipynbname.path())
except Exception:
    # Fall back to the notebook's file name when ipynbname cannot determine the path
    notebook_path = "shakespeare_experiment.ipynb"

# List of experiment names
experiments = ["experiment1", "experiment2", "experiment3"]

# Loop-invariant: one tracking client and one config directory serve every run
client = mlflow.tracking.MlflowClient()
base_conf_dir = f"{GPTNeoXColabDir}/configs"

for experiment in experiments:

    # Naming the run after the experiment makes the runs easy to tell apart in the MLflow UI
    with mlflow.start_run(run_name=experiment) as run:
        run_id = run.info.run_id
        # https://mlflow.org/docs/latest/tracking/tracking-api.html#system-tags
        source_tags = {
            "mlflow.source.git.commit": commit_id,
            "mlflow.source.git.branch": branch_name,
            "mlflow.source.git.repoURL": repo_url,
            "mlflow.source.type": "NOTEBOOK",
            "mlflow.source.name": notebook_path,
        }
        for tag_key, tag_value in source_tags.items():
            client.set_tag(run_id, tag_key, tag_value)
        # Clear Hydra's global state if it's already initialized
        if GlobalHydra.instance().is_initialized():
            GlobalHydra.instance().clear()
        # Load and merge configurations
        cfg = load_and_merge_configs(base_conf_dir, experiment)
        # Start training with the merged configuration
        run_experiment(cfg, experiment)


In [None]:
# Here we could disconnect from the GPU resource
try:
    from google.colab import runtime
except ImportError:
    # Like the other cells, tolerate being run outside Colab.
    print("Not running on Colab: no runtime to release.")
else:
    runtime.unassign()