<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment
This is a demonstration of how experiments can be run using DagsHub and MLflow.
We will train three different versions of the tiny LLM using different batch sizes and compare the results.

## ToDo
- Shorten the training time for testing
- Run tests in parallel
- Extract functions

## Login to Dagshub
To avoid requirest in the middle of the experiment

In [24]:
import os
%pip install -q dagshub
import dagshub
try:
  from google.colab import userdata
  os.environ["DAGSHUB_USER_TOKEN"] = userdata.get("DAGSHUB_USER_TOKEN")
except:
  pass
try:
  if os.environ["DAGSHUB_USER_TOKEN"]:
    pass
except:
  os.environ["DAGSHUB_USER_TOKEN"] = dagshub.auth.get_token()
dagshub.auth.add_app_token(token=os.environ["DAGSHUB_USER_TOKEN"])



In [25]:
#@title Setup paths
# We could modify these paths to "stub" behavior for test/dev
# A file like .ipython/profile_default/startup/10-test.py could restore these vars
workspaceDir = "/content"
GPTNeoXDirName = "gpt-neox"
GPTNeoXDir = f"{workspaceDir}/{GPTNeoXDirName}"
GPTNeoXColabDirName = "GPT-NeoX-Colab"
GPTNeoXColabDir = f"{workspaceDir}/{GPTNeoXColabDirName}"

In [26]:
%%time
#@title Clone GPT-NeoX-Colab
%cd {workspaceDir}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir}
%pip install -q -r requirements_colab.txt
%pip install -q .
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir}/.env")
import GPTNeoXColab
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")


/content
Cloning into 'GPT-NeoX-Colab'...
remote: Enumerating objects: 1205, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 1205 (delta 81), reused 87 (delta 40), pack-reused 1052 (from 1)[K
Receiving objects: 100% (1205/1205), 13.81 MiB | 16.66 MiB/s, done.
Resolving deltas: 100% (658/658), done.
/content/GPT-NeoX-Colab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for GPTNeoXColab (pyproject.toml) ... [?25l[?25hdone
Data retrieval successful.
Data retrieval successful.
CPU times: user 229 ms, sys: 28.7 ms, total: 258 ms
Wall time: 32.7 s


In [27]:
%%time
#@title Clone GPT-NeoX
%cd {workspaceDir}
!git clone --depth 1 https://github.com/EleutherAI/gpt-neox

/content
fatal: destination path 'gpt-neox' already exists and is not an empty directory.
CPU times: user 5.26 ms, sys: 735 µs, total: 5.99 ms
Wall time: 111 ms


In [28]:
!mkdir -p {GPTNeoXDir}/processed_data
!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.* {GPTNeoXDir}/processed_data

In [29]:
%%time
#@title Load prebuilt Python environment for Colab
import GPTNeoXColab
%cd {workspaceDir}
try:
    from google.colab import userdata
    GPTNeoXColab.utils.colab.download_my_env()
except:
    pass

/content
CPU times: user 9.86 ms, sys: 1.74 ms, total: 11.6 ms
Wall time: 10.5 ms


# Run Experiment

In [30]:
import GPTNeoXColab
import os
from pathlib import Path
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, Path.cwd())

In [31]:
import os
import re

# File to store the last read position (persistence between script runs)
file_position = 0
# Regular expression to match "iteration <number> / <total>"
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")

def read_new_iterations():
    global file_position
    # Open the log file and seek to the last position
    with open(latest_log, "r") as file:
        file.seek(file_position)
        # Read new lines
        new_lines = file.readlines()
        file_position = file.tell()
        # Process lines containing "iteration"
        last_match = None
        for line in new_lines:
            match = iteration_pattern.search(line)
            if match:
                last_match = match
        if last_match:
            # Extract the iteration count from the regex match
            iteration_count = int(last_match.group(1))
            print(f"{iteration_count} iterations")

# Function to check if the process is running
def is_process_running(pid):
    try:
        os.kill(pid, 0)  # Sending signal 0 to check if the process exists
        return True
    except OSError:
        return False


In [38]:
import tempfile
import subprocess
import os
from omegaconf import OmegaConf
from hydra import initialize_config_dir, compose
from hydra.core.global_hydra import GlobalHydra
import mlflow
import time
import dagshub

%cd {GPTNeoXDir}

dagshub.init(repo_owner='MarkNZed', repo_name='GPT-NeoX-Colab', mlflow=True)
#mlflow.set_experiment("GPT-NeoX-Colab-Experiments")
experiment_group = "Testing1"

def load_and_merge_configs(base_conf_dir, experiment_name):
    # Initialize Hydra with the base config directory
    initialize_config_dir(config_dir=base_conf_dir, version_base="1.1")

    # Load the base configurations (shakespeare and shakespeare_deepy) and experiment overrides
    base_cfg = compose(config_name="shakespeare.yml")
    OmegaConf.set_struct(base_cfg, False) # No struct checking for matching structure in merge
    deepy_cfg = compose(config_name="shakespeare_deepy.yml")
    OmegaConf.set_struct(deepy_cfg, False) # No struct checking for matching structure in merge
    experiment_cfg = compose(config_name="hydra", overrides=[f"experiments={experiment_name}"])
    OmegaConf.set_struct(experiment_cfg, False) # No struct checking for matching structure in merge

    experiment_overrides = experiment_cfg.get("experiments", {})
    OmegaConf.set_struct(experiment_overrides, False) # No struct checking for matching structure in merge

    print(experiment_overrides)

    # Merge the configurations: base -> deepy -> experiment
    cfg = OmegaConf.merge(base_cfg, deepy_cfg, experiment_overrides)

    return cfg

def run_experiment(cfg, experiment_name):
    print("Running experiment:", experiment_name)
    experimentDir = f"{GPTNeoXDir}/experiments/{experiment_name}"
    !mkdir -p {experimentDir}
    #print(OmegaConf.to_yaml(cfg))

    # Log parameters
    mlflow.log_params(OmegaConf.to_container(cfg, resolve=True))

    # Create a temporary directory for configs
    temp_config_dir = tempfile.mkdtemp()
    temp_config_file = os.path.join(temp_config_dir, 'temp_config.yml')

    # Save the modified config to the temporary file in JSON-like structure within a YAML file
    with open(temp_config_file, 'w') as f:
        # Dump the config as JSON but save it with a .yml extension
        OmegaConf.save(OmegaConf.create(OmegaConf.to_container(cfg, resolve=True)), f)

    # Start a detached background process using the temp config
    cmd = f"""nohup bash -c "source {workspaceDir}/my_env/bin/activate && \
        cd {GPTNeoXDir} && \
        python ./deepy.py train.py --conf_dir {temp_config_dir} \
        temp_config" & echo $! > train_process.pid"""
    print("Running command:", cmd)
    process = subprocess.Popen(
        cmd,
        shell=True,
        executable='/bin/bash',
        preexec_fn=os.setsid  # Starts the process in a new session
    )

    print("Training initiated.")

    while not os.path.exists("train_process.pid"):
        print("Waiting for train_process.pid to be created...")
        time.sleep(10)  # Check every X seconds

    # Read the PID from the file
    with open("train_process.pid", "r") as f:
        pid = int(f.read().strip())
        print("PID:", pid)

    # Monitor the training process
    while is_process_running(pid):
        read_new_iterations()
        print("Training is still running...")
        time.sleep(30)  # Check every X seconds

    print("Training has finished.")

    !rm "train_process.pid"

    # Clean up the temporary directory after training
    # (Optional: You might want to keep it for debugging)
    # shutil.rmtree(temp_config_dir)

# List of experiment names
experiments = ["experiment1", "experiment2", "experiment3"]

for experiment in experiments:

    exp_id = GPTNeoXColab.utils.ml.get_or_create_experiment_id(experiment_group)
    # Set the experiment name and start the run
    with mlflow.start_run(experiment_id=exp_id, nested=True):
    #with mlflow.start_run():
        # Clear Hydra's global state if it’s already initialized
        if GlobalHydra.instance().is_initialized():
            GlobalHydra.instance().clear()
        # Load and merge configurations
        base_conf_dir = f"{GPTNeoXColabDir}/configs"
        cfg = load_and_merge_configs(base_conf_dir, experiment)
        # Start training with the merged configuration
        run_experiment(cfg, experiment)


/content/gpt-neox


{'train_micro_batch_size_per_gpu': 4, 'train_iters': 5120, 'lr_decay_iters': 5120, 'save': 'experiments/experiment1/checkpoints', 'load': 'experiments/experiment1/checkpoints', 'tensorboard_dir': 'experiments/experiment1/tensorboard', 'log_dir': 'experiments/experiment1/logs'}
Running experiment: experiment1
Running command: nohup bash -c "source /content/my_env/bin/activate &&         cd /content/gpt-neox &&         python ./deepy.py train.py --conf_dir /tmp/tmpmh9ahfkr         temp_config" & echo $! > train_process.pid
Training initiated.
Waiting for train_process.pid to be created...
PID: 18396
Training is still running...
Training has finished.


2024/11/11 21:17:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run respected-elk-542 at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4/runs/2376408010f14388809fb984952a8f5d.
2024/11/11 21:17:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4.


{'train_micro_batch_size_per_gpu': 86, 'train_iters': 320, 'lr_decay_iters': 320, 'save': 'experiments/experiment2/checkpoints', 'load': 'experiments/experiment2/checkpoints', 'tensorboard_dir': 'experiments/experiment2/tensorboard', 'log_dir': 'experiments/experiment2/logs'}
Running experiment: experiment2
Running command: nohup bash -c "source /content/my_env/bin/activate &&         cd /content/gpt-neox &&         python ./deepy.py train.py --conf_dir /tmp/tmpiturlh5u         temp_config" & echo $! > train_process.pid
Training initiated.
Waiting for train_process.pid to be created...
PID: 18681
Training is still running...
Training has finished.


2024/11/11 21:18:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-lynx-330 at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4/runs/9cb30082e2af428ea41b95b47c6da78c.
2024/11/11 21:18:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4.


{'train_micro_batch_size_per_gpu': 256, 'train_iters': 80, 'lr_decay_iters': 80, 'save': 'experiments/experiment3/checkpoints', 'load': 'experiments/experiment3/checkpoints', 'tensorboard_dir': 'experiments/experiment3/tensorboard', 'log_dir': 'experiments/experiment3/logs'}
Running experiment: experiment3
Running command: nohup bash -c "source /content/my_env/bin/activate &&         cd /content/gpt-neox &&         python ./deepy.py train.py --conf_dir /tmp/tmpcm7ytx_h         temp_config" & echo $! > train_process.pid
Training initiated.
Waiting for train_process.pid to be created...
PID: 19094
Training is still running...
Training has finished.


2024/11/11 21:19:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-chimp-539 at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4/runs/a96ec22105bf4cb08ecbf8d21f23193f.
2024/11/11 21:19:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/MarkNZed/GPT-NeoX-Colab.mlflow/#/experiments/4.


In [33]:
#@title Wait until logs directory is created
import time
import os

# Path to the log directory
logs_dir = f"{GPTNeoXDir}/logs"

# Wait for the directory to be created
while not os.path.exists(logs_dir):
    print("Waiting for logs directory to be created...")
    time.sleep(10)  # Check every X seconds

print("logs directory found")

logs directory found


In [34]:
#@title Find the latest log file
import glob
import os

# Define the log directory and pattern for log files
log_dir = f"{GPTNeoXDir}/logs"
log_pattern = os.path.join(log_dir, "*_stdout.txt")

# Get the list of log files that match the pattern
log_files = glob.glob(log_pattern)

# Ensure there are log files in the directory
if log_files:
    # Find the latest log file based on modification time
    latest_log = max(log_files, key=os.path.getmtime)
    print("Latest log file:", latest_log)
else:
    latest_log = None
    print("No log files found.")


Latest log file: /content/gpt-neox/logs/6596c01fd4ce_stdout.txt


In [35]:
# Here we could disconnect from the GPU resource
#from google.colab import runtime
#runtime.unassign()