<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment
This is a demonstration of how experiments can be run using DagsHub and MLflow.
We will train three different versions of the tiny LLM using different batch sizes and compare the results.

## ToDo
- Shorten the training time for testing
- Run tests in parallel
- Extract functions

## Login to Dagshub
Log in up front to avoid authentication requests in the middle of the experiment.

In [1]:
import os
%pip install -q dagshub
import dagshub
# Resolve the DagsHub token in this order:
#   1. the Colab secret store (only available when running on Colab),
#   2. a DAGSHUB_USER_TOKEN value already present in the environment,
#   3. DagsHub's own login flow via dagshub.auth.get_token().
try:
  from google.colab import userdata
  #os.environ["MLFLOW_TRACKING_PASSWORD"] = userdata.get("MLFLOW_TRACKING_PASSWORD")
  #os.environ["MLFLOW_TRACKING_USERNAME"] = userdata.get("MLFLOW_TRACKING_USERNAME")
  colab_token = userdata.get("DAGSHUB_USER_TOKEN")
except ImportError:
  # Not running on Colab; fall through to the environment / login flow.
  colab_token = None
except Exception as err:
  # Best effort: a missing or inaccessible secret must not abort the notebook,
  # but say so instead of failing silently. Only the error is printed, never a token.
  print(f"Could not read DAGSHUB_USER_TOKEN from Colab secrets: {err}")
  colab_token = None

if colab_token:
  os.environ["DAGSHUB_USER_TOKEN"] = colab_token

# Treat an unset AND an empty variable as "no token"; the original only
# handled the unset case and would pass an empty token on.
if not os.environ.get("DAGSHUB_USER_TOKEN"):
  os.environ["DAGSHUB_USER_TOKEN"] = dagshub.auth.get_token()

dagshub.auth.add_app_token(token=os.environ["DAGSHUB_USER_TOKEN"])

Note: you may need to restart the kernel to use updated packages.


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=96ada6c0-0b07-4bf9-bc52-a8ca0b1f5f85&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=1bb2754ca162081ab8af4c80df6a94b990531fc7ade51e77b7a1a9ba409f1a25




The added token already exists in the token cache, skipping


In [2]:
#@title Setup paths
# Every location used by the notebook is derived from workspaceDir, so a
# test/dev setup can "stub" behavior by overriding these variables (for example
# from an IPython startup file such as .ipython/profile_default/startup/10-test.py).
workspaceDir = "/content"
GPTNeoXDirName, GPTNeoXColabDirName = "gpt-neox", "GPT-NeoX-Colab"
# Checkout directories of the two repositories inside the workspace.
GPTNeoXDir = "/".join((workspaceDir, GPTNeoXDirName))
GPTNeoXColabDir = "/".join((workspaceDir, GPTNeoXColabDirName))

In [3]:
%%time
#@title Clone GPT-NeoX-Colab
# Clone the GPT-NeoX-Colab repository into the workspace, install it into the
# running kernel, load its .env file and fetch the Shakespeare dataset files.
%cd {workspaceDir}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir}
# Install the Colab requirements first, then the package itself from the checkout.
%pip install -q -r requirements_colab.txt
%pip install -q .
from dotenv import load_dotenv
import os
# Load variables from the repository's .env file into the process environment.
load_dotenv(f"{GPTNeoXColabDir}/.env")
# Import only after `%pip install -q .` above has made the package available.
import GPTNeoXColab
# Fetch the two dataset files (.bin and .idx) used for training;
# presumably a tokenized data/index pair - confirm against fetch_data.
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")


/content
Cloning into 'GPT-NeoX-Colab'...


remote: Enumerating objects: 1168, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 1168 (delta 61), reused 63 (delta 31), pack-reused 1052 (from 1)[K
Receiving objects: 100% (1168/1168), 13.80 MiB | 3.16 MiB/s, done.
Resolving deltas: 100% (638/638), done.
/content/GPT-NeoX-Colab
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Data retrieval successful.
Data retrieval successful.
CPU times: user 12.4 s, sys: 1.23 s, total: 13.6 s
Wall time: 1min


In [4]:
%%time
#@title Clone GPT-NeoX
# Shallow-clone (--depth 1) the upstream GPT-NeoX repository into the workspace.
%cd {workspaceDir}
!git clone --depth 1 https://github.com/EleutherAI/gpt-neox

/content
Cloning into 'gpt-neox'...


remote: Enumerating objects: 296, done.[K
remote: Counting objects: 100% (296/296), done.[K
remote: Compressing objects: 100% (231/231), done.[K
remote: Total 296 (delta 74), reused 134 (delta 43), pack-reused 0 (from 0)[K
Receiving objects: 100% (296/296), 2.50 MiB | 2.12 MiB/s, done.
Resolving deltas: 100% (74/74), done.
CPU times: user 71.1 ms, sys: 38.9 ms, total: 110 ms
Wall time: 2.22 s


In [5]:
# Copy the Shakespeare dataset files (shakespeare_text_document.*) from the
# GPT-NeoX-Colab checkout into GPT-NeoX's processed_data directory, creating
# that directory if needed.
!mkdir -p {GPTNeoXDir}/processed_data
!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.* {GPTNeoXDir}/processed_data

In [None]:
%%time
#@title Load prebuilt Python environment for Colab
import GPTNeoXColab
%cd {workspaceDir}
# The prebuilt environment is only downloaded on Colab; importing
# google.colab serves as the platform check (userdata itself is not used).
try:
    from google.colab import userdata
except ImportError:
    print("Not running on Colab; skipping prebuilt environment download.")
else:
    try:
        # Presumably provides the environment that training later activates
        # from {workspaceDir}/my_env - confirm against download_my_env.
        GPTNeoXColab.utils.colab.download_my_env()
    except Exception as err:
        # Still best effort, but report the failure: a silent miss here only
        # surfaces later as a missing my_env/bin/activate when training starts.
        print(f"Prebuilt environment download failed: {err}")

/content
CPU times: user 3.3 ms, sys: 1.93 ms, total: 5.23 ms
Wall time: 3.72 ms


# Run Experiment

In [7]:
import GPTNeoXColab
import os
from pathlib import Path
# Project root as reported by the GPT-NeoX-Colab helper.
ROOT_DIR = GPTNeoXColab.utils.colab.find_project_root()
# The same location expressed relative to the directory the kernel is in right now.
RELATIVE_ROOT_DIR = os.path.relpath(ROOT_DIR, start=Path.cwd())

In [12]:
import tempfile
import subprocess
import os
from omegaconf import OmegaConf
from hydra import initialize, compose
from hydra.core.global_hydra import GlobalHydra
import mlflow

# Work from the GPT-NeoX checkout: run_experiment launches ./deepy.py and
# writes train_process.pid using paths relative to the current directory.
%cd {GPTNeoXDir}

def load_and_merge_configs(base_conf_dir, experiment_name):
    """Compose the base configs and one experiment override into a single config.

    Three configs are composed from ``base_conf_dir`` - ``shakespeare.yml``,
    ``shakespeare_deepy.yml`` and ``hydra`` with ``experiments=<experiment_name>``
    - and merged in that order, so later ones win on conflicting keys.

    Hydra is initialized through its context manager, so the global Hydra
    state is restored when this function returns and the function can be
    called repeatedly without the caller clearing ``GlobalHydra`` in between.

    Args:
        base_conf_dir: Config directory, as accepted by ``hydra.initialize``
            (a relative path).
        experiment_name: Name of the option in the ``experiments`` config group.

    Returns:
        The merged OmegaConf config.

    Raises:
        hydra.errors.MissingConfigException: If a config, or the requested
            ``experiments`` option, cannot be found in ``base_conf_dir``.
    """
    with initialize(config_path=base_conf_dir, version_base="1.1"):
        base_cfg = compose(config_name="shakespeare.yml")
        deepy_cfg = compose(config_name="shakespeare_deepy.yml")
        experiment_cfg = compose(config_name="hydra", overrides=[f"experiments={experiment_name}"])

    # Disable struct mode so the merge may introduce keys that are absent
    # from the other configs.
    for part in (base_cfg, deepy_cfg, experiment_cfg):
        OmegaConf.set_struct(part, False)

    # Merge order defines precedence: base -> deepy -> experiment.
    return OmegaConf.merge(base_cfg, deepy_cfg, experiment_cfg)

def run_experiment(cfg):
    """Log ``cfg`` to MLflow and launch GPT-NeoX training with it in the background.

    The resolved config is logged as parameters on the active MLflow run and
    written to ``temp_config.yml`` in a fresh temporary directory. Training is
    then started through a detached ``nohup`` shell command that activates
    ``{workspaceDir}/my_env`` and runs ``deepy.py train.py`` against that
    directory. The PID the shell reports for the background job is written to
    ``train_process.pid`` in the current working directory.

    Args:
        cfg: OmegaConf config to train with; must provide ``experiment_name``.

    Returns:
        None. Training keeps running after this function returns.
    """
    print("Running experiment:", cfg.experiment_name)

    # Resolve interpolations once; the same plain container is logged and saved.
    resolved_cfg = OmegaConf.to_container(cfg, resolve=True)

    # Log parameters
    mlflow.log_params(resolved_cfg)

    # Create a temporary directory for configs
    temp_config_dir = tempfile.mkdtemp()
    temp_config_file = os.path.join(temp_config_dir, 'temp_config.yml')

    # Write the fully resolved config as YAML for deepy.py to pick up.
    with open(temp_config_file, 'w') as f:
        OmegaConf.save(OmegaConf.create(resolved_cfg), f)

    # Start a detached background process using the temp config.
    # start_new_session=True performs the setsid() call that
    # preexec_fn=os.setsid used to, without the caveats of preexec_fn.
    process = subprocess.Popen(
        f"""nohup bash -c "source {workspaceDir}/my_env/bin/activate && \
        python ./deepy.py train.py --conf_dir {temp_config_dir} \
        temp_config" & echo $! > train_process.pid""",
        shell=True,
        executable='/bin/bash',
        start_new_session=True
    )
    # Only the short-lived launcher shell is awaited: it backgrounds the
    # training job, writes the PID file and exits. Waiting reaps it and
    # guarantees train_process.pid exists once this function returns.
    process.wait()

    print("Training initiated.")

    # The temporary directory is deliberately left in place: the background
    # training process reads the config after this function has returned.

# List of experiment names. Each name must exist as an option in the
# `experiments` config group under configs/, otherwise composing the config
# raises MissingConfigException.
experiments = ["experiment1", "experiment2", "experiment3"]

# Loop-invariant values: resolve the MLflow experiment and the config
# directory once instead of on every iteration.
exp_id = GPTNeoXColab.utils.ml.get_or_create_experiment_id("GPT-NeoX-Colab-Experiments")
base_conf_dir = f"{RELATIVE_ROOT_DIR}/configs"

# NOTE(review): run_experiment returns as soon as training is launched, so the
# MLflow run closes and the next experiment starts while the previous training
# is still running, and each launch overwrites train_process.pid - confirm
# this is intended.
for experiment in experiments:
    # Name the run after the experiment so the runs can be told apart in MLflow.
    with mlflow.start_run(experiment_id=exp_id, run_name=experiment, nested=True):
        # Clear Hydra's global state if it's already initialized
        if GlobalHydra.instance().is_initialized():
            GlobalHydra.instance().clear()

        # Load and merge configurations, then start training with the result
        cfg = load_and_merge_configs(base_conf_dir, experiment)
        run_experiment(cfg)


/content/gpt-neox
Running experiment: base_experiment
Training initiated.


bash: line 1: /content/my_env/bin/activate: No such file or directory


MissingConfigException: In 'hydra': Could not find 'experiments/experiment2'

Available options in 'experiments':
	experiment1
Config search path:
	provider=hydra, path=pkg://hydra.conf
	provider=main, path=file:///workspace/notebooks/GPT-NeoX-Colab/configs
	provider=schema, path=structured://

In [None]:
#@title Wait until logs directory is created
import time
import os

# Path to the log directory
logs_dir = f"{GPTNeoXDir}/logs"

# Poll until the directory exists.
# NOTE(review): there is no timeout, so this loops forever if training never
# starts; interrupt the cell in that case.
while not os.path.exists(logs_dir):
    print("Waiting for logs directory to be created...")
    time.sleep(10)  # Check every 10 seconds

print("Logs directory found")

ogs directory found


In [None]:
#@title Find the latest log file
import glob
import os

# Candidate logs are the files in the GPT-NeoX logs directory whose names
# end in "_stdout.txt".
log_dir = f"{GPTNeoXDir}/logs"
log_pattern = os.path.join(log_dir, "*_stdout.txt")
log_files = glob.glob(log_pattern)

# The most recently modified candidate wins; None means nothing matched.
latest_log = max(log_files, key=os.path.getmtime, default=None)
if latest_log is None:
    print("No log files found.")
else:
    print("Latest log file:", latest_log)


Latest log file: /content/gpt-neox/logs/7883f81576c3_stdout.txt


In [None]:
#@title Read the latest log file and extract the iteration count
import time
import os
import re

# Offset into the log file up to which content has already been consumed.
# Kept at module level so successive calls continue where the last one stopped.
file_position = 0
# Matches "iteration <number> / <total>" and captures <number>.
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")

def read_new_iterations():
    """Print the newest iteration count appended to the log since the last call.

    Reads ``latest_log`` from the remembered offset to the end, advances the
    offset, and prints ``"<n> iterations"`` for the last line that matches
    ``iteration_pattern``. Prints nothing when no new line matches.
    """
    global file_position
    with open(latest_log, "r") as log_handle:
        log_handle.seek(file_position)
        fresh_lines = log_handle.readlines()
        file_position = log_handle.tell()

    # Scan from the end: the first hit going backwards is the latest report.
    for text in reversed(fresh_lines):
        hit = iteration_pattern.search(text)
        if hit is not None:
            iteration_count = int(hit.group(1))
            print(f"{iteration_count} iterations")
            break

# Load the PID that the launch step recorded in train_process.pid
# (looked up relative to the current working directory).
with open("train_process.pid", "r") as f:
    pid = int(f.read().strip())
print("PID:", pid)

# Function to check if the process is running
def is_process_running(pid):
    """Return True if a process with the given PID currently exists.

    Probes the process with signal 0, which performs the existence and
    permission checks without delivering a signal.

    Args:
        pid: Process ID to probe. Values <= 0 address process groups rather
            than a single process and are reported as not running.

    Returns:
        True if the process exists (even when it belongs to another user),
        False otherwise.
    """
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)  # Signal 0: check only, nothing is delivered
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but we are not allowed to signal it.
        return True
    except OSError:
        return False
    return True

# Monitor the training process: report progress while the recorded PID is alive.
while is_process_running(pid):
    read_new_iterations()
    print("Training is still running...")
    time.sleep(30)  # Check every 30 seconds

# Report anything that was logged between the last poll and process exit,
# which the loop above would otherwise never print.
read_new_iterations()
print("Training has finished.")


PID: 12077
800 iterations
Training is still running...
Training has finished.


In [None]:
# Here we could disconnect from the GPU resource
#from google.colab import runtime
#runtime.unassign()