<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a tiny SLM on a corpus of Shakespeare
This notebook demonstrates a setup for experimenting with a tiny SLM (small language model).
The following tools are used:
* Colab (https://colab.research.google.com/) for notebook execution
* DagsHub (https://dagshub.com/) for project tracking
* MLflow (https://mlflow.org/) for experiment tracking
* Hydra (https://hydra.cc/) for configuration management
* GPT-NeoX (https://github.com/EleutherAI/gpt-neox) for model training
* TensorBoard (https://www.tensorflow.org/tensorboard) for experiment monitoring
* DVC (https://dvc.org/) for data management
* GitHub (https://github.com/) for code management
* Backblaze (https://backblaze.com/) for data storage

In [1]:
# Record when this run started so the saved outputs can be dated.
from datetime import datetime

print(f"Current Date and Time: {datetime.now()}")

Current Date and Time: 2024-11-27 08:35:52.653849


In [2]:
# Workspace layout shared by the later cells.
# We could modify these paths to "stub" behavior for test/dev
workspaceDir = "/content"
try:
    # Only the import result matters: it succeeds on Colab and fails elsewhere.
    from google import colab  # noqa: F401
except ImportError:
    # Not running on Colab: use /workspace as the GPT-NeoX-Colab checkout.
    gpt_neox_colabDir = "/workspace"
else:
    gpt_neox_colabDir = f"{workspaceDir}/GPT-NeoX-Colab"
GPTNeoXDirName = "gpt-neox"
GPTNeoXDir = f"{workspaceDir}/{GPTNeoXDirName}"

# Cloning Git Repos

In [3]:
%%time
#@title Clone GPT-NeoX-Colab
%cd {workspaceDir}
try:
    from google import colab
    # Don't use --depth 1 because that does not play nice with git-annex
    !git clone --depth 1 https://github.com/markNZed/GPT-NeoX-Colab.git
except:
    pass

/content
Cloning into 'GPT-NeoX-Colab'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 72 (delta 4), reused 37 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (72/72), 8.10 MiB | 14.78 MiB/s, done.
Resolving deltas: 100% (4/4), done.
CPU times: user 30.2 ms, sys: 5.6 ms, total: 35.8 ms
Wall time: 1.92 s


In [4]:
%%time
import sys
%cd {gpt_neox_colabDir}
%pip install -q python-dotenv
from dotenv import load_dotenv
import os
load_dotenv()

USE_MY_ENV = False
if USE_MY_ENV:
  # Disabling pydevd_plugins so we do not get a restart warning
  if "pydevd_plugins" in sys.modules:
    del sys.modules["pydevd_plugins"]
  %pip install -q dvc[s3]
  if not os.path.isfile("my_env.tar.gz"):
    !dvc --quiet pull my_env.tar.gz
    !tar -xf my_env.tar.gz
  activate_script = f"{gpt_neox_colabDir}/my_env/bin/activate"
else:
  try:
    from google import colab
    # Don't use --depth 1 because that does not play nice with git-annex
    !sudo apt-get update && sudo apt-get install -y python3.10-venv
    !pip install -q virtualenv
    !python3 -m venv my_env
    activate_script = f"{gpt_neox_colabDir}/my_env/bin/activate"
    !source {activate_script} && pip install -q -r requirements_colab.txt
    !source {activate_script} && pip install -q .
  except:
    activate_script = f"{gpt_neox_colabDir}/.venv/bin/activate"
    !pip install -q dvc[s3]


/content/GPT-NeoX-Colab
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:5 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,454 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,224 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://archive.ubuntu.com/ubuntu 

In [5]:
#@title Fetch training data
# Run from the GPT-NeoX-Colab checkout; the DVC path below is relative to it.
%cd {gpt_neox_colabDir}
# Activate the virtual environment, then pull the raw Shakespeare text with DVC.
!source {activate_script} && dvc --quiet pull "data/shakespeare/shakespeare.txt"

/content/GPT-NeoX-Colab
Collecting          |5.00 [00:00, 8.05entry/s]
Fetching
![A
  0% |          |0/? [00:00<?,    ?files/s][A
                                           [A
Querying remote cache:   0% 0/1 [00:00<?, ?files/s][A
Querying remote cache:   0% 0/1 [00:00<?, ?files/s{'info': ''}][A
                                                               [A
Fetching from s3:   0% 0/2 [00:00<?, ?file/s][A
Fetching from s3:   0% 0/2 [00:00<?, ?file/s{'info': ''}][A

  0% 0.00/1.06M [00:00<?, ?B/s][A[A

  0% 0.00/1.06M [00:00<?, ?B/s{'info': ''}][A[A

                                           [A[A
Fetching from s3: 100% 1/1 [00:00<00:00,  2.70file/s{'info': ''}][A

  0% 0.00/786 [00:00<?, ?B/s][A[A

  0% 0.00/786 [00:00<?, ?B/s{'info': ''}][A[A

                                         [A[A
Fetching
Building workspace index          |0.00 [00:00,    ?entry/s]
Comparing indexes          |4.00 [00:00,  169entry/s]
Applying changes          |1.00 [00:00,   137file/s]


In [6]:
#@title Fetch processed training data
%cd {gpt_neox_colabDir}
!source {activate_script} && dvc --quiet pull "data/shakespeare/shakespeare.jsonl"
!source {activate_script} && dvc --quiet pull "data/shakespeare/shakespeare_text_document.bin"
!source {activate_script} && dvc --quiet pull "data/shakespeare/shakespeare_text_document.idx"

/content/GPT-NeoX-Colab
Collecting          |3.00 [00:00,  170entry/s]
Fetching
![A
  0% |          |0/? [00:00<?,    ?files/s][A
                                           [A
Querying remote cache:   0% 0/1 [00:00<?, ?files/s][A
Querying remote cache:   0% 0/1 [00:00<?, ?files/s{'info': ''}][A
                                                               [A
![A
  0% Querying remote cache|          |0/0 [00:00<?,    ?files/s][A
                                                                [A
Fetching from s3:   0% 0/1 [00:00<?, ?file/s][A
Fetching from s3:   0% 0/1 [00:00<?, ?file/s{'info': ''}][A

  0% 0.00/1.43M [00:00<?, ?B/s][A[A

  0% 0.00/1.43M [00:00<?, ?B/s{'info': ''}][A[A

                                           [A[A
Fetching from s3: 100% 1/1 [00:00<00:00,  6.71file/s{'info': ''}][A
Fetching
Building workspace index          |2.00 [00:00, 4.57kentry/s]
Comparing indexes          |4.00 [00:00,  160entry/s]
Applying changes          |1.00 [00:00,   129f

In [7]:
%%time
#@title Clone GPT-NeoX
%cd {workspaceDir}
#!git clone ---depth 1 https://github.com/EleutherAI/gpt-neox
!git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git

/content
Cloning into 'gpt-neox'...
remote: Enumerating objects: 296, done.[K
remote: Counting objects: 100% (296/296), done.[K
remote: Compressing objects: 100% (231/231), done.[K
remote: Total 296 (delta 74), reused 138 (delta 43), pack-reused 0 (from 0)[K
Receiving objects: 100% (296/296), 2.50 MiB | 21.15 MiB/s, done.
Resolving deltas: 100% (74/74), done.
CPU times: user 17.3 ms, sys: 2.23 ms, total: 19.5 ms
Wall time: 812 ms


# Python Environment
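Downloading a prebuilt Python virtual environment and unpacking it (`USE_MY_ENV = True` in the setup cell above) is faster than installing all the dependencies. When `USE_MY_ENV` is `False`, the next cell installs the pinned dependencies with pip instead.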

In [8]:
if not USE_MY_ENV:
    # Could not redirect to /dev/null in the standard Colab notebook (maybe no output for a particular time?)
    # Currently deepspeed from GTP-NeoX is not compatible with logging in torch >= 2.4
    !bash -c "source {activate_script}" && pip install -q torch==2.3 torchaudio==2.3.0 torchvision==0.18.0 transformers==4.38.0 sentence-transformers==2.2.2
    !bash -c "source {activate_script}" && pip install fsspec==2024.10.0 datasets==2.14.0 evaluate==0.4.3 lm-eval==0.4.1
    %cd {GPTNeoXDir}
    !bash -c "source {activate_script}" && pip install -q -r ./requirements/requirements.txt
    !bash -c "source {activate_script}" && pip install -q -r ./requirements/requirements-tensorboard.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m105.0 MB/s[0m eta [36

# Preparing Custom Dataset

In [9]:
#@title Converting text data to jsonl format
import os

%cd {GPTNeoXDir}
!mkdir -p data

# Check if the converted file exists
if not os.path.isfile(f"{gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl"):
    gpt_neox_colab.utils.ml.text2jsonl(f"{gpt_neox_colabDir}/data/shakespeare/shakespeare.txt", f"{gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl")

!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl {GPTNeoXDir}/data/shakespeare.jsonl

/content/gpt-neox


# Tokenizing Dataset

In [10]:
%%time
#@title Tokenizing jsonl formatted data
import os

%cd {GPTNeoXDir}
!mkdir -p processed_data

# Check if the tokenized files exists
a = f"{gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.idx"
b = f"{gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.bin"
if not os.path.isfile(a) or not os.path.isfile(b):
    !bash -c "source {activate_script}" && python tools/datasets/preprocess_data.py \
        --input ./data/shakespeare.jsonl \
        --output-prefix ./processed_data \
        --tokenizer-type CharLevelTokenizer \
        --dataset-impl mmap \
        --append-eod
    !cp {GPTNeoXDir}/processed_data/shakespeare_text_document.bin {gpt_neox_colabDir}/data/shakespeare
    !cp {GPTNeoXDir}/processed_data/shakespeare_text_document.idx {gpt_neox_colabDir}/data/shakespeare

!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.bin {GPTNeoXDir}/processed_data
!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.idx {GPTNeoXDir}/processed_data

/content/gpt-neox
CPU times: user 10 ms, sys: 3.81 ms, total: 13.9 ms
Wall time: 313 ms


# Training

In [11]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic used in a later cell.
%load_ext tensorboard

In [12]:
import subprocess

# Shell command for the training run: activate the virtual environment,
# change into the GPT-NeoX checkout and launch deepy.py with the Shakespeare configs.
cmd = f"""nohup bash -c "source {activate_script} && \
cd {GPTNeoXDir} && \
python ./deepy.py train.py --conf_dir {gpt_neox_colabDir}/configs shakespeare shakespeare_deepy" """
print("Running command:", cmd)
#cmd = "nohup bash -c ls" # Used to test without running on GPU

# Start the process in the background and keep the handle so later cells can poll it.
process = subprocess.Popen(
    cmd,
    shell=True,
    executable='/bin/bash',
    # Starts the process in a new session (setsid) without running Python code
    # in the forked child, which preexec_fn would do.
    start_new_session=True,
)

pid = process.pid
print(f"Started training with PID: {pid}")

Running command: nohup bash -c "source /content/GPT-NeoX-Colab/my_env/bin/activate && cd /content/gpt-neox && python ./deepy.py train.py --conf_dir /content/GPT-NeoX-Colab/configs shakespeare shakespeare_deepy" 
Started training with PID: 5644


In [None]:
#@title Wait until tensorboard log directory is created
import time
import os

# Path to the TensorBoard log directory
tensorboard_log_dir = f"{GPTNeoXDir}/tensorboard"

# Wait for the directory to be created, but stop waiting if the training
# process has already exited: in that case the directory will never appear.
while not os.path.exists(tensorboard_log_dir):
    exit_code = process.poll()
    if exit_code is not None:
        raise RuntimeError(
            f"Training process exited with code {exit_code} before "
            f"{tensorboard_log_dir} was created; check {GPTNeoXDir}/logs."
        )
    print("Waiting for TensorBoard log directory to be created...")
    time.sleep(10)  # Check every X seconds

print("TensorBoard log directory found. You can now launch TensorBoard.")

Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting fo

In [None]:
# Need to delete everything in checkpoints and tensorboard dir for a fresh run
# The --logdir below is a relative path, so change into the GPT-NeoX checkout first.
%cd {GPTNeoXDir}
# Show the TensorBoard UI inline, reading event files from ./tensorboard.
%tensorboard --logdir tensorboard

In [None]:
#@title Find the latest log file
import glob
import os

# Stdout logs of the training run: <GPT-NeoX checkout>/logs/*_stdout.txt
log_dir = f"{GPTNeoXDir}/logs"
log_pattern = os.path.join(log_dir, "*_stdout.txt")
log_files = glob.glob(log_pattern)

# Most recently modified match; None means no log file exists.
latest_log = max(log_files, key=os.path.getmtime) if log_files else None

if latest_log is None:
    print("No log files found.")
else:
    print("Latest log file:", latest_log)


In [None]:
#@title Read the latest log file and extract the iteration count
import glob
import os
import re
import time

# Directory in which the stdout logs (*_stdout.txt) are looked up.
stdout_log_dir = f"{GPTNeoXDir}/logs"
# Position (as returned by file.tell()) up to which the log has been read.
# Kept in this variable only, so it is reset whenever the cell is re-run.
file_position = 0
# Regular expression to match "iteration <number> / <total>"
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")


def find_latest_stdout_log():
    """Return the most recently modified ``*_stdout.txt`` file, or None if there is none."""
    candidates = glob.glob(os.path.join(stdout_log_dir, "*_stdout.txt"))
    return max(candidates, key=os.path.getmtime) if candidates else None


def read_new_iterations():
    """Print the latest iteration count written to ``latest_log`` since the last call.

    Reads from the global ``file_position`` to the end of the file and advances
    ``file_position``. Nothing is printed when the new lines contain no
    "iteration <n> / <total>" entry.

    Returns:
        The latest iteration count as an int, or None if no new entry was found.
    """
    global file_position
    with open(latest_log, "r") as file:
        file.seek(file_position)
        new_lines = file.readlines()
        file_position = file.tell()

    # Only the last match matters; earlier ones are superseded.
    iteration_count = None
    for line in new_lines:
        match = iteration_pattern.search(line)
        if match:
            iteration_count = int(match.group(1))
    if iteration_count is not None:
        print(f"{iteration_count} iterations")
    return iteration_count


def report_progress():
    """Locate the log file if it is not known yet, then report new iterations."""
    global latest_log
    if latest_log is None:
        latest_log = find_latest_stdout_log()
    if latest_log is not None:
        read_new_iterations()


# Poll the training process until it terminates.
while process.poll() is None:
    report_progress()
    print("Training is still running...")
    time.sleep(30)  # Check every X seconds

print("Training has completed.")
# Report anything written between the last poll and the end of the run.
report_progress()

print("Training has finished.")


In [None]:
#@title Display training and validation Loss
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import os
import numpy as np

# TensorBoard event directory of the training run. An absolute path is used so
# the cell does not depend on the current working directory.
log_dir = f"{GPTNeoXDir}/tensorboard"
log_files = [os.path.join(log_dir, d) for d in os.listdir(log_dir)]
# Most recently modified entry = latest run.
latest_log_dir = max(log_files, key=os.path.getmtime)

# Initialize EventAccumulator to load scalar data
ea = event_accumulator.EventAccumulator(latest_log_dir)
ea.Reload()  # Load all logs

# List all scalar keys available in the logs
scalar_keys = ea.Tags()['scalars']
print("Available scalar keys:", scalar_keys)

# Legend label -> scalar tag (adjust the tags for the actual names if necessary)
loss_tags = {
    'Training Loss': 'train/lm_loss',
    'Validation Loss': 'validation/lm_loss',
}

# Plot every loss against the step recorded with each scalar event, so the x
# axis shows real training iterations and the two series do not need to be
# resampled to a common length.
fig, ax = plt.subplots(figsize=(10, 5))
for label, tag in loss_tags.items():
    if tag not in scalar_keys:
        # e.g. no validation pass has been logged yet
        print(f"Scalar '{tag}' not found in the logs; skipping {label}.")
        continue
    events = ea.Scalars(tag)
    ax.plot([e.step for e in events], [e.value for e in events], label=label)
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

# Inference with GPT-NeoX

In [None]:
%%time
%cd {GPTNeoXDir}
# This has issues if used during training -  The server socket has failed to bind to [::]:29500 (errno: 98 - Address already
# This will write over the logs
!bash -c "source {activate_script}" && python ./deepy.py generate.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen
!cat sample_output.txt

In [None]:
# 2.21.0 was the last 2 series but it asks for trust_remote_code
!bash -c "source {activate_script}" &&  pip install datasets==2.14.6

In [None]:
%%time
# This has issues if used during training -  The server socket has failed to bind to [::]:29500 (errno: 98 - Address already
# This will write over the logs
# python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn
# NOTE this will prompt for permission to run a download script - would need an older datasetse library to avoid this
%cd {GPTNeoXDir}
!bash -c "source {activate_script}" && python ./deepy.py eval.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen --eval_tasks hellaswag
!cat sample_output.txt

# Inference with Hugging Face

## Convert model to HF format
Here we convert the latest GPT-NeoX checkpoint to the Hugging Face format.

In [None]:
import os

# Checkpoints written by the training run.
checkpoints_dir = f"{GPTNeoXDir}/checkpoints"

# The "latest" file in that directory holds the name of the newest checkpoint.
latest_marker = os.path.join(checkpoints_dir, "latest")
with open(latest_marker) as marker_file:
    latest_checkpoint_name = marker_file.read().strip()

# Full path of that checkpoint, used by the conversion cell.
latest_checkpoint_path = os.path.join(checkpoints_dir, latest_checkpoint_name)
print("Path to the latest checkpoint:", latest_checkpoint_path)


In [None]:
#@title Convert last checkpoint to huggingface model
%cd {GPTNeoXDir}
!bash -c "source {activate_script}" && python ./tools/ckpts/convert_neox_to_hf.py --input_dir {latest_checkpoint_path} --config_file {gpt_neox_colabDir}/configs/shakespeare.yml --output_dir {gpt_neox_colabDir}/data/shakespeare --architecture neox

## Generate Text

In [None]:
import sys
sys.path.insert(0, f"{gpt_neox_colabDir}/my_env/lib/python3.10/site-packages")

from transformers import GPTNeoXForCausalLM
import torch

# Move to model directory
%cd {gpt_neox_colabDir}

# Assuming CharLevelTokenizer is properly imported and instantiated
from gpt_neox_colab import CharLevelTokenizer
tokenizer = CharLevelTokenizer.CharLevelTokenizer(vocab_size=512)

# Load your model
model_path = f"{gpt_neox_colabDir}/data/shakespeare"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Define a simple char-level tokenizer if not provided
def char_level_tokenize(text):
    return tokenizer.tokenize(text)

def char_level_detokenize(tokens):
    return tokenizer.detokenize(tokens)

# Set the model to evaluation mode
model.eval()

# Prompt the user for input
#input_text = input("Enter your prompt: ")
input_text = "Thou art"

# Tokenize and prepare input
input_ids = torch.tensor([char_level_tokenize(input_text)], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)  # Create an attention mask for non-padded input

# Generate text with specified pad_token_id and attention_mask
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,          # Adjust this for desired output length
        temperature=0.7,        # Controls creativity
        top_k=50,               # Controls diversity
        top_p=0.9,              # Nucleus sampling
        num_return_sequences=1, # Number of sequences to return
        pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
        do_sample=True           # Enable sampling mode to use temperature and top_p
    )

# Decode and print the generated text
generated_text = char_level_detokenize(output[0].tolist())
print("Generated text:", generated_text)

In [None]:
import time

# Play a notification sound every 30 seconds until the cell is interrupted.
try:
    from google.colab import output
except ImportError:
    # Not on Colab.
    # Too hard to get sound working in Docker with remote VSCode
    pass
else:
    try:
        while True:
            output.eval_js("new Audio(\"https://upload.wikimedia.org/wikipedia/commons/e/e6/Coins_dropped_in_metallic_moneybox_0.ogg\").play()")
            time.sleep(30)
    except KeyboardInterrupt:
        # Interrupting the cell is how the notification is stopped.
        pass
    except Exception as err:
        # The sound is best-effort only; report the problem instead of hiding it.
        print(f"Could not play the notification sound: {err}")

In [None]:

import IPython

# Audio file played when the notebook reaches this cell.
# Autoplay does not work in VSCode
beep_path = "/workspace/notebooks/beep-01a.mp3"
IPython.display.Audio(filename=beep_path, autoplay=True)

In [None]:
import IPython
import numpy as np

fs = 16000.  # sample rate, passed as `rate` below
# 5 * fs samples of a 440 Hz sine wave, i.e. five seconds at this rate.
tone = np.sin(2*np.pi*440*np.arange(5 * fs)/fs)
# Autoplay does not work in VSCode
IPython.display.Audio(tone, rate=fs, autoplay=True)

In [None]:
# Here we could disconnect from the Colab GPU resource but we will lose all results
#from google.colab import runtime
#runtime.unassign()
