<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a tiny SLM on a corpus of Shakespeare
This notebook demonstrates a setup for experimenting with a tiny SLM (small language model).
The following tools are used:
* Colab (https://colab.research.google.com/) for notebook execution
* DagsHub (https://dagshub.com/) for project tracking
* MLflow (https://mlflow.org/) for experiment tracking
* Hydra (https://hydra.cc/) for configuration management
* GPTNeoX (https://github.com/EleutherAI/gpt-neox) for model training
* TensorBoard (https://www.tensorflow.org/tensorboard) for experiment monitoring
* DVC (https://dvc.org/) for data management
* GitHub (https://github.com/) for code management
* Backblaze (https://backblaze.com/) for data storage

In [None]:
# Print the current date and time when this cell is executed.
from datetime import datetime
print("Current Date and Time:", datetime.now())

In [None]:
try:
  from google import colab
  isColab = True
except:
  isColab = False

try:
  from google.colab import userdata
  if userdata.get('GITHUB_NAME'):
    !git config --global user.name "{userdata.get('GITHUB_NAME')}"
  if userdata.get('GITHUB_EMAIL'):
    !git config --global user.email "{userdata.get('GITHUB_EMAIL')}"
  if userdata.get('AWS_SECRET_ACCESS_KEY'):
    !echo "export AWS_SECRET_ACCESS_KEY={userdata.get('AWS_SECRET_ACCESS_KEY')}" >> ~/.bashrc
    os.environ['AWS_SECRET_ACCESS_KEY'] = userdata.get('AWS_SECRET_ACCESS_KEY')
  if userdata.get('AWS_ACCESS_KEY_ID'):
    !echo "export AWS_ACCESS_KEY_ID={userdata.get('AWS_ACCESS_KEY_ID')}" >> ~/.bashrc
    os.environ['AWS_ACCESS_KEY_ID'] = userdata.get('AWS_ACCESS_KEY_ID')
except:
   pass

print("isColab:", isColab)

In [None]:
# We could modify these paths to "stub" behavior for test/dev
workspaceDir = "/content"
gpt_neox_colabDir = f"{workspaceDir}/GPT-NeoX-Colab"
if not isColab:
  !sudo ln -s /workspace /content/GPT-NeoX-Colab
GPTNeoXDirName = "gpt-neox"
GPTNeoXDir = f"{workspaceDir}/{GPTNeoXDirName}"

# Cloning Git Repos

In [None]:
%%time
#@title Clone GPT-NeoX-Colab
%cd {workspaceDir}
if isColab:
  # Don't use --depth 1 because that does not play nice with git-annex
  if userdata.get('GITHUB_PAT'):
    !git clone --depth 1 https://{userdata.get('GITHUB_PAT')}@github.com/markNZed/GPT-NeoX-Colab.git
    print("Cloned with PAT")
  else:
    !git clone --depth 1 https://github.com/markNZed/GPT-NeoX-Colab.git
    print("Cloned without PAT")

In [None]:
%%time
import sys
%cd {gpt_neox_colabDir}
print("Install DVC")
%pip install -q python-dotenv dvc[s3]==3.2.0 s3fs==2024.2.0 fsspec==2024.2.0 gcsfs==2024.2.0
from dotenv import load_dotenv
import os
load_dotenv()
activate_script = f"{gpt_neox_colabDir}/.venv/bin/activate"
USE_VENV = False
if USE_VENV:
  # Disabling pydevd_plugins so we do not get a restart warning
  #if "pydevd_plugins" in sys.modules:
  #  del sys.modules["pydevd_plugins"]
  venv_dir = ".venv"
  venv_tar_file = f"{venv_dir}.tar"
  venv_gz_file = f"{venv_tar_file}.gz"
  if not os.path.isdir(venv_dir):
    if not os.path.isfile(venv_gz_file):
      print(f"Downloading {venv_gz_file}")
      !dvc pull -q {venv_gz_file}
    if not os.path.isfile(venv_tar_file):
      print(f"Unzipping {venv_gz_file}")
      !sudo apt-get install -y pigz
      !pigz -d -p 8 {venv_gz_file}
    if not os.path.isfile(venv_dir):
      print(f"Untarring {venv_tar_file}")
      !tar -xf {venv_tar_file}
      !rm {venv_tar_file}
elif isColab:
    !uv sync -q --dev
    !uv run pip install -q -e .
    !source {activate_script} && pip install -q -r requirements_colab.txt
    !source {activate_script} && pip install -q .

In [29]:
#@title Fetch training data
# Pull the data/shakespeare path with DVC, from inside the GPT-NeoX-Colab checkout.
%cd {gpt_neox_colabDir}
!dvc --quiet pull -q "data/shakespeare"

/content/GPT-NeoX-Colab
[0m

In [30]:
%%time
#@title Clone GPT-NeoX
%cd {workspaceDir}
#!git clone ---depth 1 https://github.com/EleutherAI/gpt-neox
!git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git

/content
fatal: destination path 'gpt-neox' already exists and is not an empty directory.
CPU times: user 7.02 ms, sys: 0 ns, total: 7.02 ms
Wall time: 106 ms


# Python Environment
It is faster to download a Python virtual environment and unzip it than to install all the dependencies.

In [31]:
# When USE_VENV is False, install pinned packages into the environment activated by
# `activate_script`, followed by the GPT-NeoX requirements file.
if not USE_VENV:
    # Could not redirect to /dev/null in the standard Colab notebook (maybe no output for a particular time?)
    # Currently deepspeed from GPT-NeoX is not compatible with logging in torch >= 2.4
    !source {activate_script} && pip install -q torch==2.3 torchaudio==2.3.0 torchvision==0.18.0 transformers==4.38.0 sentence-transformers==2.2.2
    !source {activate_script} && pip install -q fsspec==2024.2.0 datasets==2.14.0 evaluate==0.4.3 lm-eval==0.4.1 tensorboard==2.17.1 tensorflow==2.17.1
    %cd {GPTNeoXDir}
    !source {activate_script} && pip install -q -r ./requirements/requirements.txt

/content/gpt-neox
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for mpi4py (pyproject.toml) ... [?25l[?25hdone


# Preparing Custom Dataset

In [32]:
#@title Converting text data to jsonl format
import os

%cd {GPTNeoXDir}
!mkdir -p data

# Check if the converted file exists
# The conversion only runs when shakespeare.jsonl is missing from the GPT-NeoX-Colab data directory.
if not os.path.isfile(f"{gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl"):
    !source {activate_script} && python -c "import gpt_neox_colab.utils; gpt_neox_colab.utils.ml.text2jsonl(\"{gpt_neox_colabDir}/data/shakespeare/shakespeare.txt\", \"{gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl\")"

# Copy the jsonl file into the GPT-NeoX data directory, where the tokenizing step reads it.
!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl {GPTNeoXDir}/data/shakespeare.jsonl

/content/gpt-neox


# Tokenizing Dataset

In [33]:
%%time
#@title Tokenizing jsonl formatted data
import os

%cd {GPTNeoXDir}
!mkdir -p processed_data
%cd processed_data

# Check if the tokenized files exists
a = f"{gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.idx"
b = f"{gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.bin"
if not os.path.isfile(a) or not os.path.isfile(b):
    cmd = f"""
    source {activate_script} && python {GPTNeoXDir}/tools/datasets/preprocess_data.py \
        --input {GPTNeoXDir}/data/shakespeare.jsonl \
        --output-prefix shakespeare \
        --tokenizer-type CharLevelTokenizer \
        --dataset-impl mmap \
        --append-eod
    """
    print(f"Command: {cmd}")
    !source {activate_script} && python {GPTNeoXDir}/tools/datasets/preprocess_data.py \
        --input {GPTNeoXDir}/data/shakespeare.jsonl \
        --output-prefix shakespeare \
        --tokenizer-type CharLevelTokenizer \
        --dataset-impl mmap \
        --append-eod
    !cp {GPTNeoXDir}/processed_data/shakespeare_text_document.bin {gpt_neox_colabDir}/data/shakespeare
    !cp {GPTNeoXDir}/processed_data/shakespeare_text_document.idx {gpt_neox_colabDir}/data/shakespeare

!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.bin {GPTNeoXDir}/processed_data
!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare_text_document.idx {GPTNeoXDir}/processed_data

/content/gpt-neox
/content/gpt-neox/processed_data
CPU times: user 15.2 ms, sys: 1.2 ms, total: 16.4 ms
Wall time: 312 ms


In [34]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [35]:
import os
import subprocess

# Start a detached background process using the temp config
# If we want to run from scratch again: rm -rf /content/gpt-neox/logs/* /content/gpt-neox/tensorboard/* /content/gpt-neox/checkpoints/*
# A single nohup is sufficient (the command previously repeated it).
cmd = f"""
nohup bash -c " source {activate_script} && \
cd {GPTNeoXDir} && \
python ./deepy.py train.py --conf_dir {gpt_neox_colabDir}/configs shakespeare shakespeare_deepy "
"""
print("Running command:", cmd)
#cmd = "nohup bash -c ls" # Used to test without running on GPU

# start_new_session=True makes the child call setsid() itself; it replaces
# preexec_fn=os.setsid, which is not safe in a multi-threaded process such as a
# notebook kernel. Output is discarded via subprocess.DEVNULL.
process = subprocess.Popen(
    cmd,
    shell=True,
    executable='/bin/bash',
    start_new_session=True,      # Starts the process in a new session
    stdout=subprocess.DEVNULL,   # Suppress stdout
    stderr=subprocess.DEVNULL,   # Suppress stderr
)

# `process` is polled by later cells to detect the end of training.
pid = process.pid
print(f"Started training with PID: {pid}")

Running command: 
nohup nohup bash -c " source /content/GPT-NeoX-Colab/.venv/bin/activate && cd /content/gpt-neox && python ./deepy.py train.py --conf_dir /content/GPT-NeoX-Colab/configs shakespeare shakespeare_deepy "

Started training with PID: 4824


In [36]:
#@title Wait until tensorboard log directory is created
import time
import os

# Path to the TensorBoard log directory
tensorboard_log_dir = f"{GPTNeoXDir}/tensorboard"

# Handle of the training process, if this kernel launched it (None otherwise,
# e.g. after a kernel restart while training keeps running detached).
training_process = globals().get("process")

# Wait for the directory to be created, but stop waiting when the training
# process has already exited - otherwise this loop would never terminate.
while not os.path.exists(tensorboard_log_dir):
    if training_process is not None and training_process.poll() is not None:
        raise RuntimeError(
            f"Training process exited with code {training_process.returncode} before "
            f"{tensorboard_log_dir} was created; check {GPTNeoXDir}/logs for details."
        )
    print("Waiting for TensorBoard log directory to be created...")
    time.sleep(10)  # Check every X seconds

print("TensorBoard log directory found. You can now launch TensorBoard.")

Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting for TensorBoard log directory to be created...
Waiting fo

KeyboardInterrupt: 

In [None]:
# Need to delete everything in checkpoints and tensorboard dir for a fresh run
# Launch TensorBoard on the `tensorboard` directory, relative to the GPT-NeoX checkout.
%cd {GPTNeoXDir}
%tensorboard --logdir tensorboard

In [None]:
#@title Find the latest log file
import glob
import os

# Directory holding the training logs and the glob pattern of the stdout logs
experimentDir = f"{GPTNeoXDir}/logs"
log_pattern = os.path.join(experimentDir, "*_stdout.txt")

# All stdout logs currently present
log_files = glob.glob(log_pattern)

# The most recently modified log wins; None when there is no log at all
latest_log = max(log_files, key=os.path.getmtime) if log_files else None

if latest_log is None:
    print("No log files found.")
else:
    print("Latest log file:", latest_log)


# Training

In [None]:
#@title Read the latest log file and extract the iteration count
import glob
import os
import re
import time

# Offset of the next unread position in the log file. It lives in memory only
# and is reset to 0 whenever this cell is re-run.
file_position = 0
# Regular expression to match "iteration <number> / <total>"
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")

def get_latest_file(directory, pattern):
    """Return the most recently modified file in `directory` matching `pattern`.

    Returns None when no file matches.
    """
    candidates = glob.glob(os.path.join(directory, pattern))
    return max(candidates, key=os.path.getmtime) if candidates else None

def read_new_iterations():
    """Print the newest iteration count appended to `latest_log` since the last call.

    Reads from the saved offset to the end of the file, remembers the new
    offset, and prints nothing when no new iteration line was found.
    """
    global file_position
    # Open the log file and seek to the last position
    with open(latest_log, "r") as file:
        file.seek(file_position)
        # Read new lines
        new_lines = file.readlines()
        file_position = file.tell()
    # Only the last matching line is of interest
    last_match = None
    for line in new_lines:
        match = iteration_pattern.search(line)
        if match:
            last_match = match
    if last_match:
        # Extract the iteration count from the regex match
        iteration_count = int(last_match.group(1))
        print(f"{iteration_count} iterations")

# Periodically check if the process has completed
while True:
    # Poll the process to see if it has terminated
    if process.poll() is not None:
        # Process has completed
        print("Training has completed.")
        break
    if latest_log:
        read_new_iterations()
    elif os.path.isdir(experimentDir):
        # No log was available when the previous cell ran: look again.
        # experimentDir already is the logs directory, so search it directly.
        latest_log = get_latest_file(experimentDir, "*_stdout.txt")
    print("Training is still running...")
    time.sleep(30)  # Check every X seconds

print("Training has finished.")


In [None]:
#@title Display training and validation Loss
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import os
import numpy as np

# Absolute path to the TensorBoard runs, so the cell does not depend on the
# current working directory left behind by an earlier %cd.
log_dir = os.path.join(GPTNeoXDir, "tensorboard")
log_files = [os.path.join(log_dir, d) for d in os.listdir(log_dir)]
# Most recently modified run directory
latest_log_dir = max(log_files, key=os.path.getmtime)

# Initialize EventAccumulator to load scalar data
ea = event_accumulator.EventAccumulator(latest_log_dir)
ea.Reload()  # Load all logs

# List all scalar keys available in the logs
scalar_keys = ea.Tags()['scalars']
print("Available scalar keys:", scalar_keys)

def _steps_and_values(tag):
    """Return (steps, values) logged under scalar `tag`; two empty lists when the tag is absent."""
    events = ea.Scalars(tag) if tag in scalar_keys else []
    return [event.step for event in events], [event.value for event in events]

# Extract training and validation losses together with the step they were logged at
train_steps, train_loss_values = _steps_and_values('train/lm_loss')  # Adjust for actual name if necessary
val_steps, val_loss_values = _steps_and_values('validation/lm_loss')  # Adjust for actual name if necessary

# Each series is plotted against its own logged steps, so the x axis shows real
# iteration numbers and no interpolation between the two series is needed.
fig, ax = plt.subplots(figsize=(10, 5))
if train_loss_values:
    ax.plot(train_steps, train_loss_values, label='Training Loss')
else:
    print("No 'train/lm_loss' values logged yet.")
if val_loss_values:
    ax.plot(val_steps, val_loss_values, label='Validation Loss')
else:
    print("No 'validation/lm_loss' values logged yet.")
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss')
if train_loss_values or val_loss_values:
    ax.legend()
ax.set_title('Training and Validation Loss')
plt.show()

# Inference with GPT-NeoX

In [None]:
%%time
# Run generate.py through deepy.py with the shakespeare and shakespeare_gen configs, then print sample_output.txt.
%cd {GPTNeoXDir}
# This has issues if used during training - The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)
# This will write over the logs
!source {activate_script} && python ./deepy.py generate.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen
!cat sample_output.txt

In [None]:
# datasets 2.21.0 was the last release of the 2.x series, but it asks for trust_remote_code,
# so an older version is pinned here.
!source {activate_script} &&  pip install datasets==2.14.6

In [None]:
%%time
# Run eval.py through deepy.py on the hellaswag task.
# This has issues if used during training - The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)
# This will write over the logs
# python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn
# NOTE this will prompt for permission to run a download script - would need an older datasets library to avoid this
%cd {GPTNeoXDir}
!source {activate_script} && python ./deepy.py eval.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen --eval_tasks hellaswag
# NOTE(review): sample_output.txt is the file printed after generate.py in an earlier cell - confirm eval.py also writes it
!cat sample_output.txt

# Inference with Hugging Face

## Convert model to HF format
Here we convert the latest GPT-NeoX checkpoint to the Hugging Face format.

In [None]:
import os

# Directory in which GPT-NeoX stores its checkpoints
checkpoints_dir = f"{GPTNeoXDir}/checkpoints"

# The file named 'latest' inside that directory contains the name of the newest checkpoint
latest_marker = os.path.join(checkpoints_dir, "latest")
with open(latest_marker) as marker:
    latest_checkpoint_name = marker.read().strip()

# Full path of the newest checkpoint directory
latest_checkpoint_path = os.path.join(checkpoints_dir, latest_checkpoint_name)
print("Path to the latest checkpoint:", latest_checkpoint_path)


In [None]:
#@title Convert last checkpoint to huggingface model
# Runs convert_neox_to_hf.py on `latest_checkpoint_path` (set in the previous cell);
# the converted model is written to the GPT-NeoX-Colab data/shakespeare directory.
%cd {GPTNeoXDir}
!source {activate_script} && python ./tools/ckpts/convert_neox_to_hf.py --input_dir {latest_checkpoint_path} --config_file {gpt_neox_colabDir}/configs/shakespeare.yml --output_dir {gpt_neox_colabDir}/data/shakespeare --architecture neox

## Generate Text

In [None]:
import sys
sys.path.insert(0, f"{gpt_neox_colabDir}/my_env/lib/python3.10/site-packages")

from transformers import GPTNeoXForCausalLM
import torch

# Move to model directory
%cd {gpt_neox_colabDir}

# Assuming CharLevelTokenizer is properly imported and instantiated
from src.gpt_neox_colab.CharLevelTokenizer import CharLevelTokenizer
tokenizer = CharLevelTokenizer(vocab_size=512)

# Load your model
model_path = f"{gpt_neox_colabDir}/data/shakespeare"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Define a simple char-level tokenizer if not provided
def char_level_tokenize(text):
    return tokenizer.tokenize(text)

def char_level_detokenize(tokens):
    return tokenizer.detokenize(tokens)

# Set the model to evaluation mode
model.eval()

# Prompt the user for input
#input_text = input("Enter your prompt: ")
input_text = "Thou art"

# Tokenize and prepare input
input_ids = torch.tensor([char_level_tokenize(input_text)], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)  # Create an attention mask for non-padded input

# Generate text with specified pad_token_id and attention_mask
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,          # Adjust this for desired output length
        temperature=0.7,        # Controls creativity
        top_k=50,               # Controls diversity
        top_p=0.9,              # Nucleus sampling
        num_return_sequences=1, # Number of sequences to return
        pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
        do_sample=True           # Enable sampling mode to use temperature and top_p
    )

# Decode and print the generated text
generated_text = char_level_detokenize(output[0].tolist())
print("Generated text:", generated_text)

In [None]:
from IPython.display import display, Javascript
import time

# Number of times the alert is played. A bounded default lets "Run all" reach the
# cells below; set to None to repeat until the cell is interrupted.
ALERT_REPEATS = 10
# Seconds to wait between two alerts
ALERT_INTERVAL_SECONDS = 30

# Play the audio repeatedly
alerts_played = 0
while ALERT_REPEATS is None or alerts_played < ALERT_REPEATS:
    display(Javascript('''
        new Audio("https://upload.wikimedia.org/wikipedia/commons/e/e6/Coins_dropped_in_metallic_moneybox_0.ogg").play()
    '''))
    alerts_played += 1
    time.sleep(ALERT_INTERVAL_SECONDS)  # Wait before replaying

In [None]:
import IPython
# Display an audio player for the beep file in the notebooks directory, with autoplay requested.
# Autoplay does not work in VSCode
IPython.display.Audio(filename=f"{gpt_neox_colabDir}/notebooks/beep-01a.mp3", autoplay=True)

In [None]:
import IPython
import numpy as np
# Sample rate in Hz; also used as the number of samples, i.e. one second of audio
fs = 16000.
# Display a generated 440 Hz sine tone, with autoplay requested.
# Autoplay does not work in VSCode
IPython.display.Audio(np.sin(2*np.pi*440*np.arange(1 * fs)/fs), rate=fs, autoplay=True)

In [None]:
# Here we could disconnect from the Colab GPU resource but we will lose all results
#from google.colab import runtime
#runtime.unassign()

In [None]:
# Ask for the prompt interactively
input_text = input("Enter your prompt: ")

# Encode the prompt as a batch containing one unpadded sequence
input_ids = torch.tensor([char_level_tokenize(input_text)], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)  # every position is a real token

# Sampling configuration handed to model.generate
generation_kwargs = dict(
    attention_mask=attention_mask,
    max_length=200,                          # desired output length
    temperature=0.7,                         # controls creativity
    top_k=50,                                # controls diversity
    top_p=0.9,                               # nucleus sampling
    num_return_sequences=1,                  # number of sequences to return
    pad_token_id=model.config.eos_token_id,  # set pad_token_id explicitly
    do_sample=True,                          # sampling mode, so temperature and top_p are used
)

# Generate without tracking gradients
with torch.no_grad():
    output = model.generate(input_ids, **generation_kwargs)

# Decode the single returned sequence and print it
generated_text = char_level_detokenize(output[0].tolist())
print("Generated text:", generated_text)

In [44]:
%%time
def push_venv_to_dvc():
  if not USE_VENV:
    %cd {gpt_neox_colabDir}
    venv_dir = ".venv"
    venv_tar_file = f"{venv_dir}.tar"
    venv_gz_file = f"{venv_tar_file}.gz"
    if os.path.isdir(venv_dir):
      if not os.path.isfile(venv_tar_file):
        print(f"Tarring {venv_tar_file}")
        !tar -cf .venv.tar .venv
      if not os.path.isfile(venv_gz_file):
        print(f"Zipping {venv_tar_file}")
        !sudo apt-get install -y pigz
        !pigz -p 8 {venv_tar_file}
      print(f"Uploading {venv_gz_file} to DVC")
      !dvc add {venv_gz_file}
      !git add .venv.tar.gz.dvc
      !git commit -m "Add .venv.tar.gz.dvc to DVC"
      !dvc push -q
      !git push
push_venv_to_dvc()

/content/GPT-NeoX-Colab
Uploading .venv.tar.gz to DVC
[?25l[32m⠋[0m Checking graph
Computing md5 for a large file '/content/GPT-NeoX-Colab/.venv.tar.gz'. This is only done once.
Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |/content/GPT-NeoX-Colab/.venv.tar.g0.00/? [00:00<?,        ?B/s][A
  2% 58.0M/3.54G [00:00<00:06, 599MB/s{'info': ''}]                             [A
  3% 116M/3.54G [00:00<00:06, 604MB/s{'info': ''}] [A
  5% 174M/3.54G [00:00<00:05, 605MB/s{'info': ''}][A
  6% 232M/3.54G [00:00<00:05, 604MB/s{'info': ''}][A
  8% 290M/3.54G [00:00<00:05, 603MB/s{'info': ''}][A
 10% 348M/3.54G [00:00<00:05, 603MB/s{'info': ''}][A
 11% 406M/3.54G [00:00<00:05, 604MB/s{'info': ''}][A
 13% 464M/3.54G [00:00<00:05, 604MB/s{'info': ''}][A
 14% 522M/3.54G [00:00<00:05, 605MB/s{'info': ''}][A
 16% 580M/3.54G [00:01<00:05, 605MB/s{'info': ''}][A
 18% 638M/3.54G [00:01<00:05, 604MB/s{'info': ''}][A
 19% 696M/3.54G [00:01<00:05, 605MB/s{'info': ''}][A
