# Training a tiny LLM on a corpus of Shakespeare 

<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/venv/notebooks/shakespeare_trainingV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datetime import datetime
# Record when this run started so the notebook outputs can be dated.
started_at = datetime.now()
print("Current Date and Time:", started_at)

In [None]:
# Locations used throughout the notebook; point them elsewhere to "stub" behavior for test/dev.
workspaceDir = "/content"
GPTNeoXDirName = "gpt-neox"
GPTNeoXColabDirName = "GPT-NeoX-Colab"
# Both checkouts live directly under the workspace directory.
GPTNeoXDir = "/".join((workspaceDir, GPTNeoXDirName))
GPTNeoXColabDir = "/".join((workspaceDir, GPTNeoXColabDirName))

In [None]:
#@title SSH Connection for VS Code
# This can be used to connect to the Colab notebook through a local VS Code editor, which is useful for debugging.
# The GPTNeoXColab package is only installed and imported by the "Clone GPT-NeoX-Colab" cell,
# so on a fresh runtime this cell would otherwise stop "Run all" with a NameError.
if "GPTNeoXColab" in globals():
    GPTNeoXColab.utils.colab.setup_ssh_connection(False)
else:
    print("GPTNeoXColab is not imported yet - run the 'Clone GPT-NeoX-Colab' cell first, then re-run this cell.")

# Cloning Git Repos

In [None]:
#@title Clone GPT-NeoX-Colab
%%time
%cd {workspaceDir}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone -b venv https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir}
!pip install . > /dev/null 2>&1
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir}/.env")
import GPTNeoXColab
GPTNeoXColab.utils.colab.install_git_annex()
GPTNeoXColab.utils.colab.enable_remote()
GPTNeoXColab.utils.colab.sync_annex()
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare.txt")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare.jsonl")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.bin")
GPTNeoXColab.utils.colab.fetch_data("data/shakespeare/shakespeare_text_document.idx")

In [None]:
#@title Clone GPT-NeoX
%%time
%cd {workspaceDir}
!git clone --depth 1 https://github.com/EleutherAI/gpt-neox

# Downloading Environment
It is faster to download a Python virtual environment and unzip it than to install all the dependencies.

In [None]:
%%time
# Fetch the Python virtual environment from inside the workspace directory.
%cd {workspaceDir}
# Flag handed to download_my_env here and to upload_my_env later in the notebook.
BACKBLAZE_SAVE = False
GPTNeoXColab.utils.colab.download_my_env(BACKBLAZE_SAVE)

# Installing Dependencies

In [None]:
%%time
%cd {workspaceDir}
import os
# Build the virtual environment from scratch only when my_env is not already present in the workspace
if not os.path.isdir(f"{workspaceDir}/my_env"):
    # Install venv package for Python 3.10
    !apt-get update && apt-get install -y python3.10-venv
    !pip install virtualenv
    # Create the virtual environment
    !python3 -m venv {workspaceDir}/my_env
    # The requirements paths below are relative to the GPT-NeoX checkout
    %cd {GPTNeoXDir}
    # Install specific versions of torch and other packages to avoid compatibility issues
    # (each command activates my_env first so pip installs into the virtual environment)
    !source {workspaceDir}/my_env/bin/activate && pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18.0 transformers==4.41.0 sentence-transformers==2.2.2
    # Install dependencies
    !source {workspaceDir}/my_env/bin/activate && pip install -r ./requirements/requirements.txt
    !source {workspaceDir}/my_env/bin/activate && pip install -r ./requirements/requirements-tensorboard.txt

# Uploading Environment
If `BACKBLAZE_SAVE` is `True`, the Python virtual environment is uploaded to Backblaze.

In [26]:
# Reuses the BACKBLAZE_SAVE flag set in the download cell; per the notebook text the upload only happens when it is True.
GPTNeoXColab.utils.colab.upload_my_env(BACKBLAZE_SAVE)

# Preparing Custom Dataset

In [None]:
#@title Converting text data to jsonl format
import os

%cd {GPTNeoXDir}
!mkdir -p data

# Check if the converted file exists
if not os.path.isfile(f"{GPTNeoXColabDir}/data/shakespeare/shakespeare.jsonl"):
    GPTNeoXColab.utils.ml.text2jsonl(f"{GPTNeoXColabDir}/data/shakespeare/shakespeare.txt", f"{GPTNeoXColabDir}/data/shakespeare/shakespeare.jsonl")


!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare.jsonl {GPTNeoXDir}/data/shakespeare/shakespeare.jsonl

# Tokenizing Dataset

In [None]:
#@title Tokenizing jsonl formatted data
%%time
import os

%cd {GPTNeoXDir}
!mkdir -p processed_data

# Check if the tokenized files exists
a = f"{GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.idx"
b = f"{GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.bin"
if not os.path.isfile(a) or not os.path.isfile(b):
    !source {workspaceDir}/my_env/bin/activate && python tools/datasets/preprocess_data.py \
        --input ./data/shakespeare/shakespeare.jsonl \
        --output-prefix ./processed_data/shakespeare/shakespeare \
        --tokenizer-type CharLevelTokenizer \
        --dataset-impl mmap \
        --append-eod

!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.bin {GPTNeoXDir}/data/shakespeare/
!cp {GPTNeoXColabDir}/data/shakespeare/shakespeare_text_document.idx {GPTNeoXDir}/data/shakespeare/

# Training

In [None]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic used further down
%load_ext tensorboard

In [None]:
#@title Run the training in a detached background process
import subprocess

# Launch training through nohup as a background job and write the PID of that
# job ($!) to train_process.pid so that later cells can poll it.
process = subprocess.Popen(
    f"nohup bash -c \"source {workspaceDir}/my_env/bin/activate && python ./deepy.py train.py --conf_dir {GPTNeoXColabDir}/configs shakespeare.yml shakespeare_deepy.yml\" & echo $! > train_process.pid",
    shell=True,
    executable='/bin/bash',
    # ./deepy.py and train_process.pid are relative paths: pin them to the GPT-NeoX checkout
    # instead of relying on whichever directory the notebook happens to be in.
    cwd=GPTNeoXDir,
    # Starts the process in a new session (setsid) so interrupting the notebook does not kill
    # the training; this is the documented replacement for preexec_fn=os.setsid.
    start_new_session=True,
)


In [None]:
#@title Wait until tensorboard log directory is created
import time
import os

# Directory the training run writes its TensorBoard logs to
tensorboard_log_dir = f"{GPTNeoXDir}/tensorboard"

# Poll until the directory shows up
while True:
    if os.path.exists(tensorboard_log_dir):
        break
    print("Waiting for TensorBoard log directory to be created...")
    time.sleep(10)  # Pause between checks

print("TensorBoard log directory found. You can now launch TensorBoard.")

In [None]:
# For a fresh run, first delete everything in the checkpoints and tensorboard directories.
# The --logdir path is relative, hence the %cd into the GPT-NeoX checkout.
%cd {GPTNeoXDir}
%tensorboard --logdir tensorboard

In [None]:
#@title Find the latest log file
import glob
import os

# Where the stdout logs live and the glob pattern that selects them
log_dir = f"{GPTNeoXDir}/logs"
log_pattern = os.path.join(log_dir, "*_stdout.txt")

# Collect every log file matching the pattern
log_files = glob.glob(log_pattern)

if not log_files:
    # Nothing has been written yet
    latest_log = None
    print("No log files found.")
else:
    # The most recently modified file is taken as the current run's log
    latest_log = max(log_files, key=os.path.getmtime)
    print("Latest log file:", latest_log)


In [None]:
#@title Read the latest log file and extract the iteration count
import time
import os
import re

# Offset in the log file up to which lines have already been read.
# This is a plain module-level variable, so it is reset to 0 whenever this cell is re-run.
file_position = 0
# Regular expression to match "iteration <number> / <total>"; group 1 captures <number>
iteration_pattern = re.compile(r"iteration\s+(\d+)\s*/\s*\d+")

def read_new_iterations():
    """Print the latest iteration count appended to the log since the previous call.

    Reads the module-level ``latest_log`` file starting at the module-level
    ``file_position`` offset, advances that offset past everything that was
    read, and prints ``"<n> iterations"`` for the last new line matching
    ``iteration_pattern``. Prints nothing when no new line matches.

    Returns without doing anything when ``latest_log`` is ``None`` or the file
    does not exist, so the monitoring loop keeps running instead of crashing.
    """
    global file_position
    if not latest_log or not os.path.isfile(latest_log):
        return
    # Open the log file and continue from the last position
    with open(latest_log, "r") as file:
        file.seek(file_position)
        # readlines() rather than iterating the file object: tell() is not
        # available while a text file is being iterated line by line.
        new_lines = file.readlines()
        file_position = file.tell()
    # Keep only the last line that reports an iteration
    last_match = None
    for line in new_lines:
        match = iteration_pattern.search(line)
        if match:
            last_match = match
    if last_match:
        # Extract the iteration count from the regex match
        iteration_count = int(last_match.group(1))
        print(f"{iteration_count} iterations")

# Load the PID stored in train_process.pid
with open("train_process.pid", "r") as pid_file:
    pid_text = pid_file.read()
pid = int(pid_text.strip())
print("PID:", pid)

# Function to check if the process is running
def is_process_running(pid):
    """Return True if a process with the given PID currently exists.

    Sends signal 0, which performs the existence and permission checks
    without actually delivering a signal to the process.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        # No process with this PID
        return False
    except PermissionError:
        # The process exists but we are not allowed to signal it
        return True
    except OSError:
        # Any other failure is treated as "not running"
        return False
    return True

# Poll until the training process exits, reporting progress on every pass
while True:
    if not is_process_running(pid):
        break
    read_new_iterations()
    print("Training is still running...")
    time.sleep(30)  # Pause between checks

print("Training has finished.")


In [None]:
#@title Wait until checkpoints directory is created

import time
import os

# Directory the training run saves its checkpoints to
checkpoints_dir = f"{GPTNeoXDir}/checkpoints"

# Poll until the directory shows up
while True:
    if os.path.exists(checkpoints_dir):
        break
    print("Waiting for checkpoints directory to be created...")
    time.sleep(10)  # Pause between checks

print("Checkpoints directory found.")


In [None]:
#@title Display training and validation Loss
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import os
import numpy as np

# Most recently modified entry in the TensorBoard directory.
# The absolute path keeps this cell independent of the current working directory.
log_dir = f"{GPTNeoXDir}/tensorboard"
log_files = [os.path.join(log_dir, d) for d in os.listdir(log_dir)]
latest_log_dir = max(log_files, key=os.path.getmtime)

# Initialize EventAccumulator to load scalar data
ea = event_accumulator.EventAccumulator(latest_log_dir)
ea.Reload()  # Load all logs

# List all scalar keys available in the logs
scalar_keys = ea.Tags()['scalars']
print("Available scalar keys:", scalar_keys)

# Extract training and validation losses (adjust the tag names if necessary).
# A tag that has not been logged yet yields an empty series instead of a KeyError.
train_loss = ea.Scalars('train/lm_loss') if 'train/lm_loss' in scalar_keys else []
val_loss = ea.Scalars('validation/lm_loss') if 'validation/lm_loss' in scalar_keys else []

# Each scalar event carries the step it was logged at; plotting against those steps
# puts both curves on the real iteration axis without resampling either series.
train_iterations = [x.step for x in train_loss]
train_loss_values = [x.value for x in train_loss]
val_iterations = [x.step for x in val_loss]
val_loss_values = [x.value for x in val_loss]

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_iterations, train_loss_values, label='Training Loss')
plt.plot(val_iterations, val_loss_values, label='Validation Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

# Inference with GPT-NeoX

In [None]:
%%time
# Running this while training is still in progress fails with:
#   The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)
# This will write over the logs
!source {workspaceDir}/my_env/bin/activate && python ./deepy.py generate.py -d configs {GPTNeoXColabDir}/configs/shakespeare.yml {GPTNeoXColabDir}/configs/shakespeare_gen.yml > /dev/null
# Show the generated sample (generator output above is discarded)
!cat sample_output.txt

# Inference with Hugging Face

## Convert model to HF format
Here we convert our trained model to the Hugging Face format.

In [None]:
import os

# Directory holding the training checkpoints
checkpoints_dir = f"{GPTNeoXDir}/checkpoints"

# The 'latest' file inside it contains the name of the newest checkpoint
latest_marker = os.path.join(checkpoints_dir, "latest")
with open(latest_marker, "r") as marker_file:
    marker_text = marker_file.read()
latest_checkpoint_name = marker_text.strip()

# Full path to that checkpoint directory
latest_checkpoint_path = os.path.join(checkpoints_dir, latest_checkpoint_name)
print("Path to the latest checkpoint:", latest_checkpoint_path)


In [None]:
#@title Convert last checkpoint to huggingface model
# Runs convert_neox_to_hf.py inside my_env; the script path is relative, so the current directory must be the GPT-NeoX checkout.
# The output directory is the GPT-NeoX-Colab data/shakespeare folder, which is where the generation cell loads the model from.
!source {workspaceDir}/my_env/bin/activate && python ./tools/ckpts/convert_neox_to_hf.py --input_dir {latest_checkpoint_path} --config_file {GPTNeoXColabDir}/configs/shakespeare.yml --output_dir {GPTNeoXColabDir}/data/shakespeare --precision auto --architecture neox

## Generate Text

In [None]:
from transformers import GPTNeoXForCausalLM
import torch

# Move to the GPT-NeoX checkout
%cd {GPTNeoXDir}

# Character-level tokenizer provided by the GPTNeoXColab package
# NOTE(review): vocab_size=512 presumably has to match the vocabulary used for training - confirm against the training config
from GPTNeoXColab import CharLevelTokenizer
tokenizer = CharLevelTokenizer.CharLevelTokenizer(vocab_size=512)

# Load the model from the directory the Hugging Face conversion step wrote to
model_path = f"{GPTNeoXColabDir}/data/shakespeare"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Thin wrappers around the notebook-level `tokenizer` instance
def char_level_tokenize(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the result."""
    token_ids = tokenizer.tokenize(text)
    return token_ids

def char_level_detokenize(tokens):
    """Detokenize ``tokens`` with the notebook-level ``tokenizer`` and return the result."""
    decoded = tokenizer.detokenize(tokens)
    return decoded

# Switch the model to evaluation mode
model.eval()

# Prompt to continue (swap in the commented line to ask the user instead)
#input_text = input("Enter your prompt: ")
input_text = "Thou art"

# Encode the prompt as a single-row batch; the mask is all ones because nothing is padded
prompt_tokens = char_level_tokenize(input_text)
input_ids = torch.tensor([prompt_tokens], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)

# Sampling settings handed to generate()
generation_options = dict(
    attention_mask=attention_mask,
    max_length=200,          # Adjust this for desired output length
    temperature=0.7,         # Controls creativity
    top_k=50,                # Controls diversity
    top_p=0.9,               # Nucleus sampling
    num_return_sequences=1,  # Number of sequences to return
    pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
    do_sample=True,          # Enable sampling mode to use temperature and top_p
)

# Generate without tracking gradients
with torch.no_grad():
    output = model.generate(input_ids, **generation_options)

# Decode the first (only) returned sequence and print it
generated_tokens = output[0].tolist()
generated_text = char_level_detokenize(generated_tokens)
print("Generated text:", generated_text)

In [None]:
# Here we could disconnect from the GPU resource
#from google.colab import runtime
#runtime.unassign()