<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a tiny SLM on a corpus of Shakespeare
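This notebook demonstrates a basic setup for experimenting with a tiny SLM (small language model): it clones the required repositories, prepares and tokenizes the Shakespeare corpus, trains the model with GPT-NeoX, and generates sample text.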

In [None]:
# Central path configuration for the whole notebook.
# These paths can be modified to "stub" behavior for test/dev.
import os

workspaceDir = "/content"
gpt_neox_colabDir = f"{workspaceDir}/GPT-NeoX-Colab"
GPTNeoXDirName = "gpt-neox"
GPTNeoXDir = f"{workspaceDir}/{GPTNeoXDirName}"

# Several later cells run `source {activate_script} && python ...`, so this name
# must exist before they execute (otherwise they fail on a fresh kernel).
# When the GPT-NeoX-Colab virtualenv is present, use its activate script; otherwise
# fall back to /dev/null, which `source` accepts as a no-op, so the commands run
# with the kernel's own Python environment (where the %pip installs land).
# Re-run this cell if the virtualenv is created after it was first executed.
_venv_activate = f"{gpt_neox_colabDir}/.venv/bin/activate"
activate_script = _venv_activate if os.path.isfile(_venv_activate) else "/dev/null"

# Cloning Git Repos

In [None]:
# Clone GPT-NeoX-Colab to the configured location so that gpt_neox_colabDir is valid
# regardless of the kernel's current directory. The clone is skipped when a checkout
# already exists, so this cell can be re-run without a "destination path already exists" error.
!test -d {gpt_neox_colabDir}/.git || git clone --depth 1 https://github.com/markNZed/GPT-NeoX-Colab.git {gpt_neox_colabDir}

In [None]:
%%time
#@title Clone GPT-NeoX
# Clone the pipe_parallel_size_1 branch of the GPT-NeoX fork into GPTNeoXDir.
# The clone is skipped when a checkout already exists, so re-running the cell does not
# fail with "destination path 'gpt-neox' already exists and is not an empty directory".
%cd {workspaceDir}
!test -d {GPTNeoXDir}/.git || git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git {GPTNeoXDir}

/content
fatal: destination path 'gpt-neox' already exists and is not an empty directory.
CPU times: user 7.02 ms, sys: 0 ns, total: 7.02 ms
Wall time: 106 ms


In [None]:
# Install version-pinned packages into the kernel's environment (-q keeps pip output short).
%pip install -q torch==2.3 torchaudio==2.3.0 torchvision==0.18.0 transformers==4.38.0 sentence-transformers==2.2.2
%pip install -q fsspec==2024.2.0 datasets==2.14.0 evaluate==0.4.3 lm-eval==0.4.1 tensorboard==2.17.1 tensorflow==2.17.1
# The requirements file is referenced by a relative path, so switch into the GPT-NeoX
# checkout first. Note that %cd changes the kernel's working directory for later cells too.
%cd {GPTNeoXDir}
%pip install -q -r ./requirements/requirements.txt

/content/gpt-neox
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for mpi4py (pyproject.toml) ... [?25l[?25hdone


# Preparing Dataset

In [None]:
#@title Converting text data to jsonl format
# Convert the raw Shakespeare text file to shakespeare.jsonl using the text2jsonl helper
# from gpt_neox_colab.utils, then copy the result into GPT-NeoX's data directory, which is
# where the tokenizing step reads its --input from.
# Requires `activate_script` to be defined in the session before this cell runs.
# NOTE(review): assumes the gpt_neox_colab package is importable by this python — confirm.
!source {activate_script} && python -c "import gpt_neox_colab.utils; gpt_neox_colab.utils.ml.text2jsonl(\"{gpt_neox_colabDir}/data/shakespeare/shakespeare.txt\", \"{gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl\")"
# Make sure the destination directory exists so the copy cannot fail on a missing data/ folder.
!mkdir -p {GPTNeoXDir}/data
!cp {gpt_neox_colabDir}/data/shakespeare/shakespeare.jsonl {GPTNeoXDir}/data/shakespeare.jsonl

/content/gpt-neox


# Tokenizing Dataset

In [None]:
%%time
#@title Tokenizing jsonl formatted data
import os

# Work from processed_data/ because --output-prefix is relative: the tokenizer output
# (shakespeare_text_document.bin / .idx) is written into the current directory.
%cd {GPTNeoXDir}
!mkdir -p processed_data
%cd processed_data

# Build the command once so that the text printed below is exactly what is executed
# (previously the command was written twice and the two copies could drift apart).
# Requires `activate_script` to be defined in the session before this cell runs.
cmd = (
    f"source {activate_script} && python {GPTNeoXDir}/tools/datasets/preprocess_data.py"
    f" --input {GPTNeoXDir}/data/shakespeare.jsonl"
    " --output-prefix shakespeare"
    " --tokenizer-type CharLevelTokenizer"
    " --dataset-impl mmap"
    " --append-eod"
)
print(f"Command: {cmd}")
!{cmd}

# The outputs are already in processed_data/, so no copy is needed (the previous `cp`
# of each file onto itself only produced a "are the same file" error). List them instead
# so a failed tokenization is visible right here.
!ls -lh {GPTNeoXDir}/processed_data/shakespeare_text_document.bin {GPTNeoXDir}/processed_data/shakespeare_text_document.idx

/content/gpt-neox
/content/gpt-neox/processed_data
CPU times: user 15.2 ms, sys: 1.2 ms, total: 16.4 ms
Wall time: 312 ms


# Training

In [None]:
# Launch training through the deepy.py launcher from inside the GPT-NeoX checkout
# (deepy.py and train.py are referenced by relative path, hence the %cd).
# The two trailing names, shakespeare and shakespeare_deepy, are config names; presumably
# they are resolved inside the directory given by --conf_dir — confirm against deepy.py.
# NOTE(review): unlike the data-preparation and inference cells, this command does not
# source the activate script, while the saved output below shows a run that did — confirm
# which is intended.
%cd {GPTNeoXDir}
!python ./deepy.py train.py --conf_dir {gpt_neox_colabDir}/configs shakespeare shakespeare_deepy

Running command: 
nohup nohup bash -c " source /content/GPT-NeoX-Colab/.venv/bin/activate && cd /content/gpt-neox && python ./deepy.py train.py --conf_dir /content/GPT-NeoX-Colab/configs shakespeare shakespeare_deepy "

Started training with PID: 4824


# Inference

In [None]:
%%time
%cd {GPTNeoXDir}
# Run generate.py through the deepy.py launcher using the shakespeare and shakespeare_gen
# configs, then print sample_output.txt (presumably the file generate.py writes — confirm
# against the shakespeare_gen config).
# Do not run this while training is still in progress; it fails with:
#   "The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)"
# Running this cell will also overwrite the existing logs.
!source {activate_script} && python ./deepy.py generate.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen
!cat sample_output.txt