<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a tiny language model (TLM) on a corpus of Shakespeare
The intention of this notebook is to demonstrate a basic setup of GPT-NeoX for experimenting with a TLM.

In [None]:
# Filesystem layout shared by the cells below. Point these somewhere else
# to "stub" the notebook's behavior for test/dev runs.
workspaceDir = "/content"
GPTNeoXDirName = "gpt-neox"
gpt_neox_colabDir = "/".join([workspaceDir, "GPT-NeoX-Colab"])
GPTNeoXDir = "/".join([workspaceDir, GPTNeoXDirName])

# Cloning Git Repos

In [None]:
!git clone --depth 1 https://github.com/markNZed/GPT-NeoX-Colab.git

In [None]:
%%time
#@title Clone GPT-NeoX
%cd {workspaceDir}
!git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git

In [None]:
# Install the ML stack with exact version pins so the environment is reproducible.
%pip install -q torch==2.3 torchaudio==2.3.0 torchvision==0.18.0 transformers==4.38.0 sentence-transformers==2.2.2
%pip install -q fsspec==2024.2.0 datasets==2.18.0 evaluate==0.4.3 lm-eval==0.4.1 tensorboard==2.17.1 tensorflow==2.17.1
# The GPT-NeoX requirements file is referenced by a relative path, hence the %cd first.
%cd {GPTNeoXDir}
%pip install -q -r ./requirements/requirements.txt

# Preparing Dataset

In [None]:
#@title Converting text data to jsonl format
%cd {GPTNeoXDir}
!mkdir -p data

import json
input_txt_file = f"{gpt_neox_colabDir}/notebooks/shakespeare.txt"
output_jsonl_file = f"{GPTNeoXDir}/data/shakespeare.jsonl"

lines = []
with open(input_txt_file, encoding="utf8") as f:
    for line in f.read().splitlines():
        if line:
            lines.append({"text": line})
json_lines = [json.dumps(data) for data in lines]
with open(output_jsonl_file, "w") as f:
    f.write("\n".join(json_lines))

# Tokenizing Dataset

In [None]:
%%time
#@title Tokenizing jsonl formatted data
# NOTE(review): `os` is not used anywhere in this cell.
import os

# Work inside processed_data/ so that the relative --output-prefix below
# resolves against that directory.
%cd {GPTNeoXDir}
!mkdir -p processed_data
%cd processed_data
# The trailing backslashes live inside a regular (non-raw) f-string, so Python
# consumes each backslash-newline pair: `cmd` is a single long shell line
# (with a leading and a trailing newline), not a multi-line script.
cmd = f"""
python {GPTNeoXDir}/tools/datasets/preprocess_data.py \
    --input {GPTNeoXDir}/data/shakespeare.jsonl \
    --output-prefix shakespeare \
    --tokenizer-type CharLevelTokenizer \
    --dataset-impl mmap \
    --append-eod
"""
# Echo the fully interpolated command before running it, to make failures easier to debug.
print(f"Command: {cmd}")
!{cmd}


# Training

In [None]:
# Launch from the GPT-NeoX checkout: the logged data_path
# (processed_data/shakespeare_text_document) is relative, so it is presumably
# resolved against this directory.
%cd {GPTNeoXDir}
# Two config names are passed (shakespeare, shakespeare_deepy); the log shows both
# being loaded as .yml files from the --conf_dir directory.
!python ./deepy.py train.py --conf_dir {gpt_neox_colabDir}/configs shakespeare shakespeare_deepy

/content/gpt-neox
[2024-12-03 14:07:34,504] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
NeoXArgs.from_ymls() ['/content/GPT-NeoX-Colab/configs/shakespeare.yml', '/content/GPT-NeoX-Colab/configs/shakespeare_deepy.yml']
INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 1
-------------------- arguments --------------------
  attention_config ................ ['global', 'global', 'global', 'global']updated
  batch_size ...................... 256.........................updated
  checkpoint_factor ............... 50..........................updated
  data_impl ....................... mmap........................updated
  data_path ....................... processed_data/shakespeare_text_documentupdated
  deepspeed_extra_args ............ {'comms_logger': {'enabled': False, 'verbose': False, 'prof_all': False, 'debug': False}}updated
  dynamic_loss_scale .............. True........................updated
  eval_inte

# Inference

In [26]:
%%time
%cd {GPTNeoXDir}
# Do not run this while a training run is still active; it fails with
# "The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)".
# Running this will also overwrite the existing logs.
!python ./deepy.py generate.py -d configs {gpt_neox_colabDir}/configs/shakespeare {gpt_neox_colabDir}/configs/shakespeare_gen
# Display the generated text (presumably sample_output.txt is the output file set in
# the shakespeare_gen config -- TODO confirm).
!cat sample_output.txt

/content/gpt-neox
[2024-12-03 13:22:43,891] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
NeoXArgs.from_ymls() ['/content/GPT-NeoX-Colab/configs/shakespeare.yml', '/content/GPT-NeoX-Colab/configs/shakespeare_gen.yml']
INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 1
-------------------- arguments --------------------
  attention_config ................ ['global', 'global', 'global', 'global']updated
  batch_size ...................... 256.........................updated
  checkpoint_factor ............... 50..........................updated
  config_files .................... {'shakespeare.yml': '{\n  "pipe_parallel_size": 0, # Because running on one GPU\n  "model_parallel_size": 1, # Because running on one GPU\n\n  # model settings\n  "num_layers": 4,\n  "hidden_size": 256,\n  "num_attention_heads": 4,\n  "seq_length": 512,\n  "max_position_embeddings": 512,\n  "pos_emb": "rotary",\n  "no_weight_tying": fal