<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/shakespeare_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inference of a tiny Shakespeare LLM

In [1]:
# Stamp the cell output with the local date and time of this run.
from datetime import datetime
print("Current Date and Time:", datetime.today())

Current Date and Time: 2024-11-28 14:08:56.902769


In [2]:
# Locations used by the rest of the notebook. Point workspaceDir somewhere
# else to "stub" behavior for test/dev runs.
workspaceDir = "/content"
gpt_neox_colabDirName = "GPT-NeoX-Colab"
# Repository checkout location: <workspaceDir>/<gpt_neox_colabDirName>
gpt_neox_colabDir = "/".join((workspaceDir, gpt_neox_colabDirName))

In [3]:
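# Optional (disabled): install the uv package manager used by the commented-out `uv` commands in the clone cell.
#!curl -LsSf https://astral.sh/uv/install.sh | sh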

# Cloning Git Repos

In [4]:
#@title Clone GPT-NeoX-Colab
%%time
%cd {workspaceDir}
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
#%cd {gpt_neox_colabDir}
#!uv sync -q --dev
#!uv run pip install -q -e .

/content
Cloning into 'GPT-NeoX-Colab'...
remote: Enumerating objects: 1669, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 1669 (delta 8), reused 8 (delta 2), pack-reused 1637 (from 1)[K
Receiving objects: 100% (1669/1669), 18.38 MiB | 26.65 MiB/s, done.
Resolving deltas: 100% (977/977), done.
CPU times: user 63.1 ms, sys: 6.87 ms, total: 69.9 ms
Wall time: 3.63 s


In [5]:
# Disabled alternative: fetch the model archive by running Python inside the project's virtual environment
#%%bash -s "$gpt_neox_colabDir"
#source "$1/.venv/bin/activate"
#python -c "
#from dotenv import load_dotenv
#import os
#load_dotenv('$1/.env')
#import gpt_neox_colab
#gpt_neox_colab.utils.colab.fetch_data('data/shakespeare/model.tar.gz')
#"

In [12]:
%pip install -q dvc[s3] python-dotenv
%cd {gpt_neox_colabDir}
from dotenv import load_dotenv
import os
load_dotenv('.env')
!dvc pull -q {'data/shakespeare/model.tar.gz'}
%cd {gpt_neox_colabDir}/data/shakespeare
!tar -xzf model.tar.gz

/content/GPT-NeoX-Colab
[0m/content/GPT-NeoX-Colab/data/shakespeare


# Inference with Hugging Face

In [10]:
from transformers import GPTNeoXForCausalLM
import torch

# Change to the repository root (not the model directory): the relative
# model_path below, and presumably the `src.` import, are resolved from here.
%cd {gpt_neox_colabDir}

# Character-level tokenizer provided by the cloned repository.
# NOTE(review): vocab_size=512 presumably has to match the vocabulary size the
# model was trained with -- confirm against the model config.
from src.gpt_neox_colab.CharLevelTokenizer import CharLevelTokenizer
tokenizer = CharLevelTokenizer(vocab_size=512)

# Load the model from data/shakespeare, the directory in which the earlier
# cell unpacked model.tar.gz.
model_path = "data/shakespeare"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Thin convenience wrappers around the shared `tokenizer` instance
def char_level_tokenize(text):
    """Return the result of `tokenizer.tokenize(text)` for the given string."""
    token_ids = tokenizer.tokenize(text)
    return token_ids

def char_level_detokenize(tokens):
    """Return the result of `tokenizer.detokenize(tokens)` for the given tokens."""
    decoded_text = tokenizer.detokenize(tokens)
    return decoded_text

# Set the model to evaluation mode for inference. As the last expression of
# the cell, the call's return value (the model itself) is shown as the output.
model.eval()


/content/GPT-NeoX-Colab


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(512, 256)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-3): 4 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=256, out_features=768, bias=True)
          (dense): Linear(in_features=256, out_features=256, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=256, out_features=1024, bias=True)
          (dense_4h_to_h): Linear(in_features=1024, out_features=256, bias=True)
          (

In [11]:
# Prompt the user for input
input_text = input("Enter your prompt: ")
# An empty prompt would yield a zero-length input sequence; reject it up front
# with a clear message instead of failing inside generate().
if not input_text:
    raise ValueError("Prompt must not be empty.")

# Tokenize and prepare input: a batch containing one sequence, created on the
# same device as the model so generation also works if the model was moved.
input_ids = torch.tensor(
    [tokenizer.tokenize(input_text)], dtype=torch.long, device=model.device
)
attention_mask = torch.ones_like(input_ids)  # No padding, so every position is attended to

# Generate text with specified pad_token_id and attention_mask
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,          # Total length in tokens, prompt included
        temperature=0.7,        # Controls creativity
        top_k=50,               # Controls diversity
        top_p=0.9,              # Nucleus sampling
        num_return_sequences=1, # Number of sequences to return
        pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
        do_sample=True           # Enable sampling mode to use temperature and top_p
    )

# Decode and print the generated text (the output includes the prompt tokens)
generated_text = tokenizer.detokenize(output[0].tolist())
print("Generated text:", generated_text)

Enter your prompt: test
Generated text: test of my with her so. I have the most of the To ding of deaths. And bitted for man! I did to our heads. For what we have: what with you. BRAKENBURY: I can the books in the most in a That has morned 
