In [2]:
pip install --quiet --no-cache-dir --ignore-installed -r requirements.txt

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.31.34 requires urllib3<1.27,>=1.25.4, but you have urllib3 2.1.0 which is incompatible.
fastai 2.7.12 requires torch<2.1,>=1.7, but you have torch 2.1.0+cu118 which is incompatible.
numba 0.57.1 requires numpy<1.25,>=1.21, but you have numpy 1.26.2 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import time
import torch
import lightning as L
from torch.utils.data import DataLoader
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy
from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.tokenizer import Tokenizer
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint, gptq_quantization
import torch.nn as nn
from pathlib import Path
import sys
import random
from torch import nn
import lightning.pytorch as pl
from torch.nn import functional as F



# `model_name` is the preset handed to `Config.from_name` below.
# `name` is not read anywhere in the visible cells; presumably the dataset/run
# label matching the `out/redpajama/...` checkpoint path - TODO confirm.
model_name, name = "pythia-160m", "redpajama"

def _init_weights(module: nn.Module) -> None:
        """Meant to be used with `gpt.apply(gpt._init_weights)`."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

# Architecture hyper-parameters for the preset selected by `model_name`.
# No model is instantiated at this point: the `model` used for generation is
# built further down inside `fabric.init_module(...)` and filled from the
# checkpoint, so a randomly initialised `GPT(config)` created here would be
# overwritten before any use (and a bare `model.load_state_dict` reference
# without a call does nothing).
config = Config.from_name(model_name)


# --- Inference settings -------------------------------------------------------
# NOTE: despite the `_dir` suffix this is the path of the checkpoint *file*.
checkpoint_dir = Path("out/redpajama/final-gpt-model-ckpt.pth")
strategy = "auto"
# Only compared against "gptq.int4" below, so None passes False to gptq_quantization.
quantize = None
devices = 1
plugins = None

# Precision picked by the project helper for inference (training=False).
precision = get_default_supported_precision(training=False)

# --- Fabric runtime -----------------------------------------------------------
fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
fabric.launch()
fabric.print(f"Loading model {str(checkpoint_dir)!r} with {config.__dict__}", file=sys.stderr)

# --- Model --------------------------------------------------------------------
# empty_init=True: the parameters are overwritten by load_checkpoint below, so
# their initial values are not relied on.
with fabric.init_module(empty_init=True), gptq_quantization(quantize=="gptq.int4"):
    model = GPT(config)

# Switch to evaluation mode before handing the module to Fabric, then load the weights.
model.eval()
model = fabric.setup_module(model)
load_checkpoint(fabric, model, checkpoint_dir)

# --- Tokenizer ----------------------------------------------------------------
# Relative path: resolved against the notebook's working directory.
tokenizer = Tokenizer(Path('tokenizer_config'))

@torch.inference_mode()
def generate(
    model: "GPT",
    idx: torch.Tensor,
    max_returned_tokens: int,
    *,
    temperature: float = 1.0,
    top_k: int = None,
    eos_id: int = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use. It is called as ``model(x, input_pos)`` and must expose ``max_seq_length``.
        idx: Tensor of shape (T) with indices of the prompt sequence.
        max_returned_tokens: The maximum number of tokens to return (given plus generated).
        temperature: Scales the predicted logits by 1 / temperature. A value of 0 picks the
            highest-scoring token at every step (greedy decoding) instead of dividing by zero.
        top_k: If specified, only sample among the tokens with the k highest probabilities.
            Ignored when ``temperature`` is 0.
        eos_id: If specified, stop generating any more token once the <eos> token is triggered.

    Returns:
        A 1-D tensor holding the prompt followed by the generated tokens. Its length is
        ``max_returned_tokens`` unless ``eos_id`` was produced earlier, in which case the
        tensor ends with that <eos> token.

    Raises:
        AssertionError: If ``max_returned_tokens`` is not larger than the prompt length.
        NotImplementedError: If ``model.max_seq_length`` is smaller than ``max_returned_tokens - 1``.
    """
    T = idx.size(0)
    assert max_returned_tokens > T
    if model.max_seq_length < max_returned_tokens - 1:
        # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a
        # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do
        # not support it to avoid negatively impacting the overall speed
        raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}")

    device, dtype = idx.device, idx.dtype
    # create an empty tensor of the expected final shape and fill in the current tokens
    empty = torch.empty(max_returned_tokens, dtype=dtype, device=device)
    empty[:T] = idx
    idx = empty
    # the first forward pass sees the whole prompt; later passes only the newest position
    input_pos = torch.arange(0, T, device=device)

    # generate up to a fixed number of tokens
    for _ in range(max_returned_tokens - T):
        x = idx.index_select(0, input_pos).view(1, -1)

        # forward: keep only the logits of the last position
        logits = model(x, input_pos)[0, -1]

        if temperature == 0:
            # greedy decoding: dividing by zero would produce NaN probabilities
            idx_next = torch.argmax(logits, dim=-1, keepdim=True).to(dtype=dtype)
        else:
            logits = logits / temperature

            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits = torch.where(logits < v[[-1]], -float("Inf"), logits)

            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1).to(dtype=dtype)

        # advance: `input_pos` now holds the single position of the new token
        input_pos = input_pos[-1:] + 1

        # concatenate the new generation
        idx = idx.index_copy(0, input_pos, idx_next)

        # if <eos> token is triggered, return the output (stop generation)
        if eos_id is not None and idx_next == eos_id:
            # the EOS token sits at `input_pos`, so the slice must end one past it
            return idx[: input_pos + 1]  # include the EOS token

    return idx


# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def generate_dialogue(input_text, temperature=0.8, max_tokens=200, top_k=1):
    """Continue ``input_text`` with the loaded model and return the decoded text.

    Relies on the module-level ``tokenizer``, ``fabric`` and ``model`` objects.

    Args:
        input_text: Prompt string, encoded with ``tokenizer.encode``.
        temperature: Forwarded to ``generate``.
        max_tokens: Number of tokens allowed on top of the encoded prompt length.
        top_k: Forwarded to ``generate``.

    Returns:
        ``tokenizer.decode`` applied to the tensor returned by ``generate``
        (the prompt tokens followed by the generated ones).
    """
    prompt_ids = tokenizer.encode(input_text, device=fabric.device)
    total_tokens = prompt_ids.size(0) + max_tokens

    # set the max_seq_length to limit the memory usage to what we need
    with fabric.init_tensor():
        model.max_seq_length = total_tokens

    # set up the kv cache for a single sequence before every generation
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)

    generated_ids = generate(
        model,
        prompt_ids,
        total_tokens,
        temperature=temperature,
        top_k=top_k,
    )
    return tokenizer.decode(generated_ids)




Loading model 'out/redpajama/final-gpt-model-ckpt.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m-deduped'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}


In [4]:
# Sample: sample among the two highest-scoring tokens, up to 300 new tokens.
print(generate_dialogue("Hi, my name is ", temperature=0.9, max_tokens=300, top_k=2))

Hi, my name is 12.
The following year is the 120th of 120.
The 12th year of 2021 is the first 120th of 12th year.
The 12th year is the 1st year of 12th and is 12th.
The 12th year is 12th and is 12th of 12th.
The 12th of 12th year is the 1st of the year.
The 12th year is the 12th of 12th year.
The 12nd year is the 12th year of 12th and 1st of 12th year.
The 12th of 12th year is the 12th of 12th of 12th year.
The first 12th of 20th year was 12th of 12th and 1st of 1st year.
The 12th year of 12th year is 12th of 12th and 12th of 12th year.
The 12th of 12th year is the 12th of 12th year of the year.
The 12th


In [15]:
# Sample: no top-k cropping, up to 200 new tokens.
print(generate_dialogue("New Britain", temperature=0.8, max_tokens=200, top_k=None))

New Britain's BUSA RCW, 90% of BICO, [FUL] 90% of KPVA, Common Corey, Philadelphia, 90% of the BS character, NL CHA, 02% of the GA, LR, 200, and Forb 30% of RefStringing, LS, 199, 199% of the ParWW, 2011, and LACH, GA, 41% of the NWU, 10% of the smarter of the Bring-K goes into the 1990' LWW, 1997, around 1991, and 2000, respectively. The Cory of the United States is now 1999, and the 1995% of SBV


In [18]:
# Sample: lower temperature, no top-k cropping, up to 300 new tokens.
print(generate_dialogue("Megalaemyia elsae ", temperature=0.5, max_tokens=300, top_k=None))

Megalaemyia elsae 1986 is an ozarious system.
For more information about the full range of 10000-196, please contact us.
The term of the 1970s is the 1970s.
The review of the 1970s is now a 1970s, which is open to 1970s.
The following is:
The result is the 1970s.
The 1970s is the first of the 1970s.
The 1970s is the 1970s, and the 1970s is 1970s.
The 1970s is 1970s.
The 1970s is 1970s.
The 1970s are available to the 1970s in the 1970s.
The 1970s are 1980s. The 1970s are 1970s, and 1970s.
The 1970s are 1970s. The 1970s was the 1970s.
The 1970s is 19


In [20]:
# Sample: no top-k cropping, up to 300 new tokens.
print(generate_dialogue("How to do Research ", temperature=0.6, max_tokens=300, top_k=None))

How to do Research 1,000 years.
By Julia, C. 14, 2018, we’ve been a total of 2018, 2018, 2018.
You’re here to check the top 2018, 2018.
Don’t have any other trend in the line, but I’ve always been a great deal for a few years, but I’ve been able to work together with my first time.
Because I’m not going to go, I’ll be sure if I’ve been seeing a weekend in the weekend, I’m grateful for my new 2018-00-00, I’m sure I’m a free time ago.
I’m not sure if I’m not going to go, but I’m just wrong.
So I’m just a lot of my 2018-00: 2018-00-00 0.00pm, 2018-00-20 100:00:00:00 0000: 00000: 00000: 0000: 0000-1000: 00000: 0
