In [91]:
import torch
import torch.nn as nn
from datasets import load_from_disk
from tokenizers import Tokenizer

from chat_template import chat_template
from model import ChatModel
from settings import ModelSettings
from special_tokens import special_tokens

# Tokenizer definition is read from "tokenizer.json" in the current working directory.
tokenizer = Tokenizer.from_file("tokenizer.json")
# Device string used both for the prompt tensor in generate() and for building the model.
device = "cpu"


@torch.no_grad()
def generate(model, start, max_new_tokens=50, temperature=0.3, top_k=10, argmax=False, stop_tokens=None):
    """Autoregressively extend ``start`` and return the decoded text.

    The prompt is the BOS token id followed by the encoding of ``start``
    (encoded without automatically added special tokens). The prompt tokens
    are printed once for inspection before generation begins.

    Args:
        model: Callable mapping a ``(batch, seq)`` tensor of token ids to
            logits indexed as ``[batch, position, vocabulary]``. If it is an
            ``nn.Module`` in training mode it is switched to eval mode for
            the duration of the call (so dropout is disabled) and restored
            afterwards.
        start: Prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Divisor applied to the logits before softmax. Values
            ``<= 0`` select greedy decoding instead of dividing by zero.
        top_k: Keep only the ``top_k`` largest logits per row before
            sampling. Clamped to the vocabulary size; ``None`` or ``0``
            disables the filter.
        argmax: If true, always pick the most likely token instead of sampling.
        stop_tokens: Optional collection of token ids; generation stops after
            one of them has been produced (the stop token stays in the output).

    Returns:
        ``tokenizer.decode`` applied to the prompt ids plus all generated ids.
    """
    prompt_ids = [tokenizer.token_to_id(special_tokens["bos"])]
    prompt_ids += tokenizer.encode(start, add_special_tokens=False).ids
    idx = torch.tensor([prompt_ids], device=device, dtype=torch.long)
    print(list(map(tokenizer.id_to_token,idx[0].tolist())))

    # A non-positive temperature cannot be used as a divisor; treat it as greedy.
    greedy = argmax or temperature <= 0

    # torch.no_grad() does not turn dropout off; only eval mode does.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        for _ in range(max_new_tokens):
            # NOTE(review): the crop uses the global setting, not the model's own
            # context length (the minified configuration uses 64) -- confirm.
            idx_cond = idx[:, -ModelSettings.max_context_length:]
            logits = model(idx_cond)[:, -1, :]

            if top_k:
                k = min(top_k, logits.size(-1))
                # Keep the threshold as a (batch, 1) column so it broadcasts
                # per row instead of across the vocabulary axis.
                kth_largest = torch.topk(logits, k).values[:, -1:]
                logits = logits.masked_fill(logits < kth_largest, float("-inf"))

            if greedy:
                next_id = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                probs = nn.functional.softmax(logits / temperature, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)

            # .item() requires a single sequence; the prompt is always batch size 1.
            if stop_tokens and next_id.item() in stop_tokens:
                print("Reached stop token")
                break
    finally:
        if was_training:
            model.train()

    return tokenizer.decode(idx[0].tolist())

In [92]:
# Set to True to build a tiny model (64-dim embeddings, 64-token context,
# 4 blocks, 4 heads, FFN multiplier 2) instead of the ModelSettings configuration.
minified = False

# Vocabulary size, dropout, bias and device are the same in both configurations;
# only the size-related arguments switch on `minified`.
model = ChatModel(
    vocabulary_size=ModelSettings.vocabulary_size,
    embedding_size=64 if minified else ModelSettings.embedding_size,
    max_context_length=64 if minified else ModelSettings.max_context_length,
    ff_size_multiplier=2 if minified else ModelSettings.ff_size_multiplier,
    transformer_blocks=4 if minified else ModelSettings.transformer_blocks,
    attention_heads=4 if minified else ModelSettings.attention_heads,
    dropout=ModelSettings.dropout,
    bias=ModelSettings.bias,
    device=device,
)

using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention
using flash attention


In [93]:
# Training step whose checkpoint is loaded (zero-padded to five digits in the file name).
step = 5999
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
state = torch.load(
    f"instruction_output_colab/{step:05d}_with_dropout.pt",
    map_location=torch.device('cpu'),  # remap all tensors onto the CPU
)

In [94]:
# Rename checkpoint keys that carry the '_orig_mod.' prefix.
# The dict is edited in place because later cells read state["model"] directly.
state_dict = state["model"]
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)

In [95]:
# The model is built with dropout enabled; switch to eval mode so dropout is
# inactive while generating. Called first so the cell still displays the
# result of load_state_dict as its last expression.
model.eval()
model.load_state_dict(state["model"])

<All keys matched successfully>

In [96]:
# Token ids that end generation: the end-of-turn and EOS markers, encoded
# together as one string without any automatically added special tokens.
stop_tokens = tokenizer.encode(
    "".join(special_tokens[name] for name in ("end_of_turn", "eos")),
    add_special_tokens=False,
).ids
print(stop_tokens)

[1, 3]


In [97]:
def chat_completion(user_text):
    """Generate a reply to a single user message.

    Wraps ``user_text`` as a one-turn conversation, renders it with
    ``chat_template`` (requesting the generation token), and calls
    ``generate`` with the notebook's ``model``, a 200-token budget and the
    notebook's ``stop_tokens``. Returns whatever ``generate`` returns.
    """
    conversation = [{"role": "user", "content": user_text}]
    prompt = chat_template(conversation, add_generation_token=True)
    return generate(model, prompt, max_new_tokens=200, stop_tokens=stop_tokens)

In [98]:
data_dir = "tokenized_data"
test_ds_name = data_dir + "/test_chats"
# NOTE: despite its name, ds_train holds rows from the *test* chats dataset.
# The first 10 rows are taken, then shuffled (0 is presumably the seed -- confirm).
ds_train = (
    load_from_disk(test_ds_name)
    .take(10)
    .shuffle(0)
)
# Decode the first row after shuffling so a test prompt can be copied from it.
token_ids = next(iter(ds_train))["tokens"]
tokenizer.decode(token_ids)

'Write one paragraph about the Tlachihualtepetl pyramid. Keep it at or below 15 sentences. Include its size and when it was built. Also, include any aspects of worship it had. Include aspects of its more recent history, too. Name at least one specific year. \nWhen people hear the word "pyramid," they often think of Egypt. However, the world\'s largest pyramid is in Cholula, Mexico. Tlachihualtepetl, also called the Great Pyramid of Cholula, is 217 feet (66 meters) tall and its base is 1,476 by 1,1476 feet (450 by 450 meters). Its total volume is 157 million cubic feet. Due to its size, it took centuries to build. It was worked on several different times between 300 BC and 700 AD. There was a temple above it to worship Quetzalcoatl - the Aztec god of learning and books. However, the temple was abandoned around the 8th or 9th century AD. Over time, it became covered with vegetation. As a result, it blended in and Spaniards first mistook it for a hill when they conquered Mexico in 1521. T

In [99]:
# test: the prompt is the opening sentence of the test-split chat decoded in the previous cell
chat_completion("Write one paragraph about the Tlachihualtepetl pyramid.")

['<|bos|>', '<|user|>', 'W', 'rite', 'Ġone', 'Ġparagraph', 'Ġabout', 'Ġthe', 'ĠT', 'l', 'ach', 'ih', 'ual', 't', 'ep', 'et', 'l', 'Ġpyramid', '.', '<|endofturn|>', 'Ċ', '<|assistant|>']


'Write one paragraph about the Tlachihualtepetl pyramid.\nTlachihualtepetlmin. The tlachihialtepetepetl. The t-shutlach seepetl-epet-epet-epet-epetl-epet-epet-epet-epet-epet-l-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet-epet'

In [100]:
# train: prompt presumably taken from the instruction-training chats -- TODO confirm
chat_completion("I'm trying to create a menu with different kinds of pasta. Help me come up with different types of pasta and what they are best used for.")

['<|bos|>', '<|user|>', 'I', "'m", 'Ġtrying', 'Ġto', 'Ġcreate', 'Ġa', 'Ġmenu', 'Ġwith', 'Ġdifferent', 'Ġkinds', 'Ġof', 'Ġpasta', '.', 'ĠHelp', 'Ġme', 'Ġcome', 'Ġup', 'Ġwith', 'Ġdifferent', 'Ġtypes', 'Ġof', 'Ġpasta', 'Ġand', 'Ġwhat', 'Ġthey', 'Ġare', 'Ġbest', 'Ġused', 'Ġfor', '.', '<|endofturn|>', 'Ċ', '<|assistant|>']
Reached stop token


"I'm trying to create a menu with different kinds of pasta. Help me come up with different types of pasta and what they are best used for.\nHere are a few pasta that are best used for a pasta:\n1. Cauliflower. This pasta is great for pasta and can be used to make it a pasta-together recipe.\n2. Parmesan. This pasta is great for pasta and pasta.\n3. Parmesan. Parmesan is a pasta-together pasta."

In [101]:
# train: prompt presumably taken from the instruction-training chats -- TODO confirm
chat_completion("Write me an acrostic poem about cheese.")

['<|bos|>', '<|user|>', 'W', 'rite', 'Ġme', 'Ġan', 'Ġac', 'ro', 'stic', 'Ġpoem', 'Ġabout', 'Ġcheese', '.', '<|endofturn|>', 'Ċ', '<|assistant|>']


'Write me an acrostic poem about cheese.\nOh, I love,\nOh, I love,\nOh, how can I love you?\nOh, I love you,\nOh, I love you,\nOh, how can I love you?\nOh, I love you, you love you,\nOh, how can I love you,\nOh, how I love you,\nI love you, love you,\nOh, how love you,\nOh, how love you, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you, my love,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love, I love you,\nOh, how love you, how love you,\nOh, how love you,\nOh'

In [102]:
# Short factual question; this cell carries no train/test label.
chat_completion("When did Michael Jackson pass away?")

['<|bos|>', '<|user|>', 'When', 'Ġdid', 'ĠMichael', 'ĠJackson', 'Ġpass', 'Ġaway', '?', '<|endofturn|>', 'Ċ', '<|assistant|>']
Reached stop token


"When did Michael Jackson pass away?\n Michael Jackson's first walked away from the United States on August 14, 1969. He was the second in the first place to walk away from the American States. He was the second to walk away from the country in the country and was the second to walk away from the country. He was also the second to walk away from the country in the country in the country."

I completed the available tutorials and built a minified GPT-2 clone.
I have issues with the result of the instruction training.
The resulting model can answer chat messages to some extent, but its output is not very confident.
It often answers something unrelated, or repeats the same words.
It gives a coherent answer to only about 1 in 3 messages.
With dropout, the validation loss converges at ~4.25 and the training loss at ~3.2
Without dropout, the validation loss converges at ~5 and the training loss at ~0
What causes this? Is the model too small, or was the training done badly?

The model:
- context length: 1000
- embedding dimensions: 384
- attention heads: 12
- transformer blocks: 12
- FFN width multiplier: 4
- vocabulary size: 24k
- params: ~40M

Techniques:
- weight tying
- custom initialized weights
- custom optimizer config with weight decay groups
- gradient clipping
- cuda optimizations
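- temperature: 0.1, top_k: 10 (note: the `generate` function in this notebook defaults to temperature 0.3, and `chat_completion` does not override it)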
- the chat inference stops when reaching EOS or end_of_turn tokens
- The chat inference adds the generation token

Pre-training:
- dropout: 0
- training tokens: 3B (too large)
- unique tokens: 1.5B
- learning rate schedule: peak 6e-4, min 6e-5
- batch size: 32
- BOS, EOS, end_of_text tokens
- The generated text here is pretty confident, but the repeating still appears sometimes

Instruction training:
- dropout: 0.1
- training chats: 5 * 9500
- unique chats: 9500
- learning rate: fixed, 1e-5
- batch size: 8 (smaller, because it was trained on a different GPU)
- I checked the training data, loss masking, attention masking, special tokens


Example chats:

- User: When did Michael Jackson pass away?
- Assistant: Michael Jackson's first walked away from the United States on August 14, 1969. He was the second in the first place to walk away from the American States. He was the second to walk away from the country in the country and was the second to walk away from the country. He was also the second to walk away from the country in the country in the country.

- User: Write me an acrostic poem about cheese.
- Assistant: Oh, I love,\nOh, I love,\nOh, how can I love you?\nOh, I love you,\nOh, I love you,\nOh, how can I love you?\nOh, I love you, you love you,\nOh, how can I love you,\nOh, how I love you,\nI love you, love you,\nOh, how love you,\nOh, how love you, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you, my love,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love you,\nOh, how love, I love you,\nOh, how love you, how love you,\nOh, how love you,\nOh

- User: I'm trying to create a menu with different kinds of pasta. Help me come up with different types of pasta and what they are best used for.
- Assistant: Here are a few pasta that are best used for a pasta:\n1. Cauliflower. This pasta is great for pasta and can be used to make it a pasta-together recipe.\n2. Parmesan. This pasta is great for pasta and pasta.\n3. Parmesan. Parmesan is a pasta-together pasta.