In [1]:
! pip install pyarrow matplotlib sentencepiece pandas
import torch
import os
import sys
import numpy as np
import math
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
    torch.cuda.empty_cache()
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Using device: cuda


Data Import

In [2]:
# Read data from both Parquet files
train_0 = pd.read_parquet('../train/0000.parquet')
train_1 = pd.read_parquet('../train/0001.parquet')
train_2 = pd.read_parquet('../train/0002.parquet')
train_3 = pd.read_parquet('../train/0003.parquet')

# Concatenate the two DataFrames into one
train = pd.concat([train_0, train_1, train_2, train_3], ignore_index=True)
# train['text'] = train['text'].str.slice(start=0, stop=1500)

# Filter out elements with less than 2000 characters
# train = train[train['text'].str.len() >= 1500]
# Reset the index of the filtered DataFrame and drop the old index
# Select only the first 10000 rows
train = train.iloc[:10000]
train = train.reset_index(drop=True)

print(train.shape)

sp = spm.SentencePieceProcessor()
sp.Load('spm_05_text_model.model')

(10000, 1)


True

Dataset

In [3]:
class StoryDataset(Dataset):
    def __init__(self, stories,  sp, input_tokens_amount, device):
        super().__init__()
        self.stories = stories
        self.sp = sp
        self.input_tokens_amount = input_tokens_amount
        self.device = device

    def __len__(self):
        return len(self.stories)

    def __getitem__(self, idx):
        # encoded_story = [sp.PieceToId('<s>')] + self.sp.EncodeAsIds(self.stories[idx])[:(self.input_tokens_amount-1)]
        # encoded_target = self.sp.EncodeAsIds(self.stories[idx])[:(self.input_tokens_amount-1)] + [sp.PieceToId('</s>')]
        encoded_story = [sp.PieceToId('<s>')] + self.sp.EncodeAsIds(self.stories[idx])
        encoded_target = self.sp.EncodeAsIds(self.stories[idx]) + [sp.PieceToId('</s>')]
        return torch.tensor(encoded_story, dtype=torch.long, device = self.device), torch.tensor(encoded_target, dtype=torch.long, device = self.device)

Model

In [4]:
class Head(nn.Module):
    def __init__(self, embed_size, head_size, dropout, device):
        super().__init__()
        self.head_size      = head_size
        self.embed_size     = embed_size
        self.device         = device
        
        self.Key   = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size]
        self.Query = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size] 
        self.Value = nn.Linear(self.embed_size, self.head_size, bias=False, device = self.device) # Size: [embed_size x head_size] 
        
        self.Dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask = None):
        batchSize, tokens, head_size = x.shape

        key   = self.Key(x)    # Size: [batchSize x tokens x head_size]
        query = self.Query(x)  # Size: [batchSize x tokens x head_size]
        value = self.Value(x)  # Size: [batchSize x tokens x head_size]

        tril = torch.tril(torch.ones(tokens, tokens, device = self.device))                 # Size: [tokens_amount x tokens_amount]. Diagonale ones left side only.                                                                      

        # Compute Attention scores ("Affinities")
        attention = query @ key.transpose(-2, -1) * head_size**0.5                          # [Batch Size x Tokens amount x head_size] @ [Batch Size x head_size x Tokens amount] --> [Batch Size x Tokens amount x Tokens amount]
       
        if mask is not None:
            # print("mask:" , mask.shape)
            attention = attention.masked_fill(mask == 0, float(-1e9))                     # Size: [batchSize x tokens x tokens]
            # print("attention:" , attention.shape)
            # print("mask:" , mask)
            
        
        attention = attention.masked_fill(tril[:tokens, :tokens] == 0, float(-1e9))       # Size: [batchSize x tokens x tokens]
        # print("attention1:" , attention)
        attention = F.softmax(attention, dim=-1)                                            # Size: [batchSize x tokens x tokens]
        # print("attention2:" , attention)
        attention = self.Dropout(attention)
        # print("attention3:" , attention)
        
        out = attention @ value                                                             # Size: [Batch Size x Tokens Amount x head_size]
        # print("out:" , out)
        return out                                                                          # Size: [Batch Size x Tokens Amount x head_size]
    
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads, head_size, dropout, device):
        super().__init__()  
        self.num_heads  = num_heads
        self.head_size  = head_size
        self.embed_size = embed_size 
        self.device     = device
        
        self.Heads = nn.ModuleList()
        for _ in range(num_heads):
            self.Heads.append(Head(self.embed_size, self.head_size, dropout, self.device)) # ModuleList Size: [num_heads]

        self.Projection = nn.Linear(self.embed_size, self.embed_size)    # Size: [Batch Size x Tokens Amount x embed_size]
        self.Dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        multiHead = torch.cat([head(x, mask) for head in self.Heads], dim=-1)  # Size: [Batch Size x Tokens Amount x embed_size]
        projection = self.Dropout(self.Projection(multiHead))            # Size: [Batch Size x Tokens Amount x embed_size]
        return projection                                                # Size: [Batch Size x Tokens Amount x embed_size]

class FeedForward(nn.Module):
    def __init__(self, embed_size, dropout):
        super().__init__()
        self.FeedForward = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),  # Size: [Batch Size x Tokens Amount x embed_size]
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),  # Size: [Batch Size x Tokens Amount x embed_size]
            nn.Dropout(dropout)
        )
    
    def forward(self, attentions):
        return self.FeedForward(attentions)
    
class Block(nn.Module):
    def __init__(self,  embed_size, num_heads, head_size, dropout, device):
        super().__init__()
        self.embed_size = embed_size
        self.num_heads  = num_heads
        self.head_size  = head_size
        self.device     = device

        self.MultiAttentionHeads = MultiHeadAttention(self.embed_size, self.num_heads, self.head_size, dropout, self.device) # Size: [Batch Size x Tokens Amount x head_size]
        self.FeedForward         = FeedForward(self.embed_size, dropout)   # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln1                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]
        self.Ln2                 = nn.LayerNorm(self.embed_size)  # Size: [Batch Size x Tokens Amount x head_size]

    def forward(self, positionedEmbeddings, mask=None):
        attentions  = positionedEmbeddings + self.MultiAttentionHeads(self.Ln1(positionedEmbeddings), mask) # Size: [Batch Size x Tokens Amount x embed_size]. Apply MultiHead Attention Layer
        feedForward = attentions + self.FeedForward(self.Ln2(attentions))                             # Size: [Batch Size x Tokens Amount x embed_size]
        return feedForward                                                                  # Size: [Batch Size x Tokens Amount x embed_size]
        


class SimpleTransformer(nn.Module):
    def __init__(self, num_heads, embed_size, head_size, input_tokens_amount, vocab_size, dropout, device):
        super().__init__()
        self.device                 = device
        self.embed_size             = embed_size
        self.input_tokens_amount    = input_tokens_amount
        self.vocab_size             = vocab_size
        self.num_heads              = num_heads
        self.head_size              = head_size
       
        self.Embedding = torch.nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embed_size, device = self.device) # Size: [vocab_size x embed_size]

        self.Blocks = nn.ModuleList([
            Block(self.embed_size, self.num_heads, self.head_size, dropout, self.device) for _ in range(6)
        ])
        self.final_layer_norm = nn.LayerNorm(self.embed_size)
        self.LangModelHead  = nn.Linear(self.embed_size, self.vocab_size, device = self.device) # Size: [embed_size x vocab_size]

    
    def positionEncoding(self, input_tokens_amount):
        positionEncoding = torch.zeros(input_tokens_amount, self.embed_size, device = self.device)                                  # max length x embedding dimmensions equivalent to Size: [input_tokens_amount x embed_size]
        positions = torch.arange(0, input_tokens_amount, dtype=torch.float, device = self.device).unsqueeze(1)                      # Tensor [0, 1, 2,..., input_tokens_amount] -> [⋮] : rotated for each value in separate row of 1 column
        div_term = torch.exp(torch.arange(0, self.embed_size, 2, device = self.device).float() * (-math.log(10000.0) / self.embed_size)) # Tensor [0, 2, 4,..., embed_size] x (-math.log(10000.0) / self.embed_size) --> exponenta

        positionEncoding[:, 0::2] = torch.sin(positions * div_term)             # Size: [input_tokens_amount x embed_size], set the odd values (columns 1 and 3) 
        positionEncoding[:, 1::2] = torch.cos(positions * div_term)             # Size: [input_tokens_amount x embed_size], set the even values (columns 2 and 4) 
 
        return positionEncoding.unsqueeze(0)                                    # Size: [1 (for batch dim) x input_tokens_amount x embed_size]
     
    def forward(self, input):                                                   # Size: [Batch Size x Tokens Amount] - input
        batchSize, tokens    = input.shape

        # Creating a mask for padding tokens
        # padding_mask = (input != 0).unsqueeze(1).unsqueeze(2).to(device)        # `0` is the padding token id
        # padding_mask = (input != 0).unsqueeze(1).to(device)        # `0` is the padding token id

        padded_lable = (input != 0).float().to(device)
        padding_mask = padded_lable.unsqueeze(-1) @ padded_lable.unsqueeze(-2)
        
        embeddings           = self.Embedding(input)                            # Size: [Batch Size x Tokens Amount x embed_size]
        positionedEmbeddings = embeddings + self.positionEncoding(tokens)       # Size: [Batch Size x Tokens Amount x embed_size]
        
        blocks = positionedEmbeddings
        for block in self.Blocks:
            blocks = block(blocks, padding_mask)                                # Size: [Batch Size x Tokens Amount x embed_size]
        blocks = self.final_layer_norm(blocks)
        
        logits = self.LangModelHead(blocks)                                     # Size: [Batch Size x Tokens Amount x vocab_size]
        return logits                                                           # Size: [Batch Size x Tokens Amount x vocab_size]

Parametrs

In [12]:

batches = 16
learning_rate = 1e-4
dropout = 0.2
maxNewTokens = 200
input_tokens_amount = 128
vocab_size = sp.GetPieceSize()
embed_size = 512 # 512
num_heads = 8
head_size = int(embed_size / num_heads)
print("head_size: ", head_size)
if embed_size % num_heads != 0:
    print("embed_size Cannot be divided evenly by num_heads.")
    sys.exit()


head_size:  64


Modela and Dataloader

In [13]:
def collate_fn(batch):
    inputs, targets = zip(*batch)  # Unzip the batch into inputs and targets

    # Pad sequences so they are all the same length
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)

    return inputs_padded, targets_padded

In [14]:
model = SimpleTransformer(
    embed_size          = embed_size, 
    num_heads           = num_heads,
    head_size           = head_size, 
    input_tokens_amount = input_tokens_amount, 
    vocab_size          = vocab_size,
    dropout             = dropout,
    device              = device,
)
model = model.to(device)

sDataset = StoryDataset(train['text'], sp, input_tokens_amount, device)
sDataloader = DataLoader(sDataset, batch_size=batches, shuffle=True, collate_fn=collate_fn, )
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Training Loop

In [None]:
epochs = 10
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for batch_idx, (stories, targets) in enumerate(sDataloader): # stories Size: [Batch Size x Tokens Amount], targets Size: [Batch Size x Tokens Amount]
        logits  = model(stories)                      # Size: [Batch Size x Tokens Amount x Vocab Size]
        batch, tokens, vocabs = logits.shape
        logits  = logits.view(batch * tokens, vocabs) # Size: [(Batch Size * Tokens Amount) x Vocab Size]
        targets = targets.view(batch * tokens)        # Size: [(Batch Size * Tokens Amount)]
        
        # Assuming outputs from the model and labels are already obtained
        mask = targets != 0  # Assuming -1 is used for padding in labels
        loss = F.cross_entropy(logits, targets, reduction='none')
        loss = loss * mask.view(batch * tokens).float()
        loss = loss.sum() / mask.sum()
        running_loss += loss.item()
        # loss = F.cross_entropy(logits, targets)
        optimizer.zero_grad(set_to_none = True)
        loss.backward()
        optimizer.step()
        
        if batch_idx % 30 == 0:  # Print loss every 100 batches
            print(f'[{epoch + 1}, {batch_idx}] Loss: {loss.item() :.3f} Rloss: {running_loss / 30:.3f}')
            running_loss = 0.0

print("finish")



[1, 1] Loss: 3.487 Rloss: 0.116
[1, 31] Loss: 3.517 Rloss: 3.568
[1, 61] Loss: 3.422 Rloss: 3.526
[1, 91] Loss: 3.524 Rloss: 3.499
[1, 121] Loss: 3.447 Rloss: 3.558
[1, 151] Loss: 3.504 Rloss: 3.602
[1, 181] Loss: 3.427 Rloss: 3.592
[1, 211] Loss: 3.453 Rloss: 3.553
[1, 241] Loss: 3.538 Rloss: 3.551
[1, 271] Loss: 3.515 Rloss: 3.544
[1, 301] Loss: 3.733 Rloss: 3.556
[1, 331] Loss: 3.587 Rloss: 3.585
[1, 361] Loss: 3.523 Rloss: 3.570
[1, 391] Loss: 3.540 Rloss: 3.526
[1, 421] Loss: 3.510 Rloss: 3.550
[1, 451] Loss: 3.509 Rloss: 3.544
[1, 481] Loss: 3.743 Rloss: 3.540


Generate Story

In [16]:
with torch.no_grad():
    model.eval()
    for J in range(5):
        startPhrase = "Once upon a time,"
        startTokensIds = sp.EncodeAsIds(startPhrase)[:input_tokens_amount]     # [int, ..., tokens_length] 
        startTokensTensor = torch.tensor(startTokensIds, dtype=torch.long, device = device).unsqueeze(0)   # [1 x int, ..., tokens_length] 
        finalStoryTokensIds = startTokensIds
    
        for i in range(maxNewTokens):
            lastTokensInSentence = startTokensTensor[:, -input_tokens_amount:]
            genLogits = model(lastTokensInSentence)                         # Size: [Batch Size x Tokens Amount x Vocab Size]
    
            # Let's focus only on last token in sequence
            genLogits = genLogits[:, -1, :]                                 # Size: [Batch Size x Vocab Size]  
            probabilities = F.softmax(genLogits, dim=-1)                    # Size: [Batch Size x Vocab Size], Probavilities of each word from vocab
            nextIdx = torch.multinomial(probabilities, num_samples = 1)     # Size: [Batch Size x 1]
    
            # apply running index to the running sequence 
            startTokensTensor = torch.cat((startTokensTensor, nextIdx), dim=1) # Size: [Batch Size x (Tokens Amount + 1)]
            finalStoryTokensIds.append(nextIdx.item())
            
            finalStoryTokens = []
            for tokenId in finalStoryTokensIds:
                finalStoryTokens.append(sp.IdToPiece(tokenId))
            
        finalStory = ''.join(finalStoryTokens).replace('▁', ' ').strip()  # Assuming '▁' is the SentencePiece underline character
        print("finalStory #", J, ": ", finalStory, "\n\n")

    model.train()

finalStory # 0 :  Once upon a time, there was a little boy named Timmy. Timmy loved to jump outside and the game. He was only three years old boy went for the forest. One day, Timmy's mother saw a toy car and his owner that he met a loud pirate said, it. He kept playing in his family. Timmy decided to the forest. Timmy got to catch it was so he lay the colors and started to wear that he could help him feel better that he remembered what is that they went on the cliff where he could not yours!" But then, even happier and they worked together. Timmy, Timmy was happy. When they had lots of the hole. His mommy explained that he agreed. Later being so surprised and passed. The end of the bunch he was very mild golf. The lion. Timmy and fell asleep. After a beautiful picture's ok. He had a earthquake quickly ran back to clean. After a Lee came that he was a really fun that he told him tight, the rake to do 


finalStory # 1 :  Once upon a time, there was a little girl named Lily. She loved t