In [None]:
# ! pip install pyarrow  matplotlib sentencepiece pandas

In [17]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import sentencepiece as spm
import matplotlib.pyplot as plt
import multiprocessing
import time
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

if torch.backends.mps.is_available():  # Check for Apple Silicon GPU availability (requires PyTorch 1.12 or later)
    device = torch.device("mps")
elif torch.cuda.is_available():  # Check for NVIDIA GPU availability
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # Fall back to CPU

print(f"Using device: {device}")

Using device: mps


train sentencepeace

In [None]:
# allfiles = [
#     '../train/0000.parquet',
#     # '../train/0001.parquet',
#     # '../train/0002.parquet',
#     # '../train/0003.parquet',
#     # '../validate/0000.parquet',
#     ]

# text_column = 'text'

# # Initialize an empty list to store text
# all_text = []

# for file in allfiles:
#     # Read the parquet file
#     df = pd.read_parquet(file)
#     # Append the text data to the list
#     all_text.extend(df[text_column].tolist())

# # Optional: Save all text to a single file if preferred
# with open('0000_text.txt', 'w', encoding='utf-8') as f:
#     for line in all_text:
#         f.write(f"{line}\n")

In [None]:
# Train SentencePiece model directly from Python list of sentences
# spm.SentencePieceTrainer.train(input='all_text.txt', model_prefix='spm_full_text_model', vocab_size=10000, model_type='unigram')
# spm.SentencePieceTrainer.train(input='0000_text.txt', model_prefix='spm_0000_text_model', vocab_size=10000, model_type='unigram')

Read Data

In [2]:
train = pd.read_parquet('../train/0000.parquet')
# Select only the first 10000 rows
train = train.iloc[:10000]

validate = pd.read_parquet('../validate/0000.parquet')

Dataset

In [3]:
class StoryDataset(Dataset):
    def __init__(self, stories, sp_model_path):
        self.stories = stories
        # print(self.stories)
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(sp_model_path)

    def __len__(self):
        return len(self.stories)

    def __getitem__(self, idx):
        encoded_story = [11000] + self.sp.EncodeAsIds(self.stories[idx])
        encoded_target = self.sp.EncodeAsIds(self.stories[idx]) + [12000]
        return torch.tensor(encoded_story, dtype=torch.long), torch.tensor(encoded_target, dtype=torch.long)

Padding function

In [4]:
def collate_fn(batch):
    stories, targets = zip(*batch)
    
    # Padding sequences to have the same length within a batch
    padded_stories = pad_sequence(stories, batch_first=True, padding_value=13000)  # Assuming 0 is your padding ID
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=13000)  # Adjust padding_value if necessary
    
    return padded_stories, padded_targets

Model

In [5]:
Once upon a time Lilly rectangle.

SimpleTransformer(
  (embed): Embedding(13001, 256)
  (transformer_block): TransformerBlock(
    (attention): SelfAttention(
      (queries): Linear(in_features=256, out_features=256, bias=True)
      (keys): Linear(in_features=256, out_features=256, bias=True)
      (values): Linear(in_features=256, out_features=256, bias=True)
    )
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (ff): Sequential(
      (0): Linear(in_features=256, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=256, bias=True)
    )
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (fc_out): Linear(in_features=256, out_features=13001, bias=True)
)

In [6]:
def masked_cross_entropy(logits, target, mask):
    logits_flat = logits.view(-1, logits.size(-1))
    target_flat = target.view(-1)
    mask_flat = mask.view(-1)
    
    losses = F.cross_entropy(logits_flat, target_flat, reduction='none')
    masked_losses = losses * mask_flat
    return masked_losses.sum() / mask_flat.sum()

In [7]:
torch.cuda.empty_cache()
BATCH_SIZE = 32
NUM_WORKERS = 8  # Based on your system's capabilities

# Assuming `train_stories`, `train_targets` are your training data and targets
train_dataset = StoryDataset(train['text'], "spm_0000_text_model.model")
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=collate_fn
    # num_workers=NUM_WORKERS
)

for batch_idx, (stories, targets) in enumerate(train_loader):
    print(stories.shape)
    print(targets.shape)
    break

torch.Size([32, 409])
torch.Size([32, 409])


Training 

In [8]:
# Assuming you have defined a model instance called `model`
# Setting a custom starting learning rate for the Adam optimizer
learning_rate = 1e-4  # Example: A smaller learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()  # Adjust based on your specific task

EPOCHS = 1  # Number of epochs to train for

model.train()  # Set the model to training mode
for epoch in range(EPOCHS):
    for batch_idx, (stories, targets) in enumerate(train_loader):
        stories = stories.to(device)
        targets = targets.to(device)
        
        mask = (targets != 13000).float().to(device)
        
        optimizer.zero_grad()  # Zero the gradients
        output = model(stories)  # Forward pass: compute the output
        
        # print(output.shape)
        # print(targets.shape)

        # Reshape outputs and targets to fit CrossEntropyLoss expectations
        # Flatten the output and targets to pass into the custom loss function
        # output_flat = output.view(-1, output.size(-1))  # Shape: [batch_size * seq_len, vocab_size]
        # targets_flat = targets.view(-1)  # Shape: [batch_size * seq_len]
        # mask_flat = mask.view(-1)  # Shape: [batch_size * seq_len]

        # Calculate the loss using the custom masked cross-entropy function
        # loss = masked_cross_entropy(output_flat, targets_flat, mask_flat)
        loss = masked_cross_entropy(output, targets, mask)

        loss.backward()  # Backward pass: compute gradient of the loss with respect to model parameters
        optimizer.step()  # Perform a single optimization step (parameter update)
        # print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')
        # break
        if batch_idx % 10 == 0:  # Print loss every 100 batches
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')
    break
    # Validation step
    val_loss = validate(model, validation_loader, criterion)
    print(f'Epoch {epoch}, Training Loss: {loss.item()}, Validation Loss: {val_loss}')
    # Validation loop can go here
    # Remember to set model.eval() and torch.no_grad() during validation

Epoch: 0, Batch: 0, Loss: 9.628703117370605
Epoch: 0, Batch: 10, Loss: 9.276326179504395
Epoch: 0, Batch: 20, Loss: 8.86674690246582
Epoch: 0, Batch: 30, Loss: 8.418627738952637
Epoch: 0, Batch: 40, Loss: 7.980021953582764
Epoch: 0, Batch: 50, Loss: 7.483315944671631
Epoch: 0, Batch: 60, Loss: 7.119735240936279
Epoch: 0, Batch: 70, Loss: 6.808167457580566
Epoch: 0, Batch: 80, Loss: 6.437288284301758
Epoch: 0, Batch: 90, Loss: 6.384188652038574
Epoch: 0, Batch: 100, Loss: 6.164973258972168
Epoch: 0, Batch: 110, Loss: 5.97415828704834
Epoch: 0, Batch: 120, Loss: 5.946849346160889
Epoch: 0, Batch: 130, Loss: 5.830953121185303
Epoch: 0, Batch: 140, Loss: 5.768563270568848
Epoch: 0, Batch: 150, Loss: 5.748751640319824
Epoch: 0, Batch: 160, Loss: 5.648336410522461
Epoch: 0, Batch: 170, Loss: 5.648838996887207
Epoch: 0, Batch: 180, Loss: 5.565451622009277
Epoch: 0, Batch: 190, Loss: 5.538675785064697
Epoch: 0, Batch: 200, Loss: 5.475164890289307
Epoch: 0, Batch: 210, Loss: 5.485043048858643
E

Final test

In [13]:
initial_text = "Once upon a time small"
sp = spm.SentencePieceProcessor()
sp.Load("spm_0000_text_model.model")
encoded_input = sp.EncodeAsIds(initial_text)
input_tensor = torch.tensor([encoded_input], dtype=torch.long).to(device)  # Assuming batch_size=1

In [14]:
max_length = 50  # Maximum length of the generated sequence
eos_token_id = 12000  # Assuming '</s>' is your end-of-sequence token

model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for _ in range(max_length):
        output = model(input_tensor)  # Get the model's predictions
        # Get the last predicted token (logits) as the next token. Shape of output: (batch_size, sequence_length, vocab_size)
        last_token_logits = output[:, -1, :]
        predicted_token_id = torch.argmax(last_token_logits, dim=-1).unsqueeze(0)  # Choose the token with the highest probability
        # print(last_token_logits)
        # print(predicted_token_id)
        
        # Append the predicted token ID to the input (which is fed into the model in the next iteration)
        input_tensor = torch.cat((input_tensor, predicted_token_id), dim=1)
        
        # Check if the end-of-sequence token was generated
        if predicted_token_id.item() == eos_token_id:
            break

# Decode the generated sequence back to text
generated_sequence = [sp.IdToPiece(token_id) for token_id in input_tensor.squeeze().tolist()]
generated_text = ''.join(generated_sequence).replace('▁', ' ').strip()  # Assuming '▁' is the SentencePiece underline character
print(generated_text)


Once upon a time small the park. He was a time, "I. He was a time, "I. He was a time, "I. He was a time, "I. She was a time, "I. She was a time, "I
