### Create Speakleash object

In [1]:
from speakleash import Speakleash

# Speakleash client used below to fetch datasets by name. "./speakleash_data" is
# the directory argument it is given (presumably the local download/cache
# location — confirm in the speakleash documentation).
sl = Speakleash("./speakleash_data")

### Create training dataset

In [2]:
import os

# Soft cap on the number of characters written: the check happens after each
# whole document is written, so the final total may overshoot this value.
CHAR_LIMIT = 10_000_000
DATASET_NAME = "wolne_lektury_corpus" # Name of the dataset


d = sl.get(DATASET_NAME)
char_count = 0

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)

with open("data/train.txt", "w", encoding="utf-8") as f_out:
    for doc in d.data:
        # One document per write, separated by a newline. The newline itself is
        # not counted towards char_count.
        f_out.write(doc + "\n")
        char_count += len(doc)

        if char_count >= CHAR_LIMIT:
            print(f"Reached limit! Wrote {char_count} characters.")
            break


Reached limit! Wrote 10251040 characters.


### Download a Polish tokenizer and Polish embeddings

In [3]:
from transformers import AutoTokenizer, AutoModel
from copy import deepcopy

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
HERBERT_CHECKPOINT = "allegro/herbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(HERBERT_CHECKPOINT)
model = AutoModel.from_pretrained(HERBERT_CHECKPOINT)

# Only the word-embedding table is needed downstream: keep an independent copy
# of it and drop the name bound to the full model.
embedding = deepcopy(model.embeddings.word_embeddings)
del model


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Round-trip a short sample sentence through the tokenizer as a sanity check.
# encode() is called with its defaults here; the displayed result below shows
# the ids wrapped in <s> ... </s> special tokens.
text = "ALA ma kota"
tokens = tokenizer.encode(text)
decoded = tokenizer.decode(tokens)

In [5]:
# Display the sample text, its token ids and the decoded string side by side.
text, tokens, decoded

('ALA ma kota', [0, 34932, 2185, 24112, 2], '<s>ALA ma kota </s>')

In [6]:
from torch.utils.data import Dataset, DataLoader
import torch

class PolishTokenDataset(Dataset):
    """Map-style dataset of fixed-length next-token-prediction windows.

    The whole ``text`` is tokenized once (without special tokens) and cut into
    consecutive, non-overlapping windows of ``seq_len`` tokens. Item ``i`` is a
    pair ``(x, y)`` of ``torch.long`` tensors of length ``seq_len`` where ``y``
    is ``x`` shifted one position to the right, i.e. ``y[t]`` is the token that
    follows ``x[t]`` in the text.

    Args:
        text: Raw training text.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            returning a list of ints, and a ``pad_token_id`` attribute.
        seq_len: Number of input tokens per window; must be positive.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """

    def __init__(self, text: str, tokenizer: "AutoTokenizer", seq_len=128):
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

        self.tokenizer = tokenizer
        self.seq_len = seq_len

        # The full text is encoded in a single call; windows are sliced from
        # this flat token list on demand.
        self.tokens = tokenizer.encode(text, add_special_tokens=False)
        self.num_chunks = len(self.tokens) // seq_len

    def __len__(self):
        """Return the number of windows."""
        return self.num_chunks

    def __getitem__(self, idx):
        """Return the ``(inputs, targets)`` pair for window ``idx``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError`` so that plain iteration over the dataset terminates
        instead of producing endless all-padding windows.
        """
        if idx < 0:
            idx += self.num_chunks
        if not 0 <= idx < self.num_chunks:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {self.num_chunks} chunks"
            )

        # One extra token is taken so that inputs and shifted targets both
        # have seq_len elements.
        start = idx * self.seq_len
        end = start + self.seq_len + 1
        chunk = self.tokens[start:end]

        # Only the final window can come up short, and only when the token
        # count is an exact multiple of seq_len (it is then one token short).
        if len(chunk) < self.seq_len + 1:
            pad_id = self.tokenizer.pad_token_id
            if pad_id is None:
                raise ValueError("tokenizer has no pad_token_id; cannot pad the final chunk")
            chunk = chunk + [pad_id] * (self.seq_len + 1 - len(chunk))

        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

### Read training text

In [7]:
# Load the entire training corpus written earlier into memory as one string.
with open("data/train.txt", encoding="utf-8") as train_file:
    text = train_file.read()

### Create DataLoader

In [8]:
CONTEXT_LENGTH = 128  # tokens per training window (passed as seq_len)
BATCH_SIZE = 4        # windows per batch

# Tokenize the corpus once and expose it as fixed-length windows.
dataset = PolishTokenDataset(text, tokenizer, seq_len=CONTEXT_LENGTH)

# Windows are drawn in random order; loading happens in the main process.
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2466794 > 512). Running this sequence through the model will result in indexing errors


In [9]:
# Pull a single (inputs, targets) batch from the loader for inspection.
batch_x, batch_y = next(iter(loader))

In [10]:
# Look up the pretrained embedding vectors for the sampled batch of token ids;
# the bare `embedded.shape` expression displays the resulting tensor shape.
embedded = embedding(batch_x)
embedded.shape

torch.Size([4, 128, 768])

In [12]:
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional


# Model hyperparameters.
# NOTE(review): vocab_size and embed_dim are hard-coded. They presumably have to
# agree with the pretrained `embedding` handed to SimpleLSTM (the sample batch
# above came out with width 768) — confirm against the checkpoint if it changes.
vocab_size = 50_000   # number of tokens
embed_dim = 768       # embedding dimension
hidden_dim = 256      # LSTM hidden size
num_layers = 2        # number of stacked LSTM layers


class SimpleLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Size of the output vocabulary (width of the logits), and
            the number of rows of the embedding table when one is created here.
        embed_dim: Width of the token embeddings fed to the LSTM.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        embedding: Optional pre-built embedding layer. When given it is used
            as-is and frozen **in place** (``requires_grad`` is set to False on
            the caller's module); otherwise a fresh trainable ``nn.Embedding``
            is created.

    Raises:
        ValueError: If ``embedding`` exposes an ``embedding_dim`` that differs
            from ``embed_dim``.
    """

    def __init__(
            self,
            vocab_size: int,
            embed_dim: int,
            hidden_dim: int,
            num_layers: int,
            embedding: Optional[nn.Embedding] = None
        ):
        super().__init__()

        # Explicit None check: do not rely on the truthiness of a module object.
        if embedding is not None:
            # Fail fast on a width mismatch instead of hitting a shape error
            # inside the LSTM on the first forward pass.
            provided_dim = getattr(embedding, "embedding_dim", None)
            if provided_dim is not None and provided_dim != embed_dim:
                raise ValueError(
                    f"embed_dim ({embed_dim}) does not match the provided "
                    f"embedding's embedding_dim ({provided_dim})"
                )

            self.embed = embedding
            for param in self.embed.parameters():  # Freeze the embedding layer if it is passed
                param.requires_grad = False
        else:
            self.embed = nn.Embedding(vocab_size, embed_dim)

        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-token logits for every position.

        Args:
            x: Token ids of shape ``[batch, seq_len]``.
            hidden: Optional LSTM state from a previous call; ``None`` lets the
                LSTM start from its default initial state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``[batch, seq_len, vocab_size]`` and ``hidden`` is the LSTM state
            after the last position.
        """
        # x: [batch, seq_len]
        x = self.embed(x)            # [batch, seq_len, embed_dim]
        out, hidden = self.lstm(x, hidden)  # [batch, seq_len, hidden_dim]
        logits = self.fc(out)        # [batch, seq_len, vocab_size]
        return logits, hidden

In [13]:
# Build the model on top of the pretrained embedding; SimpleLSTM freezes an
# embedding that is passed in, so only the LSTM and output layer are trained.
lstm = SimpleLSTM(vocab_size, embed_dim, hidden_dim, num_layers, embedding)

In [14]:
# Number of trainable parameters (anything with requires_grad=False is excluded).
sum(p.numel() for p in lstm.parameters() if p.requires_grad)

14426960

In [15]:
def choose_device() -> str:
    """Return the name of the preferred torch device.

    Preference order: "cuda" when CUDA is available, then "mps" when the MPS
    backend is available, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm

# --- Training configuration ---
epochs = 10
learning_rate = 1e-3
weight_decay = 1e-2
grad_clip = 1.0  # only referenced by the (currently disabled) gradient clipping below
device = torch.device(choose_device())

print(f"Training on device: {device}")

lstm.to(device)
# Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.AdamW(lstm.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Disabled: optional cosine learning-rate schedule (stepped once per epoch below).
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)

for epoch in range(1, epochs + 1):
    lstm.train()
    total_loss = 0.0
    # Defined up front so the epoch summary below cannot hit an unbound name
    # when the loader yields no batches.
    avg_loss = float("nan")

    progress = tqdm(enumerate(loader), total=len(loader), desc=f"Epoch {epoch}/{epochs}")

    for i, (batch_x, batch_y) in progress:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        # The returned LSTM state is discarded: every batch starts from the
        # default initial state.
        out, _ = lstm(batch_x)

        # Flatten [batch, seq_len, vocab] logits and [batch, seq_len] targets
        # for CrossEntropyLoss
        loss = criterion(out.view(-1, out.size(-1)), batch_y.view(-1))
        loss.backward()

        # Disabled: gradient clipping
        # torch.nn.utils.clip_grad_norm_(lstm.parameters(), grad_clip)

        optimizer.step()

        # Running mean of the per-batch loss over the current epoch.
        total_loss += loss.item()
        avg_loss = total_loss / (i + 1)

        progress.set_postfix({"loss": f"{avg_loss:.4f}", "lr": optimizer.param_groups[0]["lr"]})

    # scheduler.step()

    print(f"Epoch {epoch} done | Average Loss: {avg_loss:.4f}")

# Persist the weights once, after the final epoch.
torch.save(lstm.state_dict(), "lstm_next_token_model.pt")
print("Training complete. Model saved to lstm_next_token_model.pt")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training on device: mps


Epoch 1/10: 100%|██████████| 4818/4818 [02:10<00:00, 36.79it/s, loss=6.9602, lr=0.001]


Epoch 1 done | Average Loss: 6.9602


Epoch 2/10: 100%|██████████| 4818/4818 [02:10<00:00, 36.94it/s, loss=6.1222, lr=0.001]


Epoch 2 done | Average Loss: 6.1222


Epoch 3/10: 100%|██████████| 4818/4818 [02:10<00:00, 36.96it/s, loss=5.6962, lr=0.001]


Epoch 3 done | Average Loss: 5.6962


Epoch 4/10: 100%|██████████| 4818/4818 [02:09<00:00, 37.07it/s, loss=5.3996, lr=0.001]


Epoch 4 done | Average Loss: 5.3996


Epoch 5/10: 100%|██████████| 4818/4818 [02:09<00:00, 37.16it/s, loss=5.1844, lr=0.001]


Epoch 5 done | Average Loss: 5.1844


Epoch 6/10: 100%|██████████| 4818/4818 [02:11<00:00, 36.63it/s, loss=5.0229, lr=0.001]


Epoch 6 done | Average Loss: 5.0229


Epoch 7/10: 100%|██████████| 4818/4818 [02:11<00:00, 36.74it/s, loss=4.8930, lr=0.001]


Epoch 7 done | Average Loss: 4.8930


Epoch 8/10: 100%|██████████| 4818/4818 [02:10<00:00, 37.02it/s, loss=4.7870, lr=0.001]


Epoch 8 done | Average Loss: 4.7870


Epoch 9/10: 100%|██████████| 4818/4818 [02:10<00:00, 36.97it/s, loss=4.6968, lr=0.001]


Epoch 9 done | Average Loss: 4.6968


Epoch 10/10: 100%|██████████| 4818/4818 [02:12<00:00, 36.46it/s, loss=4.6191, lr=0.001]

Epoch 10 done | Average Loss: 4.6191





In [17]:
def generate_text(model, tokenizer, prompt, max_new_tokens=20, device=None):
    """Greedily extend ``prompt`` with up to ``max_new_tokens`` predicted tokens.

    The prompt is fed through the model once; afterwards only the most recently
    predicted token is fed, together with the recurrent state returned by the
    previous call. At every step the highest-scoring token is chosen.

    Args:
        model: Callable module with signature ``model(input_ids, hidden)``
            returning ``(logits, hidden)``, logits shaped
            ``[batch, seq_len, vocab]``.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)``.
        prompt: Text to continue; must encode to at least one token.
        max_new_tokens: Number of tokens to append.
        device: Device to run on. When given, the model is moved to it. When
            ``None``, the model is left where it already lives (CPU is assumed
            for a model without parameters).

    Returns:
        The decoded prompt plus generated continuation.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if device is None:
        # Run wherever the model already is rather than guessing a device and
        # silently relocating the caller's model (e.g. away from MPS).
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        model.to(device)

    model.eval()

    # Encode prompt
    tokens = tokenizer.encode(prompt, add_special_tokens=False)
    if len(tokens) == 0:
        raise ValueError("prompt must encode to at least one token")

    input_ids = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
    generated_tokens = list(tokens)

    with torch.no_grad():
        hidden = None
        for _ in range(max_new_tokens):
            out, hidden = model(input_ids, hidden)

            # Greedy choice from the logits at the last position. Softmax is
            # monotonic, so taking argmax of the raw logits gives the same id.
            predicted_id = int(torch.argmax(out[0, -1, :]).item())
            generated_tokens.append(predicted_id)

            # Next step only needs the new token; context lives in `hidden`.
            input_ids = torch.tensor([[predicted_id]], dtype=torch.long, device=device)

    # Decode full sequence
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


Generated text: Odrzekł do nie centurion , a ja w tej chwili , gdy w głębi duszy , w którym się znajdował , w którym się znajdował
