In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the Kaggle dataset through kagglehub and keep the returned value;
# the data-loading cell further down uses it as the directory to list and read.
kingburrito666_shakespeare_plays_path = kagglehub.dataset_download('kingburrito666/shakespeare-plays')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/kingburrito666/shakespeare-plays?dataset_version_number=4...


100%|██████████| 4.55M/4.55M [00:00<00:00, 194MB/s]

Extracting files...
Data source import complete.





In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# --------------------------------------------------------------
# 1. IMPORTS
# --------------------------------------------------------------
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

# --------------------------------------------------------------
# 2. LOAD & PRE-PROCESS TEXT (use only first 500k chars)
# --------------------------------------------------------------
# Use the path returned by kagglehub.dataset_download in the first cell
data_dir = kingburrito666_shakespeare_plays_path

# os.listdir returns entries in arbitrary order, so sort the names first.
# Otherwise the concatenation order -- and with it the 500k-character subset
# and the vocabulary built from that subset -- could differ between runs or
# machines whenever more than one .txt file is present.
txt_files = sorted(fn for fn in os.listdir(data_dir) if fn.endswith('.txt'))
if not txt_files:
    # Fail here with a clear message instead of continuing with empty text
    # and a zero-size vocabulary that only breaks later.
    raise FileNotFoundError(f"No .txt files found in {data_dir!r}")

pieces = []
for fn in txt_files:
    with open(os.path.join(data_dir, fn), 'r', encoding='utf-8') as f:
        pieces.append(f.read())
# Every file's content is followed by a newline; join once instead of
# repeatedly growing a string.
text = ''.join(piece + '\n' for piece in pieces)

# ---- TAKE SUBSET ------------------------------------------------
text = text[:500_000]                     # <<< fast demo
print(f"Using {len(text):,} characters")

# Vocabulary: the sorted set of distinct characters in the subset, with
# lookup tables in both directions (char -> index, index -> char).
chars = sorted(set(text))
char_to_ix = {c: i for i, c in enumerate(chars)}
ix_to_char = dict(enumerate(chars))
vocab_size = len(chars)
print(f"Vocab size: {vocab_size}")

def text_to_tensor(s):
    """Encode a string as a 1-D ``torch.long`` tensor of vocabulary indices.

    Every character is looked up in ``char_to_ix``; a character that is not
    in that mapping raises ``KeyError``.
    """
    indices = list(map(char_to_ix.__getitem__, s))
    return torch.tensor(indices, dtype=torch.long)

# Encoded form of the whole (truncated) corpus.
data = text_to_tensor(text)

# --------------------------------------------------------------
# 3. DATASET (seq_len = 50)
# --------------------------------------------------------------
class CharDataset(Dataset):
    """Sliding-window dataset for next-character prediction.

    Item ``i`` is the pair ``(data[i:i+seq_len], data[i+1:i+seq_len+1])``:
    an input window and the same window shifted one position to the right,
    so each target is the character that follows the corresponding input.

    Args:
        data: 1-D sliceable sequence of encoded characters (a tensor in this
            notebook).
        seq_len: Number of characters in each window.
    """

    def __init__(self, data, seq_len=50):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        # A full item needs seq_len + 1 characters. Clamp at zero so data
        # that is too short gives an empty dataset rather than a negative
        # length, which makes len() raise ValueError.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, i):
        """Return the ``(input, target)`` window pair starting at index ``i``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``i`` is outside ``[-len(self), len(self))``.
        """
        n = len(self)
        if i < 0:
            i += n
        # Slicing never raises, so without this check an out-of-range index
        # would silently return truncated windows and plain iteration over
        # the dataset (which stops on IndexError) would never terminate.
        if not 0 <= i < n:
            raise IndexError(f"index out of range for CharDataset of length {n}")
        return (self.data[i:i+self.seq_len],
                self.data[i+1:i+self.seq_len+1])

# Window length shared by the dataset and by text generation.
seq_len = 50
dataset = CharDataset(data=data, seq_len=seq_len)
# Reshuffle the windows on every pass and drop the final incomplete batch.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
)

# --------------------------------------------------------------
# 4. LSTM MODEL
# --------------------------------------------------------------
class CharLSTM(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> linear.

    Args:
        vocab_size: Number of distinct characters; sets both the number of
            embedding rows and the number of output logits.
        embed_dim: Size of each character embedding.
        hidden_dim: Size of the LSTM hidden state.
        layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers. Forced to
            0 when ``layers == 1``, where there is no inter-layer connection
            to drop and PyTorch would emit a warning for a non-zero value.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=256, layers=2,
                 dropout=0.3):
        super().__init__()
        # Attribute names are kept as-is so previously saved state_dicts load.
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm  = nn.LSTM(embed_dim, hidden_dim, layers,
                             dropout=dropout if layers > 1 else 0.0,
                             batch_first=True)
        self.fc    = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-character logits for every position of ``x``.

        Args:
            x: Integer tensor of character indices, shape (B, L).
            hidden: Optional ``(h, c)`` LSTM state to continue from; ``None``
                starts from a zero state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape (B*L, V) -- batch
            and time flattened together -- and ``hidden`` is the LSTM state
            after the last position.
        """
        x = self.embed(x)                     # (B, L, E)
        out, hidden = self.lstm(x, hidden)    # (B, L, H)
        # Flatten batch and time so the result can be fed straight to
        # CrossEntropyLoss; reshape copies only when the layout requires it.
        out = out.reshape(-1, out.size(-1))
        out = self.fc(out)                    # (B*L, V)
        return out, hidden

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG before the first stochastic operation (weight
# initialisation) so that initialisation, DataLoader shuffling and sampling
# in generate() are repeatable on the same setup. Some GPU kernels may still
# be non-deterministic.
torch.manual_seed(42)
model = CharLSTM(vocab_size).to(device)

# Cross-entropy over the flattened (B*L, V) logits produced by the model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --------------------------------------------------------------
# 5. TRAINING LOOP (5 epochs, progress bar, grad clipping)
# --------------------------------------------------------------
def train(epochs=5):
    """Run ``epochs`` passes over ``dataloader``, updating ``model`` in place.

    For every batch: forward pass, cross-entropy loss against the flattened
    targets, backward pass, gradient-norm clipping at 1.0, optimizer step.
    The current batch loss is shown on the progress bar, and the mean batch
    loss is printed once each epoch finishes.
    """
    model.train()
    for ep in range(1, epochs + 1):
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f'Epoch {ep}/{epochs}')
        for inputs, targets in progress:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            logits, _ = model(inputs)
            # Logits are (B*L, V), so the targets are flattened to (B*L,).
            loss = criterion(logits, targets.view(-1))
            loss.backward()

            # Keep the global gradient norm bounded before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            progress.set_postfix(loss=batch_loss)

        avg = running_loss / len(dataloader)
        print(f"\n>>> Epoch {ep} finished – Avg loss: {avg:.4f}\n")

# Runs the training loop; train() updates the module-level `model` in place.
train(epochs=5)

# --------------------------------------------------------------
# 6. SAVE MODEL
# --------------------------------------------------------------
# Only the state_dict is written (not the module object), to a path relative
# to the current working directory.
torch.save(model.state_dict(), 'shakespeare_lstm_fast.pth')
print("Model saved → shakespeare_lstm_fast.pth")

# --------------------------------------------------------------
# 7. TEXT GENERATION (temperature sampling)
# --------------------------------------------------------------
def generate(seed: str, length: int = 300, temp: float = 0.8):
    """Sample ``length`` characters from the model, primed with ``seed``.

    The seed (at most its last ``seq_len`` characters) is run through the
    LSTM once to build the hidden state. After that only the newly sampled
    character is fed back at each step, with the hidden state carrying the
    context forward.

    Args:
        seed: Priming text. Characters missing from ``char_to_ix`` are mapped
            to index 0; an empty seed is replaced by a single space.
        length: Number of characters to generate.
        temp: Softmax temperature; lower values make sampling more
            conservative. ``temp == 0`` selects the most likely character
            at every step (greedy decoding).

    Returns:
        The generated continuation only (the seed is not included), exactly
        ``length`` characters long.

    Note:
        The module-level ``model`` is switched to eval mode and left there.
    """
    model.eval()

    # No left-padding: the LSTM accepts any sequence length, and padding the
    # seed with spaces would only add artificial context to the state.
    prompt = list(seed)[-seq_len:] or [' ']
    inp = torch.tensor([[char_to_ix.get(c, 0) for c in prompt]],
                       dtype=torch.long, device=device)

    new_chars = []
    hidden = None
    with torch.no_grad():
        for _ in range(length):
            logits, hidden = model(inp, hidden)
            # Batch size is 1, so the last row of the flattened (B*L, V)
            # logits belongs to the final time step.
            last_logits = logits[-1]
            if temp == 0:
                # Dividing by zero would give inf/nan probabilities.
                nxt = torch.argmax(last_logits).item()
            else:
                probs = torch.softmax(last_logits / temp, dim=-1)
                nxt = torch.multinomial(probs, num_samples=1).item()
            new_chars.append(ix_to_char[nxt])
            # `hidden` already encodes everything seen so far, so feed only
            # the new character; re-feeding the whole window together with
            # the carried state would process the same context twice.
            inp = torch.tensor([[nxt]], dtype=torch.long, device=device)

    # Collected separately from the seed, so length == 0 correctly yields ''.
    return ''.join(new_chars)

# --------------------------------------------------------------
# 8. TRY IT
# --------------------------------------------------------------
# Prompt used to prime the model before sampling.
seed_text = "To be, or not to be"
print("\n--- GENERATED TEXT ---")
# generate() returns only the sampled continuation, so the prompt itself is
# not part of what gets printed here.
print(generate(seed_text, length=400, temp=0.8))

Using 500,000 characters
Vocab size: 70


Epoch 1/5: 100%|██████████| 3905/3905 [01:04<00:00, 60.34it/s, loss=1.28]



>>> Epoch 1 finished – Avg loss: 1.5354



Epoch 2/5: 100%|██████████| 3905/3905 [01:03<00:00, 61.26it/s, loss=1.13]



>>> Epoch 2 finished – Avg loss: 1.2129



Epoch 3/5: 100%|██████████| 3905/3905 [01:03<00:00, 61.31it/s, loss=1.09]



>>> Epoch 3 finished – Avg loss: 1.1228



Epoch 4/5: 100%|██████████| 3905/3905 [01:03<00:00, 61.05it/s, loss=1.07]



>>> Epoch 4 finished – Avg loss: 1.0730



Epoch 5/5: 100%|██████████| 3905/3905 [01:04<00:00, 61.00it/s, loss=1.05]



>>> Epoch 5 finished – Avg loss: 1.0401

Model saved → shakespeare_lstm_fast.pth

--- GENERATED TEXT ---
hold"
"which three and digg'd you had he break in their house."
"An if thou particular unto his soul."
"And God forbid a silly steel at once,"
"The Lord Scot graceously in France,"
"And now no ear by was the truth, and tell the king."
"Therefore, to brave herreat: possession and thy"
"brother of commonwealth,"
"How shall be commanded with a scorron sweet, but instandared,"
"That we to take my lord
