In [1]:
import torch
import torch.nn as nn
import re
import math, os, time, json
import random
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer 
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import AutoTokenizer
from torch.cuda.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f76318e4c70>

## Get the data

In [3]:
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
!unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

--2025-09-12 18:13:39--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2025-09-12 18:13:44 (19.8 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  


### Data Preparation
1. Очистка и нормализация текста
- Привести к нижнему регистру;
- удалить ссылки, упоминания, эмодзи (по необходимости);
- заменить нестандартные символы;
- токенизировать текст.

3. Формирование обучающих примеров (X → Y)
По токенизированному датасету собираем примеры для обучения. 
Обрабатывая каждый токен, надо учиться предсказывать следующий. Поэтому таргет будет смещён на 1 токен вправо относительно исходной последовательности.
Пример: исходный текст "я собираюсь купить продукты".
X: ["я", "собираюсь", "купить"] → Y: ["собираюсь", "купить", "продукты"]
4. Разделение на трейн, валидацию и тест
Разбиваем датасет на обучающую, валидационную и тестовую выборки:
трейн: 80%,
валидация: 10%,
тест: 10%.


In [4]:
df = pd.read_csv("./data/training.1600000.processed.noemoticon.csv", names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
df.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df = df.drop(columns=['id', 'date', 'query', 'user', 'polarity'])
df.head()

Unnamed: 0,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


In [6]:
def clean_string(text):
    # приведение к нижнему регистру
    text = text.lower()
    # удаление всего, кроме латинских букв, цифр и пробелов
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # удаление дублирующихся пробелов, удаление пробелов по краям
    text = re.sub(r'\s+', ' ', text).strip()    
    return text

In [7]:
df["cleaned_text"] = df["text"].apply(clean_string)

In [8]:
df.head()

Unnamed: 0,text,cleaned_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom2y1zl awww thats a bu...
1,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,@Kenichan I dived many times for the ball. Man...,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",nationwideclass no its not behaving at all im ...


In [9]:
df.to_csv("./data/tweets_cleaned.csv")

In [3]:
df = pd.read_csv("./data/tweets_cleaned.csv")

### Data preparation

In [10]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pad_id = tokenizer.pad_token_id       
eos_id = tokenizer.sep_token_id  

In [11]:
val_test_size = 0.20
test_size = 0.50

train_texts, val_test_texts = train_test_split(list(df["cleaned_text"]), test_size=val_test_size, random_state=42)
print(f"Train texts: {len(train_texts)}, Val_Test texts: {len(val_test_texts)}")
val_texts, test_texts = train_test_split(val_test_texts, test_size=test_size, random_state=42)
print(f"Val texts: {len(val_texts)}, Test texts: {len(test_texts)}")

Train texts: 1280000, Val_Test texts: 320000
Val texts: 160000, Test texts: 160000


In [12]:
IGNORE_INDEX = -100  # совместимо с CrossEntropyLoss(ignore_index)

class NextTokenDataset(Dataset):
    def __init__(self, texts, max_len=None):
        self.texts = texts
        self.max_len = max_len

    def __len__(self): 
        return len(self.texts)
    
    def __getitem__(self, idx):
        ids = tokenizer.encode(self.texts[idx],
                               add_special_tokens=False,
                               truncation=True, max_length=self.max_len)
        
        if not ids or ids[-1] != eos_id:
            ids = ids + [eos_id]
        x = torch.tensor(ids[:-1], dtype=torch.long)
        y = torch.tensor(ids[1:],  dtype=torch.long)
        return {"input_ids": x, "labels": y}

def collate_batch(batch, train_last_quarter=True):
    xs  = [b["input_ids"] for b in batch]
    ys  = [b["labels"]    for b in batch]
    lengths = torch.tensor([len(x) for x in xs], dtype=torch.long)

    X = pad_sequence(xs, batch_first=True, padding_value=pad_id)
    Y = pad_sequence(ys, batch_first=True, padding_value=IGNORE_INDEX)

    if train_last_quarter:
        for i, L in enumerate(lengths.tolist()):
            cut = int(0.75 * L)
            if cut > 0:
                Y[i, :cut] = IGNORE_INDEX
    return {"input_ids": X, "labels": Y, "lengths": lengths}

In [13]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("mps")  if torch.backends.mps.is_available() else
    torch.device("cpu")
)
print("device:", device)

device: cuda


In [14]:

train_ds = NextTokenDataset(train_texts, max_len=256)
val_ds = NextTokenDataset(val_test_texts, max_len=256) 

train_loader = DataLoader(
    train_ds, batch_size=256, shuffle=True,
    collate_fn=collate_batch,
    #num_workers=os.cpu_count()//2 or 2,
    #pin_memory=(device.type=="cuda"),
    #persistent_workers=True
)
val_loader = DataLoader(
    val_ds, batch_size=256, shuffle=False,
    collate_fn=collate_batch,
    #num_workers=os.cpu_count()//2 or 2,
    #pin_memory=(device.type=="cuda"),
    #persistent_workers=True
)

In [15]:
print(f'Количество батчей в train_dataloader: {len(train_loader)}')
print(f'Количество батчей в val_dataloader: {len(val_loader)}')

Количество батчей в train_dataloader: 5000
Количество батчей в val_dataloader: 1250


In [16]:
for batch in train_loader:
    print('input_ids:', batch['input_ids'].shape)
    print('lengths:', batch['lengths'].shape)
    print('labels:', batch['labels'].shape)
    break

input_ids: torch.Size([256, 41])
lengths: torch.Size([256])
labels: torch.Size([256, 41])


In [17]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=1, pad_id=0):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids, lengths):
        # 1) Embed padded batch
        x = self.emb(input_ids)  # B×L×E

        # 2) Pack so RNN ignores padding
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)

        # 3) Run LSTM on packed input
        packed_out, _ = self.rnn(packed)  # still packed

        # 4) Unpack back to padded (B×L×H) to align with padded labels
        out_padded, _ = pad_packed_sequence(packed_out, batch_first=True)

        # 5) Project to vocab
        logits = self.head(out_padded)  # B×L×V
        return logits

    @torch.no_grad()
    def generate(self, prefix_ids: torch.Tensor, max_new_tokens: int, eos_id: int):
        # prefix_ids: (B, L0) padded with pad_id on right (we’ll step only from actual last token)
        self.eval()
        B, L0 = prefix_ids.shape
        # Warm-up hidden state by running the full prefix through the LSTM
        lengths = (prefix_ids != self.emb.padding_idx).sum(dim=1)
        x = self.emb(prefix_ids)
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h, c) = self.rnn(packed)  # get final hidden state for each seq
        # Now generate step-by-step
        seq = prefix_ids.clone()
        for _ in range(max_new_tokens):
            # Take last non-pad token for each sequence:
            last_tokens = []
            for i in range(B):
                li = lengths[i].item()
                last_tokens.append(seq[i, li-1:li])
            last_tokens = torch.vstack(last_tokens)  # B×1
            x1 = self.emb(last_tokens)              # B×1×E
            out, (h, c) = self.rnn(x1, (h, c))      # B×1×H
            logits = self.head(out)                 # B×1×V
            next_id = logits.argmax(dim=-1)         # B×1
            # append
            seq = torch.cat([seq, next_id], dim=1)
            lengths = lengths + (next_id != self.emb.padding_idx).squeeze(1).long()
            # early stop if all hit EOS
            if (next_id.squeeze(1) == eos_id).all():
                break
        return seq


In [18]:
model = LSTMLM(
    vocab_size=tokenizer.vocab_size,
    emb_dim=256, hidden_dim=512, num_layers=1, pad_id=pad_id
)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3)

scaler = GradScaler(enabled=(device.type=="cuda"))
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  scaler = GradScaler(enabled=(device.type=="cuda"))


In [19]:
def step_train(batch):
    model.train()
    X, Y, L = batch["input_ids"].to(device, non_blocking=True), batch["labels"].to(device, non_blocking=True), batch["lengths"].to(device)
    optimizer.zero_grad(set_to_none=True)
    ctx = autocast(enabled=(device.type=="cuda"))
    with ctx:
        logits = model(X, L)                  # (B,L,V)
        B,Lm,V = logits.shape
        loss = criterion(logits.view(B*Lm, V), Y.view(B*Lm))
    scaler.scale(loss).backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
    return loss.item()

In [20]:
@torch.no_grad()
def evaluate_perplexity(val_loader):
    model.eval()
    nll, tokens = 0.0, 0
    for batch in val_loader:
        X, Y, L = batch["input_ids"].to(device), batch["labels"].to(device), batch["lengths"].to(device)
        logits = model(X, L)
        B,Lm,V = logits.shape
        logp = torch.log_softmax(logits, dim=-1)
        # gather gold log-probs where Y != IGNORE_INDEX
        mask = (Y != IGNORE_INDEX)
        gold = Y.clamp_min(0)
        ll   = logp.view(B*Lm, V).gather(1, gold.view(-1,1)).view(B,Lm)
        nll -= (ll * mask).sum().item()
        tokens += mask.sum().item()
    ppl = math.exp(nll / max(tokens,1))
    return ppl  # Perplexity (exp of average NLL).  # :contentReference[oaicite:10]{index=10}


In [21]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=False)

@torch.no_grad()
def evaluate_rouge(val_loader, max_gen=50):
    model.eval()
    totals = {"rouge1":0.0, "rouge2":0.0}
    count  = 0
    for batch in val_loader:
        X, Y, L = batch["input_ids"].to(device), batch["labels"].to(device), batch["lengths"]
        # build prefix: keep first 3/4 of each sequence (by true length, not padded)
        prefixes, refs = [], []
        for i, Li in enumerate(L.tolist()):
            cut = max(1, int(0.75 * Li))
            prefixes.append(X[i, :cut])
            # reference continuation (detokenize for ROUGE)
            ref_ids = X[i, cut:Li]  # gold tail (labels are shifted)
            refs.append(tokenizer.decode(ref_ids.tolist(), skip_special_tokens=True))
        prefix_batch = pad_sequence(prefixes, batch_first=True, padding_value=pad_id).to(device)
        gen_ids = model.generate(prefix_batch, max_new_tokens=max_gen, eos_id=eos_id)
        # take only the newly generated tail
        gens = []
        for i, Li in enumerate(L.tolist()):
            start = prefix_batch.shape[1]
            gens.append(tokenizer.decode(gen_ids[i, start:].tolist(), skip_special_tokens=True))
        for g,r in zip(gens, refs):
            scores = scorer.score(r, g)
            totals["rouge1"] += scores["rouge1"].fmeasure
            totals["rouge2"] += scores["rouge2"].fmeasure
            count += 1
    return {k: v/max(count,1) for k,v in totals.items()}

In [22]:

# --- Training loop with validation, checkpoints ---
save_dir = Path("./models") / time.strftime("lstm_lm-%Y%m%d-%H%M%S")
save_dir.mkdir(parents=True, exist_ok=True)


In [23]:

best_ppl = float("inf")
for epoch in range(5):
    running = 0.0
    for batch in train_loader:
        running += step_train(batch)
    train_loss = running / max(len(train_loader),1)
    ppl = evaluate_perplexity(val_loader)
    rouge = evaluate_rouge(val_loader, max_gen=64)
    print(f"epoch {epoch} | train_loss {train_loss:.3f} | val_ppl {ppl:.1f} | rouge1 {rouge['rouge1']:.3f} | rouge2 {rouge['rouge2']:.3f}")

    # save best by perplexity
    if ppl < best_ppl:
        best_ppl = ppl
        torch.save({"model_state": model.state_dict(),
                    "config": {"vocab_size": tokenizer.vocab_size, "pad_id": pad_id, "eos_id": eos_id,
                               "emb_dim": model.emb.embedding_dim, "hidden_dim": model.rnn.hidden_size,
                               "num_layers": model.rnn.num_layers, "tied": True}},
                   save_dir / "model.pt")
        tokenizer.save_pretrained(save_dir)  # saves tokenizer files alongside
        with open(save_dir / "meta.json", "w") as f:
            json.dump({"best_val_ppl": best_ppl, "epoch": epoch}, f, indent=2)
print("saved to:", str(save_dir))

  ctx = autocast(enabled=(device.type=="cuda"))


epoch 0 | train_loss 4.954 | val_ppl 107.2 | rouge1 0.037 | rouge2 0.001
epoch 1 | train_loss 4.587 | val_ppl 101.0 | rouge1 0.029 | rouge2 0.001
epoch 2 | train_loss 4.560 | val_ppl 101.4 | rouge1 0.032 | rouge2 0.001
epoch 3 | train_loss 4.569 | val_ppl 101.9 | rouge1 0.035 | rouge2 0.001
epoch 4 | train_loss 4.568 | val_ppl 101.8 | rouge1 0.036 | rouge2 0.001
saved to: models/lstm_lm-20250912-181525


# Previous

In [None]:

# If you didn't keep train_step, here it is (packing handled inside model.forward):
def train_step(model, batch, optimizer):
    logits = model(batch["input_ids"], batch["lengths"])  # B×L×V
    B, L, V = logits.shape
    loss = criterion(logits.view(B*L, V), batch["labels"].view(B*L))
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # optional but helpful
    optimizer.step()
    return loss.item()


In [32]:

# ==== Tiny training loop ====
for epoch in range(5):
    model.train()
    total = 0.0
    for batch in train_loader:
        total += train_step(model, batch, optimizer)
    print(f"epoch {epoch} | train_loss {total/len(train_loader):.3f}")


KeyboardInterrupt: 

In [None]:

# ==== Optional: quick generation sanity check ====
# Build a prefix batch (right-padded with pad_id)
prefix = ["i wish", "good morning"]
enc = [tokenizer.encode(p, add_special_tokens=False) for p in prefix]
enc = [torch.tensor(e, dtype=torch.long) for e in enc]
prefix_batch = pad_sequence(enc, batch_first=True, padding_value=pad_id)

with torch.no_grad():
    out_ids = model.generate(prefix_batch, max_new_tokens=10, eos_id=eos_id)
decoded = [tokenizer.decode(row.tolist(), skip_special_tokens=True) for row in out_ids]
print(decoded)

In [20]:

# ---------- Training step ----------

criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)  # padding & masked quarters ignored

def train_step(model, batch, optimizer):
    logits = model(batch["input_ids"], batch["lengths"])      # B×L×V
    B, L, V = logits.shape
    loss = criterion(logits.view(B*L, V), batch["labels"].view(B*L))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()