In [None]:
import torch
import torch.nn as nn
import math, time, json
import random
import pandas as pd
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer 
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from torch.cuda.amp import autocast, GradScaler
import warnings
from src.data_utils import clean_string
from src.next_token_dataset import NextTokenDataset, collate_batch
from src.lstm_model import LSTMLM
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x118e1ecd0>

## Get the data

In [None]:
# !mkdir -p data
# !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip -P data
# !unzip -n -d data data/training.1600000.processed.noemoticon.csv.zip

--2025-09-12 15:01:51--  https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/investigating-sentiment-analysis/data/training.1600000.processed.noemoticon.csv.zip
Resolving nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)... 162.243.189.2
Connecting to nyc3.digitaloceanspaces.com (nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85088192 (81M) [application/zip]
Saving to: ‘data/training.1600000.processed.noemoticon.csv.zip’


2025-09-12 15:02:04 (7.31 MB/s) - ‘data/training.1600000.processed.noemoticon.csv.zip’ saved [85088192/85088192]

Archive:  data/training.1600000.processed.noemoticon.csv.zip
  inflating: data/training.1600000.processed.noemoticon.csv  


### Data Preparation



In [3]:
df = pd.read_csv("./data/training.1600000.processed.noemoticon.csv", names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
df.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
df = df.drop(columns=['id', 'date', 'query', 'user', 'polarity'])
df.head()

Unnamed: 0,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


In [6]:
df["cleaned_text"] = df["text"].apply(clean_string)

In [7]:
df.head()

Unnamed: 0,text,cleaned_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom2y1zl awww thats a bu...
1,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,@Kenichan I dived many times for the ball. Man...,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",nationwideclass no its not behaving at all im ...


In [12]:
df.to_csv("./data/tweets_cleaned.csv")

In [3]:
df = pd.read_csv("./data/tweets_cleaned.csv")

### Data preparation
1. Clean
3. Split
2. Tokenize
4. Create Dataset 
5. Create Dataloader

In [4]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.sep_token_id

In [5]:
val_test_size = 0.20
test_size = 0.50

train_texts, val_test_texts = train_test_split(list(df["cleaned_text"]), test_size=val_test_size, random_state=42)
print(f"Train texts: {len(train_texts)}, Val_Test texts: {len(val_test_texts)}")
val_texts, test_texts = train_test_split(val_test_texts, test_size=test_size, random_state=42)
print(f"Val texts: {len(val_texts)}, Test texts: {len(test_texts)}")

Train texts: 1280000, Val_Test texts: 320000
Val texts: 160000, Test texts: 160000


In [6]:
IGNORE_INDEX = -100

In [7]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("mps")  if torch.backends.mps.is_available() else
    torch.device("cpu")
)
print("device:", device)

device: mps


In [9]:
train_ds = NextTokenDataset(texts=train_texts, tokenizer=tokenizer, eos_id=eos_id, max_len=256)
val_ds = NextTokenDataset(texts=val_test_texts, tokenizer=tokenizer, eos_id=eos_id, max_len=256) 

In [10]:
train_loader = DataLoader(
    train_ds, batch_size=256, shuffle=True,
    collate_fn=collate_batch,
    #num_workers=os.cpu_count()//2 or 2,
    #pin_memory=(device.type=="cuda"),
    #persistent_workers=True
)
val_loader = DataLoader(
    val_ds, batch_size=256, shuffle=False,
    collate_fn=collate_batch,
    #num_workers=os.cpu_count()//2 or 2,
    #pin_memory=(device.type=="cuda"),
    #persistent_workers=True
)

In [11]:
print(f'Количество батчей в train_dataloader: {len(train_loader)}')
print(f'Количество батчей в val_dataloader: {len(val_loader)}')

Количество батчей в train_dataloader: 5000
Количество батчей в val_dataloader: 1250


In [12]:
for batch in train_loader:
    print('input_ids:', batch['input_ids'].shape)
    print('lengths:', batch['lengths'].shape)
    print('labels:', batch['labels'].shape)
    break

input_ids: torch.Size([256, 41])
lengths: torch.Size([256])
labels: torch.Size([256, 41])


In [13]:
model = LSTMLM(
    vocab_size=tokenizer.vocab_size,
    emb_dim=256, hidden_dim=512, num_layers=1
)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3)

scaler = GradScaler(enabled=(device.type=="cuda"))
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

In [14]:
def step_train(batch):
    model.train()
    X, Y, L = batch["input_ids"].to(device, non_blocking=True), batch["labels"].to(device, non_blocking=True), batch["lengths"].to(device)
    optimizer.zero_grad(set_to_none=True)
    ctx = autocast(enabled=(device.type=="cuda"))
    with ctx:
        logits = model(X, L) # (B,L,V)
        B,Lm,V = logits.shape
        loss = criterion(logits.view(B*Lm, V), Y.view(B*Lm))
    scaler.scale(loss).backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
    return loss.item()

In [15]:
@torch.no_grad()
def evaluate_perplexity(val_loader):
    model.eval()
    nll, tokens = 0.0, 0
    for batch in val_loader:
        X, Y, L = batch["input_ids"].to(device), batch["labels"].to(device), batch["lengths"].to(device)
        logits = model(X, L)
        B,Lm,V = logits.shape
        logp = torch.log_softmax(logits, dim=-1)
        # gather gold log-probs where Y != IGNORE_INDEX
        mask = (Y != IGNORE_INDEX)
        gold = Y.clamp_min(0)
        ll   = logp.view(B*Lm, V).gather(1, gold.view(-1,1)).view(B,Lm)
        nll -= (ll * mask).sum().item()
        tokens += mask.sum().item()
    ppl = math.exp(nll / max(tokens,1))
    return ppl

In [16]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=False)

@torch.no_grad()
def evaluate_rouge(val_loader, max_gen=50):
    model.eval()
    totals = {"rouge1":0.0, "rouge2":0.0}
    count  = 0
    for batch in val_loader:
        X, Y, L = batch["input_ids"].to(device), batch["labels"].to(device), batch["lengths"]
        # build X: keep first 3/4 of each sequence (by true length, not padded)
        prefixes, refs = [], []
        for i, Li in enumerate(L.tolist()):
            cut = max(1, int(0.75 * Li))
            prefixes.append(X[i, :cut])
            # reference continuation (detokenize for ROUGE)
            ref_ids = X[i, cut:Li]  # gold tail (labels are shifted)
            refs.append(tokenizer.decode(ref_ids.tolist(), skip_special_tokens=True))
        prefix_batch = pad_sequence(prefixes, batch_first=True, padding_value=pad_id).to(device)
        gen_ids = model.generate(prefix_batch, max_new_tokens=max_gen, eos_id=eos_id)
        # take only the newly generated tail
        gens = []
        for i, Li in enumerate(L.tolist()):
            start = prefix_batch.shape[1]
            gens.append(tokenizer.decode(gen_ids[i, start:].tolist(), skip_special_tokens=True))
        for g,r in zip(gens, refs):
            scores = scorer.score(r, g)
            totals["rouge1"] += scores["rouge1"].fmeasure
            totals["rouge2"] += scores["rouge2"].fmeasure
            count += 1
    return {k: v/max(count,1) for k,v in totals.items()}

In [17]:
save_dir = Path("./models") / time.strftime("lstm_lm-%Y%m%d-%H%M%S")
save_dir.mkdir(parents=True, exist_ok=True)

In [None]:
best_ppl = float("inf")

for epoch in range(5):
    running = 0.0
    for batch in train_loader:
        running += step_train(batch)
    train_loss = running / max(len(train_loader),1)
    ppl = evaluate_perplexity(val_loader)
    rouge = evaluate_rouge(val_loader, max_gen=64)
    print(f"epoch {epoch} | train_loss {train_loss:.3f} | val_ppl {ppl:.1f} | rouge1 {rouge['rouge1']:.3f} | rouge2 {rouge['rouge2']:.3f}")

    if ppl < best_ppl:
        best_ppl = ppl
        torch.save({"model_state": model.state_dict(),
                    "config": {"vocab_size": tokenizer.vocab_size, "pad_id": pad_id, "eos_id": eos_id,
                               "emb_dim": model.emb.embedding_dim, "hidden_dim": model.rnn.hidden_size,
                               "num_layers": model.rnn.num_layers, "tied": True}},
                   save_dir / "model.pt")
        tokenizer.save_pretrained(save_dir)
        with open(save_dir / "meta.json", "w") as f:
            json.dump({"best_val_ppl": best_ppl, "epoch": epoch}, f, indent=2)
print("saved to:", str(save_dir))