In [1]:
import gc, random
from typing import List, Tuple
from sklearn.model_selection import train_test_split
import torch
from torch.nn.utils.rnn import pad_sequence
from rouge_score import rouge_scorer
from src.data_utils import clean_string, split_x_target_by_words
from src.lstm_model import load_lstm_lm
from transformers import AutoTokenizer, AutoModelForCausalLM

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("./data/tweets_cleaned.csv")
val_test_size = 0.20
test_size = 0.50

train_texts, val_test_texts = train_test_split(list(df["cleaned_text"]), test_size=val_test_size, random_state=42)
print(f"Train texts: {len(train_texts)}, Val_Test texts: {len(val_test_texts)}")
val_texts, test_texts = train_test_split(val_test_texts, test_size=test_size, random_state=42)
print(f"Val texts: {len(val_texts)}, Test texts: {len(test_texts)}")

Train texts: 1280000, Val_Test texts: 320000
Val texts: 160000, Test texts: 160000


In [None]:
device = (
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("mps")  if torch.backends.mps.is_available() else
    torch.device("cpu")
)
print("device:", device)


def build_eval_pairs(texts, max_examples: int = 1500):
    pairs = []
    for t in texts:
        t = clean_string(t)
        if not t:
            continue
        x, y = split_x_target_by_words(t)
        if x and y:
            pairs.append((x, y))
        if len(pairs) >= max_examples:
            break
    X = [p for p,_ in pairs]
    Y = [r for _,r in pairs]
    print(f"Built {len(X)} prefix/target pairs.")
    return X, Y

device: mps


In [4]:
@torch.inference_mode()
def generate_lstm(prefixes: List[str], tokenizer, model, pad_id, eos_id, device, max_new_tokens=64):
    ids = [torch.tensor(tokenizer.encode(p, add_special_tokens=False), dtype=torch.long) for p in prefixes]
    if not ids:
        return []
    batch = pad_sequence(ids, batch_first=True, padding_value=pad_id).to(device)
    out_ids = model.generate(batch, max_new_tokens=max_new_tokens, eos_id=eos_id)
    start = batch.size(1)  # right padded
    return [tokenizer.decode(out_ids[i, start:].tolist(), skip_special_tokens=True)
            for i in range(out_ids.size(0))]

In [5]:

gpt_name = "distilbert/distilgpt2"
gpt_tok = AutoTokenizer.from_pretrained(gpt_name, padding_side="left")  # left padding for decoder
if gpt_tok.pad_token_id is None:
    gpt_tok.pad_token = gpt_tok.eos_token
gpt = AutoModelForCausalLM.from_pretrained(
    gpt_name,
    torch_dtype=(torch.float16 if device.type in {"cuda","mps"} else None),
    low_cpu_mem_usage=True
).to(device).eval()
gpt.config.pad_token_id = gpt_tok.pad_token_id


In [6]:

@torch.inference_mode()
def generate_gpt2(prefixes: List[str], max_new_tokens=64, do_sample=True, top_k=50, top_p=0.95, temperature=0.8, repetition_penalty=1.1):
    enc = gpt_tok(prefixes, return_tensors="pt", padding=True, truncation=True)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=gpt_tok.eos_token_id,
        pad_token_id=gpt_tok.pad_token_id,
        use_cache=True,
        do_sample=do_sample,
        top_k=top_k, 
        top_p=top_p, 
        temperature=temperature, 
        repetition_penalty=repetition_penalty
    )
    out = gpt.generate(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
    lens = attention_mask.sum(1)  # left padded
    return [gpt_tok.decode(out[i, int(L.item()):], skip_special_tokens=True) for i, L in enumerate(lens)]


In [7]:
scorer = rouge_scorer.RougeScorer(["rouge1","rouge2"], use_stemmer=False)

def eval_rouge_batched(prefixes, references, gen_fn, bs=16):
    r1 = r2 = n = 0
    for s in range(0, len(prefixes), bs):
        preds = gen_fn(prefixes[s:s+bs])
        for p, g in zip(preds, references[s:s+bs]):
            sc = scorer.score(g, p)
            r1 += sc["rouge1"].fmeasure
            r2 += sc["rouge2"].fmeasure
            n  += 1
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
    return {"rouge1": r1/max(n,1), "rouge2": r2/max(n,1)}


In [8]:
test_prefixes, test_refs = build_eval_pairs(test_texts, max_examples=1500)

run_dir = "models/lstm_lm-20250912-181525" 
tok_lstm, lstm_model, pad_id, eos_id = load_lstm_lm(run_dir, device)

Built 1500 prefix/target pairs.


In [9]:
lstm_gen = lambda batch: generate_lstm(batch, tokenizer=tok_lstm, model=lstm_model,
                                       pad_id=pad_id, eos_id=eos_id, device=device, max_new_tokens=64)
best_cfg = dict(do_sample=True, top_k=50, top_p=0.95, temperature=0.8, repetition_penalty=1.1)
gpt_gen  = lambda batch: generate_gpt2(batch, max_new_tokens=64, **best_cfg)

print("\n=== TEST COMPARISON (3/4 → 1/4, word-based) ===")
lstm_scores = eval_rouge_batched(test_prefixes, test_refs, lstm_gen, bs=16)
print("LSTM  : ROUGE1={:.3f} ROUGE2={:.3f}".format(lstm_scores["rouge1"], lstm_scores["rouge2"]))

gpt_scores  = eval_rouge_batched(test_prefixes, test_refs, gpt_gen, bs=16)
print("GPT-2 : ROUGE1={:.3f} ROUGE2={:.3f}".format(gpt_scores["rouge1"], gpt_scores["rouge2"]))


for i in random.sample(range(len(test_prefixes)), k=min(5, len(test_prefixes))):
    lp = lstm_gen([test_prefixes[i]])[0]
    gp = gpt_gen([test_prefixes[i]])[0]
    print(f"\n— Example {i} —")
    print("PREFIX   :", test_prefixes[i])
    print("TARGET   :", test_refs[i])
    print("LSTM     :", lp)
    print("DistilG2 :", gp)



=== TEST COMPARISON (3/4 → 1/4, word-based) ===
LSTM  : ROUGE1=0.043 ROUGE2=0.001
GPT-2 : ROUGE1=0.028 ROUGE2=0.001

— Example 163 —
PREFIX   : charlimon haha your mint come on msn if you can
TARGET   : we need a natter
LSTM     : ##t
DistilG2 :  afford it for a little bit and be honest, i am so very excited to see my new car.
This is the first example of how I will produce something that looks great in comparison with anything from their classic Mercedes E-Class model or just like any other brand which has some kind of factory build quality but lacks

— Example 1160 —
PREFIX   : zarigee i got mcdonalds surpisingly its not a popular drunk food option its sucks that
TARGET   : all diners arent 24hrs here anymore
LSTM     : ##s the best
DistilG2 :  it just wont be the first place in town. if you want to get some extra beer, no problem with this and dont even have an entire bar at home because there are only 10 people who can drink them..

Also what about all of these places? What do I m