In [None]:
# Cell 1: Install required libraries
!pip install transformers datasets peft torch sentence-transformers evaluate rouge_score matplotlib seaborn

# Import libraries
import torch
import numpy as np
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from evaluate import load
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from datetime import datetime

# Seed NumPy and PyTorch with the same fixed value for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Pick the GPU when CUDA reports one as available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the "stanfordnlp/sst2" dataset; on any failure report the error and
# stop the notebook with a hint about what to check.
try:
    dataset = load_dataset("stanfordnlp/sst2")
    print("Dataset loaded:", dataset)
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    raise SystemExit(
        "Dataset loading failed. Please check Hugging Face access or try 'rotten_tomatoes' dataset."
    )

# Show the first training record so the available fields can be inspected
print("Sample training data:", dataset["train"][0])

In [3]:
# Create synthetic ground-truth proposals (simplified for demo)
def create_synthetic_proposal(sentence, label):
    """Build a templated three-section proposal string for one example.

    Args:
        sentence: Text that is quoted inside the "Problem" section.
        label: Sentiment label; exactly 1 is rendered as "positive", every
            other value as "negative".

    Returns:
        A string with "Problem", "Hypothesis" and "Methodology" sections,
        one per line, separated by newlines.
    """
    if label == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"

    # (heading, body) pairs in the order they appear in the final text
    sections = (
        ("Problem", f"Analyze sentiment in short texts like: '{sentence}'"),
        ("Hypothesis", f"The {sentiment} sentiment can be detected using contextual embeddings."),
        ("Methodology", "Use a transformer-based model with fine-tuning on short-text datasets."),
    )
    return "\n".join(f"{heading}: {body}" for heading, body in sections)

# Prepare dataset with input-output pairs (fixed for batched processing)
def preprocess_data(examples):
    """Convert a batch of raw examples into prompt/target text pairs.

    Args:
        examples: Batch mapping with parallel "sentence" and "label" sequences.

    Returns:
        Dict with "input_text" (one instruction prompt per sentence) and
        "target_text" (the synthetic proposal built from each sentence/label pair).
    """
    sentences = examples["sentence"]
    labels = examples["label"]

    inputs = []
    for sentence in sentences:
        inputs.append(f"Propose a method for sentiment analysis of ambiguous short texts: {sentence}")

    targets = []
    for sentence, label in zip(sentences, labels):
        targets.append(create_synthetic_proposal(sentence, label))

    return {"input_text": inputs, "target_text": targets}

In [None]:
# Turn each raw split into (input_text, target_text) pairs and drop the
# original columns; stop the notebook if the mapping fails.
try:
    train_data = dataset["train"].map(
        preprocess_data,
        batched=True,
        remove_columns=["sentence", "label", "idx"],
    )
    validation_data = dataset["validation"].map(
        preprocess_data,
        batched=True,
        remove_columns=["sentence", "label", "idx"],
    )
except Exception as preprocess_error:
    print(f"Error in preprocessing: {preprocess_error}")
    raise SystemExit("Preprocessing failed. Check dataset structure.")

# Carve the processed training split into 80% train / 10% validation / 10% test:
# hold out 20% first, then split that held-out part in half.
train_val_data = train_data.train_test_split(seed=42, test_size=0.2)
val_test_data = train_val_data["test"].train_test_split(seed=42, test_size=0.5)

train_dataset, val_dataset, test_dataset = (
    train_val_data["train"],
    val_test_data["train"],
    val_test_data["test"],
)

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

In [None]:
model_name = "google/flan-t5-small"

# Load tokenizer and base model, moving the model to the selected device;
# any failure is reported and stops the notebook.
try:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device)
except Exception as load_error:
    print(f"Error loading model/tokenizer: {load_error}")
    raise SystemExit("Model loading failed. Check Hugging Face access.")

# LoRA adapter configuration: rank-16 adapters on the "q" and "v" modules
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters will train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs into fixed-length model features.

    Args:
        examples: Batch mapping with "input_text" and "target_text" sequences.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 128 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 256 tokens). Because the targets are padded to
        max length, padded label positions contain the tokenizer's pad id.
    """
    shared_options = {"truncation": True, "padding": "max_length"}
    model_inputs = tokenizer(examples["input_text"], max_length=128, **shared_options)
    target_encoding = tokenizer(examples["target_text"], max_length=256, **shared_options)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Tokenize all three splits the same way, dropping the raw text columns afterwards
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
from transformers import Adafactor
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import gc
import matplotlib.pyplot as plt
from transformers import DataCollatorForSeq2Seq

# Collator that turns lists of tokenized rows into tensor batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)


# relative_step=True with lr=None lets Adafactor manage its own learning rate
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)

EPOCHS = 5
train_losses = []
val_losses = []


def _compute_batch_loss(batch):
    """Run one forward pass on `batch` and return the scalar loss tensor.

    tokenize_function pads every target to a fixed length with the pad token,
    so the labels arrive here with pad ids in the padded positions. Those
    positions are replaced by -100 (the index the loss ignores) so that the
    loss reflects only real target tokens instead of being dominated by padding.
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss


for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    print(f"\n🔁 Epoch {epoch+1}/{EPOCHS}")

    for batch in tqdm(train_loader, desc="Training"):
        loss = _compute_batch_loss(batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"✅ Avg Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            val_loss += _compute_batch_loss(batch).item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    print(f"📉 Avg Validation Loss: {avg_val_loss:.4f}")

    # One collection per epoch; a full gc pass after every batch only slows training
    gc.collect()

# Plot loss curve (x-axis uses 1-based epoch numbers to match the printed log)
epoch_numbers = range(1, len(train_losses) + 1)
plt.plot(epoch_numbers, train_losses, label="Training Loss", marker='o')
plt.plot(epoch_numbers, val_losses, label="Validation Loss", marker='x')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from evaluate import load as load_metric
from sentence_transformers import util, SentenceTransformer
import numpy as np

# Metrics
rouge = load_metric("rouge")
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Targets are tokenized with max_length=256, so give generation the same budget.
# A 32-token limit is shorter than the three-section target text and truncates
# it, which deflates ROUGE and makes the "Methodology:" check below fail.
MAX_NEW_TOKENS = 256

test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)
model.eval()
generated, references = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]
        # -100 is a loss-ignore marker, not a vocabulary id; map it back to the
        # pad id so decoding cannot fail on it.
        labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
        )
        generated.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
        references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

# Release per-batch tensors once, instead of forcing a gc pass after every batch
del input_ids, attention_mask, labels, generated_ids, batch
gc.collect()

# Compute metrics
rouge_scores = rouge.compute(predictions=generated, references=references)
print("🧪 Final Test ROUGE-L:", rouge_scores["rougeL"])

# Novelty = 1 - mean pairwise cosine similarity over (at most) the first 100 predictions
sampled_preds = generated[:min(100, len(generated))]
if len(sampled_preds) > 1:
    embeddings = sentence_model.encode(sampled_preds, convert_to_tensor=True, batch_size=16, show_progress_bar=False)
    similarity = util.cos_sim(embeddings, embeddings)
    num_samples = similarity.shape[0]
    # Leave out the diagonal: each prediction's similarity to itself is always 1
    # and would bias the mean upwards (and the novelty downwards).
    off_diagonal_sum = (similarity.sum() - similarity.diagonal().sum()).item()
    cosine_scores = off_diagonal_sum / (num_samples * (num_samples - 1))
else:
    # Pairwise similarity is undefined with fewer than two predictions
    cosine_scores = float("nan")
novelty = 1 - cosine_scores

# Relevance = share of predictions that contain all three section headings
required_headings = ("Problem:", "Hypothesis:", "Methodology:")
if generated:
    relevance = sum(1 for pred in generated if all(heading in pred for heading in required_headings)) / len(generated)
else:
    relevance = 0.0

print(f"🧠 Novelty Score: {novelty:.4f}")
print(f"✅ Relevance Score: {relevance:.4f}")
