In [None]:
!pip install transformers datasets sentence-transformers faiss-cpu scikit-learn pandas numpy nbformat

In [None]:
# Mount Google Drive at /content/drive. Later cells write model checkpoints,
# logs and the saved model under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset
# Load the "sentiment" configuration of the tweet_eval dataset; the train,
# validation and test splits of this object are used by the cells below.
dataset = load_dataset("tweet_eval", "sentiment")
# Print the label feature of the training split so its definition can be inspected.
print(dataset['train'].features['label'])

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer created here and the sequence
# classifier that a later cell loads from the same MODEL_NAME.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    The tokenizer is asked to truncate and to pad to ``max_length=128``
    (``padding='max_length'``). Returns whatever the tokenizer returns for
    ``batch['text']``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Columns exposed as torch tensors to the training / evaluation loops.
_TORCH_COLUMNS = ["input_ids", "attention_mask", "labels"]


def _prepare_split(split_name):
    """Tokenize one dataset split, rename ``label`` to ``labels`` and switch it to torch format."""
    encoded = dataset[split_name].map(preprocess, batched=True)
    encoded = encoded.rename_column("label", "labels")
    encoded.set_format("torch", columns=_TORCH_COLUMNS)
    return encoded


# Same pipeline for each of the three splits.
train_ds, val_ds, test_ds = [
    _prepare_split(split_name) for split_name in ("train", "validation", "test")
]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Store the dataset's class names in the model config. Without id2label/label2id the
# saved model (and any pipeline loaded from it) reports generic "LABEL_<id>" strings,
# which is what would otherwise be fed to the explanation prompt as the "sentiment".
# Assumes the label column is a ClassLabel feature exposing `.names`.
label_names = dataset['train'].features['label'].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(pred):
    """Score one evaluation pass.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``
    (per-class scores); the arg-max over axis 1 is used as the predicted class.

    Returns a dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
    where precision/recall/F1 use the "weighted" average.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    accuracy = accuracy_score(y_true, y_pred)

    return dict(
        accuracy=accuracy,
        precision=weighted_p,
        recall=weighted_r,
        f1=weighted_f1,
    )

import inspect

# Fine-tuning configuration: evaluate and checkpoint once per epoch, writing
# checkpoints and logs to Google Drive, and reload the best checkpoint when
# training finishes (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/sentiment_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="/content/drive/MyDrive/logs",
)

# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer=` keyword; older ones only know `tokenizer=`. Because
# the install is unpinned, pick whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
trainer.train()

In [None]:
# Run the trainer's evaluation on the tokenized test split (test_ds); the
# returned metrics are displayed as the cell output.
trainer.evaluate(test_ds)

In [None]:
# Persist the fine-tuned model and tokenizer twice: once via the trainer, and
# once more after moving the model to the CPU.
MODEL_DIR = "/content/drive/MyDrive/sentiment_model"
CPU_MODEL_DIR = "/content/drive/MyDrive/sentiment_model_cpu"

trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Move the weights to the CPU before writing the second copy.
model = model.to("cpu")
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(CPU_MODEL_DIR)

In [None]:
from transformers import pipeline

# Reload the classifier and its tokenizer from the Drive directory written by
# the save cell, wrapped in a text-classification pipeline.
clf = pipeline("text-classification", model="/content/drive/MyDrive/sentiment_model", tokenizer="/content/drive/MyDrive/sentiment_model")
# Smoke test on a single sentence.
print(clf("I love this airline, great service!"))

In [None]:
import faiss, pickle
from sentence_transformers import SentenceTransformer

# Build a FAISS L2 index over sentence embeddings of the training tweets.
# The texts are materialised as a plain list exactly once, so the encoder input,
# the in-memory passages used for retrieval and the pickled passages are all the
# same object, and row i of the index corresponds to train_texts[i].
train_texts = list(dataset['train']['text'])

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(train_texts, show_progress_bar=True, convert_to_numpy=True)

d = embeddings.shape[1]  # embedding dimensionality
index = faiss.IndexFlatL2(d)
index.add(embeddings)
# Retrieval maps index rows back to train_texts by position, so they must align.
assert index.ntotal == len(train_texts), "index rows must align 1:1 with train_texts"

# Persist the index and the passages (written to the current working directory).
faiss.write_index(index, "faiss_index.index")
with open("passages.pkl", "wb") as f:
    pickle.dump(train_texts, f)

In [None]:
def retrieve_passages(query, k=5):
    """Return up to ``k`` training passages nearest to ``query``.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to ``train_texts``
    by position.

    Args:
        query: Text to search for.
        k: Maximum number of passages to return. Values <= 0 yield ``[]``.

    Returns:
        A list of passages in the order returned by the index. It may hold
        fewer than ``k`` items when the index has fewer than ``k`` entries.
    """
    if k <= 0:
        return []
    q_emb = embedder.encode([query])
    _distances, neighbor_ids = index.search(q_emb, k)
    # FAISS pads missing neighbours with -1; indexing train_texts[-1] would
    # silently return the last passage, so drop those slots.
    return [train_texts[int(i)] for i in neighbor_ids[0] if int(i) >= 0]

In [None]:
# Smoke-test retrieval: print the passages returned for a sample query with k=3.
print(retrieve_passages("The flight was terrible", k=3))

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Seq2seq checkpoint whose tokenizer (gen_tok) and model (gen_model) are used
# by explain_sentiment to generate the explanation text.
GEN_MODEL = "google/flan-t5-small"
gen_tok = AutoTokenizer.from_pretrained(GEN_MODEL)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)

In [None]:
def explain_sentiment(text, predicted_label, k=3):
    """Generate a short explanation for a sentiment prediction.

    Retrieves ``k`` similar passages via ``retrieve_passages``, puts them in a
    prompt together with the input and the predicted label, and lets the
    module-level ``gen_model`` / ``gen_tok`` generate the explanation.

    Args:
        text: The input text that was classified.
        predicted_label: Label to present to the generator as the prediction.
        k: Number of passages to retrieve as evidence.

    Returns:
        ``{"retrieved": <list of passages>, "explanation": <generated text>}``.
    """
    retrieved = retrieve_passages(text, k=k)

    # Build the bullet list outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12. Assembling the prompt line
    # by line also keeps source indentation out of the text sent to the model.
    examples = "\n".join(f"- {passage}" for passage in retrieved)
    prompt = (
        f'Input: "{text}"\n'
        f"Predicted sentiment: {predicted_label}\n"
        "Retrieved examples:\n"
        f"{examples}\n"
        "Explain in 1-2 sentences why the prediction makes sense based on the retrieved evidence.\n"
    )

    # The prompt is truncated to 512 tokens; generation is capped at max_length=80.
    inputs = gen_tok(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = gen_model.generate(**inputs, max_length=80)
    explanation = gen_tok.decode(outputs[0], skip_special_tokens=True)
    return {"retrieved": retrieved, "explanation": explanation}

In [None]:
# End-to-end demo: classify a sample sentence, then retrieve similar training
# passages and generate an explanation for the predicted label.
sample = "The seats were uncomfortable and the service was rude."
# The pipeline returns a list of result dicts; keep the label of the first one.
pred = clf(sample)[0]['label']
rag_output = explain_sentiment(sample, pred)
print("Prediction:", pred)
print("Retrieved passages:", rag_output['retrieved'])
print("Explanation:", rag_output['explanation'])