In [None]:
!pip install matplotlib
!pip install seaborn

In [2]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datasets
import transformers
import torch.nn as nn
import pandas as pd
from peft import LoraConfig
from trl import SFTTrainer
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

In [None]:
from transformers import BitsAndBytesConfig

# Path of the local checkpoint directory used for both tokenizer and model.
base_model_name = "./Llama-3.2-3B-Instruct"

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with float16 compute and double quantization disabled.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized causal LM with attention outputs switched on.
# NOTE(review): "eager" attention is presumably selected so that attention
# weights can be returned; device_map={"": 0} presumably places the whole
# model on device index 0 - confirm both against the installed library versions.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
    attn_implementation="eager",
    output_attentions=True,
)

In [None]:
# Read the sentiment CSV into a pandas DataFrame and preview its first rows.
# NOTE(review): the file is named `sentiment_test.csv` but is bound to
# `train_df` - confirm this is the intended split.
train_df = pd.read_csv('Sentiment_Emotion/dataset/sentiment_test.csv')
train_df.head()

In [None]:
# Convert the pandas DataFrame into a Hugging Face `Dataset`.
# `from_pandas` is the constructor meant for DataFrames; `preserve_index=False`
# keeps the pandas index from being stored as a dataset column.
# The isinstance guard makes the cell safe to re-run: once `train_df` has been
# rebound to a `Dataset`, the conversion is skipped instead of failing.
if isinstance(train_df, pd.DataFrame):
    train_df = datasets.Dataset.from_pandas(train_df, preserve_index=False)
train_df

In [9]:
def formatting_prompts_func(example):
    """Build one fine-tuning prompt string per row of a batched example.

    Args:
        example: Mapping from column name to a sequence of values. Only the
            'Reviews' and 'Sentiment' columns are read, paired up row by row.

    Returns:
        list[str]: One "### Question ... ### Answer ..." prompt per row, in
        the same order as the input rows.
    """
    # The row count comes from the columns that are interpolated into the
    # prompt, so the batch does not need an unrelated 'Statement' column.
    return [
        f"### Question: Classify the sentiment of the following statement into one of the categories: positive or negative. Statement: {review} \n### Answer: {sentiment}."
        for review, sentiment in zip(example['Reviews'], example['Sentiment'])
    ]

In [8]:
from peft import LoraConfig

# LoRA adapter settings for a causal language model: rank 64, alpha 16,
# 10% adapter dropout, and no bias parameters.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Attach the adapter to the already-loaded base model.
# NOTE(review): this mutates `base_model`; re-running the cell against the same
# model object is presumably not idempotent - confirm before re-executing.
base_model.add_adapter(peft_config)

In [None]:
def get_attention_from_output(text: str, top_k=5):
    """Print the input tokens the model attends to most for its next token.

    One new token is generated for `text`, appended to the prompt, and the
    extended sequence is run through the model again with attention outputs
    enabled. The last entry of the returned attentions is averaged over its
    leading axis for the final position, and the earlier positions are ranked
    by that average.

    Args:
        text: Prompt to analyse.
        top_k: Number of highest-scoring positions to rank. The highest-ranked
            position is then discarded, so at most `top_k - 1` rows are printed.

    Returns:
        None. The ranking is written to stdout.

    Relies on the module-level `tokenizer` and `base_model`.
    NOTE(review): generation uses the model's default generation settings; if
    sampling is enabled there, the generated token (and hence the ranking) may
    differ between runs - confirm.
    """
    # Follow the model's own device instead of hard-coding "cuda".
    target_device = base_model.device
    inputs = tokenizer(text, return_token_type_ids=False, return_tensors="pt").to(target_device)

    with torch.inference_mode():
        outputs = base_model.generate(**inputs, max_new_tokens=1, output_attentions=True, return_dict_in_generate=True)

    # Keep only the newly generated token and append it to the prompt ids.
    prompt_len = inputs.input_ids.shape[1]
    generated_token = outputs.sequences[:, prompt_len:]
    full_input = torch.cat([inputs.input_ids, generated_token], dim=1)
    # ones_like already allocates on full_input's device, so no extra move is needed.
    full_inputs = {"input_ids": full_input, "attention_mask": torch.ones_like(full_input)}

    with torch.inference_mode():
        model_outputs = base_model(**full_inputs, output_attentions=True)

    # Last attention entry, first batch item.
    last_layer_attention = model_outputs.attentions[-1][0]
    # Final position's row, excluding its attention to itself (last column).
    output_token_attention = last_layer_attention[:, -1, :-1]
    # Average over the leading axis (presumably attention heads - confirm).
    avg_attention_scores = output_token_attention.mean(dim=0)

    top_indices = torch.argsort(avg_attention_scores, descending=True)[:top_k]
    top_scores = avg_attention_scores[top_indices].tolist()
    top_tokens = [
        tokenizer.convert_ids_to_tokens(full_input[0, idx].item()).replace("Ġ", " ")
        for idx in top_indices
    ]

    # Discard the highest-ranked position (presumably the beginning-of-text
    # token acting as an attention sink - confirm). Tokens and scores are
    # sliced together so every token is printed next to its own score.
    ranked = list(zip(top_tokens, top_scores))[1:]

    print(f"Top {len(ranked)} input tokens with highest attention to the generated token:")
    for token, score in ranked:
        print(f"Token: {token}, Attention Score: {score:.4f}")

# Example run: a negative review followed by its sentiment label.
text = "Can't even log in before the app crashes. Sentiment: Negative"
get_attention_from_output(text)

In [None]:
# Negative review with the label appended as the final word of the input.
text = "Can't even log in before the app crashes. Negative"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Single forward pass; no gradients are needed for inspection.
with torch.inference_mode():
    model_outputs = base_model(**inputs)

# Last attention entry, first batch item.
attentions = model_outputs.attentions
last_layer_attention = attentions[-1][0]

# Final position's attention over every earlier position, averaged over the
# leading axis (presumably attention heads - confirm).
output_token_attention = last_layer_attention[:, -1, :-1]
avg_attention_scores = output_token_attention.mean(dim=0)

# Seven highest-scoring positions and their token strings, with the "Ġ"
# marker rendered as a plain space.
top_indices = avg_attention_scores.argsort(descending=True)[:7]
top_tokens = [
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0, idx].item()).replace("Ġ", " ")
    for idx in top_indices
]

for token, score in zip(top_tokens, avg_attention_scores[top_indices]):
    print(f"Token: {token}, Attention Score: {score.item():.4f}")

In [None]:
# Scores for the ranked positions, moved to host memory for plotting.
top_scores = avg_attention_scores[top_indices].cpu().numpy()

# One-row heatmap of the ranked tokens, skipping the two highest-ranked entries.
fig, ax = plt.subplots(figsize=(5, 0.5))
sns.heatmap(
    [top_scores[2:]],
    annot=True,
    fmt=".4f",
    cmap="Blues",
    xticklabels=top_tokens[2:],
    yticklabels=["Attention"],
    cbar=False,
    linewidths=0,
    ax=ax,
)
# Saved under its own name: a later cell writes "top5_labels.pdf", which would
# otherwise silently overwrite this figure.
fig.savefig("top5_labels_negative.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Positive review with the label appended as the final word of the input.
text = "I have used a great number of apps from different angles of fitness but by far the Centr app stands alone in its variety, positivity and effectiveness. Positive"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Single forward pass; no gradients are needed for inspection.
with torch.inference_mode():
    model_outputs = base_model(**inputs)

# Last attention entry, first batch item.
attentions = model_outputs.attentions
last_layer_attention = attentions[-1][0]

# Final position's attention over every earlier position, averaged over the
# leading axis (presumably attention heads - confirm).
output_token_attention = last_layer_attention[:, -1, :-1]
avg_attention_scores = output_token_attention.mean(dim=0)

# Seven highest-scoring positions and their token strings, with the "Ġ"
# marker rendered as a plain space.
top_indices = avg_attention_scores.argsort(descending=True)[:7]
top_tokens = [
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0, idx].item()).replace("Ġ", " ")
    for idx in top_indices
]

for token, score in zip(top_tokens, avg_attention_scores[top_indices]):
    print(f"Token: {token}, Attention Score: {score.item():.4f}")

# Scores for the ranked positions, moved to host memory for plotting.
top_scores = avg_attention_scores[top_indices].cpu().numpy()

# One-row heatmap of the ranked tokens, skipping the two highest-ranked entries.
plt.figure(figsize=(5, 0.5))
sns.heatmap(
    [top_scores[2:]],
    annot=True,
    fmt=".4f",
    cmap="Blues",
    xticklabels=top_tokens[2:],
    yticklabels=["Attention"],
    cbar=False,
    linewidths=0,
)
plt.savefig("top5_labels.pdf", format="pdf", bbox_inches="tight")
plt.show()