In [None]:
!pip install datasets transformers

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer

# Load the SST-2 configuration of the GLUE benchmark. The result is indexed
# by split name further down ("train", "validation").
dataset = load_dataset("glue", "sst2")

# Tokenizer belonging to the "bert-base-uncased" checkpoint that is
# fine-tuned later in this notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of SST-2 examples with the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(..., batched=True)``;
            only its ``"sentence"`` entry is read.
        max_length: Length every sequence is padded/truncated to. Without an
            explicit value, ``padding="max_length"`` pads to the tokenizer's
            model maximum (512 for BERT), which makes every batch far larger
            than needed. 128 matches the length used by the SHAP and
            attention cells of this notebook.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split in batches, drop the raw-text and index columns the
# model cannot consume, then expose the remaining columns as torch tensors.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ["sentence", "idx"]
)
tokenized_datasets.set_format("torch")

# Save the tokenized dataset
#tokenized_datasets.save_to_disk("sst2_tokenized")

In [None]:
from torch.utils.data import DataLoader

# The tokenized DatasetDict already carries the GLUE splits; pick the one
# used for fine-tuning and the one used for held-out evaluation.
train_dataset, val_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "validation")
)

# Batch both splits; only the training batches are reshuffled.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
from transformers import BertForSequenceClassification

import torch

# Pretrained BERT encoder with a two-way (positive/negative) classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

print(f"Model loaded on: {device}")

In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation
# (and is not shipped by newer transformers releases), so use torch.optim,
# as the later fine-tuning cells of this notebook already do.
from torch.optim import AdamW
from transformers import get_scheduler

# NOTE: `criterion` is kept so that any cell referencing it still works, but
# the training loop does not use it: the model returns its own loss
# (`outputs.loss`) whenever `labels` are passed to it.
criterion = torch.nn.CrossEntropyLoss()

# Optimizer: AdamW. weight_decay=0.0 reproduces the default of the deprecated
# transformers implementation (torch.optim.AdamW defaults to 0.01).
LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.0)

# Learning rate scheduler: linear decay to zero over all optimizer steps,
# with no warm-up.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Print training parameters (the learning rate is read from the single
# constant above so the report cannot drift from the optimizer setting).
print(f"Number of training steps: {num_training_steps}")
print(f"Optimizer: AdamW, Learning rate: {LEARNING_RATE}")
print("Scheduler: Linear decay")

In [None]:
from torch.nn import functional as F
from tqdm import tqdm

# Define training parameters.
# The LR schedule was sized with `num_epochs`; reuse it so that the number of
# optimizer steps and the schedule length cannot diverge.
epochs = num_epochs
progress_bar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    model.train()  # Enable dropout for the training pass
    for batch in train_dataloader:
        # Move data to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # The dataset column is called 'label'; the model expects 'labels'
        batch["labels"] = batch.pop("label")

        # Forward pass (the model computes the loss because labels are given)
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.update(1)

    # Validation pass after every epoch
    model.eval()  # Disable dropout
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Rename 'label' to 'labels'
            batch["labels"] = batch.pop("label")

            # Forward pass
            outputs = model(**batch)
            logits = outputs.logits

            # Accumulate the summed loss so the reported value is a true
            # per-example mean even when the last batch is smaller.
            val_loss += F.cross_entropy(logits, batch["labels"], reduction="sum").item()

            # Predictions and accuracy
            preds = torch.argmax(logits, dim=-1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    print(f"Epoch {epoch + 1}: Validation Loss = {val_loss / max(total, 1)}, Accuracy = {correct / max(total, 1)}")

progress_bar.close()

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader

# Load SST-2 dataset (only the validation split is used in this cell)
dataset = load_dataset("glue", "sst2")
val_dataset = dataset["validation"]

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the validation set
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping from ``Dataset.map(..., batched=True)``; only
            ``examples["sentence"]`` is read.
        max_length: Fixed padded/truncated length. Leaving it unset makes
            ``padding="max_length"`` pad to the tokenizer's model maximum
            (512 for BERT); 128 matches the other cells of this notebook.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Convert to PyTorch tensors
tokenized_val_dataset = tokenized_val_dataset.remove_columns(["sentence", "idx"])
tokenized_val_dataset.set_format("torch")

# Create a DataLoader
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=16)

In [None]:
# Inference loop for the fine-tuned network.
# `model` is the network trained by the loop above; no `finetuned_model`
# name is defined before this cell, so referencing it raises NameError.
model.eval()  # make sure dropout is disabled regardless of earlier cells
correct = 0
total = 0

with torch.no_grad():
    for batch in val_dataloader:
        # Move data to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Rename 'label' to 'labels' to match the model's expectations
        batch["labels"] = batch.pop("label")

        # Forward pass
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Update accuracy metrics
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

# Calculate accuracy
accuracy = correct / total
print(f"Accuracy with fine-tuning: {accuracy * 100:.2f}%")

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")
val_dataset = dataset["validation"]

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the validation set
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the notebook-global ``tokenizer``.

    Args:
        examples: Batch mapping from ``Dataset.map(..., batched=True)``; only
            ``examples["sentence"]`` is read.
        max_length: Fixed padded/truncated length. Leaving it unset makes
            ``padding="max_length"`` pad to the tokenizer's model maximum
            (512 for BERT); 128 matches the other cells of this notebook.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = tokenized_val_dataset.remove_columns(["sentence", "idx"])
tokenized_val_dataset.set_format("torch")

# Create DataLoader
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=16)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained BERT with a freshly initialised classification head
# (model2 = baseline without fine-tuning)
model2 = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model2.eval()
model2.to(device)

# Evaluation function
def evaluate_model(model, dataloader, device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: ``torch.nn.Module`` called as ``model(**batch)`` whose output
            exposes ``.logits``.
        dataloader: Iterable of dict batches of tensors. The ground truth is
            read from the ``"label"`` key (renamed to ``"labels"``) or from
            an already present ``"labels"`` key.
        device: Device every tensor of a batch is moved to.

    Returns:
        Fraction of correct predictions as a float; ``0.0`` when the
        dataloader yields no examples (instead of dividing by zero).
    """
    # Evaluate with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for batch in dataloader:
                # Move data to device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Rename 'label' to 'labels' when the batch uses the dataset name
                if "label" in batch:
                    batch["labels"] = batch.pop("label")

                # Forward pass
                logits = model(**batch).logits
                predictions = torch.argmax(logits, dim=-1)

                # Update accuracy
                correct += (predictions == batch["labels"]).sum().item()
                total += batch["labels"].size(0)
    finally:
        model.train(was_training)

    # Calculate accuracy
    return correct / total if total else 0.0

# Baseline accuracy: pretrained encoder with an untrained classification head.
accuracy = evaluate_model(model=model2, dataloader=val_dataloader, device=device)
print(f"Accuracy without fine-tuning: {accuracy * 100:.2f}%")

In [None]:
!pip install shap

In [None]:
# Target directory shared by the model weights/config and the tokenizer files
save_directory = "./fine_tuned_model"

# Persist the fine-tuned model first, then the tokenizer, side by side
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved successfully in {save_directory}!")


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Location written by the save cell above
save_directory = "./fine_tuned_model"

# Restore both artifacts from disk
model = BertForSequenceClassification.from_pretrained(save_directory)
tokenizer = BertTokenizer.from_pretrained(save_directory)

# Pick the GPU when present, then place the model there in inference mode
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()

print("Fine-tuned model and tokenizer reloaded successfully!")

In [None]:
import shap
import torch
from transformers import BertTokenizer

# Inference only: place the classifier on the target device and disable dropout
model.to(device)
model.eval()

# Tokenizer used by the SHAP prediction function and masker
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define SHAP prediction function
def predict_fn(texts):
    """Return class probabilities for a batch of raw texts (SHAP model function).

    Uses the notebook-global ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of strings (SHAP may pass a numpy array of strings).

    Returns:
        ``numpy.ndarray`` with one row of softmax probabilities per text.
    """
    # Plain `str` objects only: the tokenizer expects List[str]
    batch = [str(text) for text in texts]
    tokens = tokenizer(
        batch,
        padding=True,  # pad to the longest text of this call, not always to 128
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    tokens = {key: val.to(device) for key, val in tokens.items()}

    # SHAP calls this function many times; without no_grad every call would
    # build (and keep alive) an autograd graph that is never used.
    with torch.no_grad():
        logits = model(**tokens).logits
    return torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

# Select a subset of the validation dataset for SHAP analysis.
# Slicing the rows first yields a plain list of the first 10 sentences
# without materialising the whole "sentence" column.
val_texts = dataset["validation"][:10]["sentence"]

# Ensure `val_texts` is a List[str]
assert isinstance(val_texts, list) and all(isinstance(t, str) for t in val_texts), "val_texts must be a List[str]"

# Create SHAP explainer; passing the tokenizer makes SHAP build a text masker
explainer = shap.Explainer(predict_fn, tokenizer)

# Compute SHAP values
shap_values = explainer(val_texts)

# Keep the raw values available under the original name
if hasattr(shap_values, "values"):
    shap_values_array = shap_values.values
    if getattr(shap_values_array, "ndim", None) == 2:
        shap_values_array = shap_values_array[:, :, None]  # Add dummy dimension for outputs
else:
    raise ValueError("SHAP explainer did not generate expected values format.")

# Visualize the token attributions for output index 1 (the second class).
# Text explanations hold one value matrix per sentence with a different
# number of tokens each, so the raw array cannot be sliced as a dense 3-D
# tensor or fed to the tabular summary plot; slice the Explanation object
# and use the text plot instead.
shap.plots.text(shap_values[:, :, 1])

In [None]:
# Report the runtime type of the SHAP result and, when it exposes `.values`,
# the shape of that attribute.
print("SHAP values object type:", type(shap_values))
if not hasattr(shap_values, "values"):
    print("SHAP values do not have 'values' attribute.")
else:
    print("SHAP values shape:", getattr(shap_values.values, "shape", "No shape attribute"))

In [None]:
# Function to extract attention weights from the model
def extract_attention_weights(model, inputs):
    """Run ``model`` on already tokenized ``inputs`` and return its attention maps.

    Args:
        model: Callable model accepting ``output_attentions`` and returning an
            object with an ``attentions`` attribute.
        inputs: Mapping of keyword arguments for the model (for example the
            tokenizer output moved to the right device).

    Returns:
        ``outputs.attentions`` as produced by the model.
    """
    # Attentions are only returned when explicitly requested; merging the flag
    # into the kwargs also avoids a duplicate-keyword error when the caller
    # already put `output_attentions` into `inputs`.
    call_kwargs = {**inputs, "output_attentions": True}
    with torch.no_grad():
        outputs = model(**call_kwargs)
        attention_weights = outputs.attentions
    return attention_weights

In [None]:
import random
from transformers import pipeline

import torch

# Load a pre-trained text2text model (T5 small) used as the paraphraser.
# A hard-coded device=0 fails on machines without a GPU; select the first
# GPU only when CUDA is available and fall back to the CPU (-1) otherwise.
paraphraser = pipeline(
    "text2text-generation",
    model="t5-small",
    tokenizer="t5-small",
    device=0 if torch.cuda.is_available() else -1,
)

# Function to replace words with synonyms
def synonym_replacement(sentence):
    """Replace known sentiment words of ``sentence`` with a random synonym.

    Matching ignores case and trailing punctuation, so ``"great."`` or
    ``"Great"`` are replaced as well; the punctuation and an initial capital
    are carried over to the replacement.

    Args:
        sentence: Input text; it is split on whitespace.

    Returns:
        The sentence with every known word replaced, joined by single spaces.
    """
    synonyms = {
        "great": ["excellent", "fantastic", "wonderful"],
        "terrible": ["horrible", "dreadful", "awful"],
        "amazing": ["incredible", "phenomenal", "astonishing"],
        "awful": ["bad", "terrible", "dreadful"],
    }
    trailing_punctuation = ".,!?;:"
    words = sentence.split()
    for i, word in enumerate(words):
        core = word.rstrip(trailing_punctuation)
        if not core:
            continue  # token consists of punctuation only
        suffix = word[len(core):]
        key = core.lower()
        if key in synonyms:
            replacement = random.choice(synonyms[key])
            if core[0].isupper():
                replacement = replacement.capitalize()
            words[i] = replacement + suffix
    return " ".join(words)

# Function to delete random words
def word_deletion(sentence):
    """Drop one randomly chosen word from ``sentence``.

    Sentences with fewer than two words are returned without a deletion.
    The result is always re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) <= 1:
        return " ".join(tokens)
    victim = random.randint(0, len(tokens) - 1)
    tokens.pop(victim)
    return " ".join(tokens)

# Function to swap adjacent words
def word_swapping(sentence):
    """Swap one randomly chosen pair of adjacent words in ``sentence``.

    Sentences with fewer than two words are returned without a swap.
    The result is always re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) < 2:
        return " ".join(tokens)
    left = random.randint(0, len(tokens) - 2)
    right = left + 1
    tokens[left], tokens[right] = tokens[right], tokens[left]
    return " ".join(tokens)

# Function to introduce typos
def introduce_typos(sentence):
    """Introduce a single character-level typo into ``sentence``.

    One random position is picked, and the character there is either
    duplicated or removed (chosen at random). An empty string is returned
    unchanged.
    """
    chars = list(sentence)
    if not chars:
        return "".join(chars)
    position = random.randint(0, len(chars) - 1)
    typo_kind = random.choice(["duplicate", "missing"])
    if typo_kind == "missing":
        chars.pop(position)  # Remove a character
    elif typo_kind == "duplicate":
        chars.insert(position, chars[position])  # Duplicate a character
    return "".join(chars)

# Function to reorder phrases
def reorder_phrases(sentence):
    """Shuffle the comma-separated phrases of ``sentence`` at random.

    The text is split on ``","`` and re-joined with ``","``, so a sentence
    without a comma comes back unchanged.
    """
    separator = ","
    phrases = sentence.split(separator)
    random.shuffle(phrases)
    return separator.join(phrases)

# Function to add noise (punctuation variations)
def add_noise(sentence):
    """Append one random punctuation mark to a randomly chosen word.

    Sentences with fewer than two words get no noise. The result is always
    re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) <= 1:
        return " ".join(tokens)
    target = random.randint(0, len(tokens) - 1)
    tokens[target] += random.choice([".", ",", "!", "?"])
    return " ".join(tokens)

# Function to generate robust perturbations
def robust_perturbation_final(sentence):
    """Generate a list of distinct perturbed variants of ``sentence``.

    Combines the lexical, noise and structural helpers of this notebook, a
    generated rewrite from the global ``paraphraser`` pipeline and a
    "movie" -> "film" substitution.

    Args:
        sentence: The original text.

    Returns:
        List of unique, non-empty variants that differ from ``sentence`` by
        more than letter case, in the order in which they were generated
        (a ``set`` would make the order vary between runs).
    """
    perturbed = []

    # Lexical-level perturbations
    perturbed.append(synonym_replacement(sentence))
    perturbed.append(word_deletion(sentence))
    perturbed.append(word_swapping(sentence))

    # Noise injection
    perturbed.append(introduce_typos(sentence))
    perturbed.append(add_noise(sentence))

    # Structural changes
    reordered = reorder_phrases(sentence)
    if reordered != sentence:
        perturbed.append(reordered)

    # Paraphrasing with language control.
    # NOTE(review): the sentence is passed without any task prefix; confirm
    # that the pipeline model really produces paraphrases for such input.
    try:
        paraphrased = paraphraser(sentence, max_length=128, num_return_sequences=1)
        paraphrased_text = paraphrased[0]['generated_text']
        if paraphrased_text.isascii():  # Heuristic filter for English output
            perturbed.append(paraphrased_text)
    except Exception as e:
        # Best effort: a failing paraphraser must not lose the other variants
        print(f"Paraphrasing Error: {e}")

    # Contextual testing
    if "movie" in sentence:
        perturbed.append(sentence.replace("movie", "film"))

    # Filter meaningful, unique variations; a dict keeps insertion order
    original_lower = sentence.lower()
    unique_variants = {}
    for candidate in perturbed:
        if len(candidate.strip()) == 0:
            continue
        if candidate.lower() == original_lower:  # also covers candidate == sentence
            continue
        unique_variants.setdefault(candidate, None)
    return list(unique_variants)

In [None]:
# Smoke-test the perturbation generator on one example sentence.
# The helpers draw from the `random` module and no seed is set in the cells
# shown here, so the printed variants can differ between runs.
sentence = "The movie was amazing and well-directed."
perturbations = robust_perturbation_final(sentence)
print("Original:", sentence)
print("Perturbations:", perturbations)

In [None]:
# Function to extract attention weights
def extract_attention_weights(model, tokenizer, sentence):
    """Tokenize ``sentence`` and return the attention maps ``model`` produces.

    The sentence is padded/truncated to 128 tokens, so padding positions are
    part of the returned maps.

    Args:
        model: ``torch.nn.Module`` accepting ``output_attentions=True`` and
            returning an object with an ``attentions`` attribute.
        tokenizer: Callable returning a mapping of tensors for ``sentence``.
        sentence: Text to analyse.

    Returns:
        ``outputs.attentions`` as produced by the model.
    """
    # Take the device from the model itself instead of relying on whichever
    # notebook cell last assigned a global `device`.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    inputs = {key: val.to(model_device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
        attention_weights = outputs.attentions

    return attention_weights

In [None]:
from scipy.spatial.distance import cosine
import numpy as np

# Function to compute similarity between attention maps
def compute_attention_similarity(att1, att2):
    """Mean cosine similarity between two sets of attention maps.

    Both arguments are indexed as ``att[layer][0, head, token]``; the layer,
    head and token ranges are taken from ``att1``. For every such row the
    cosine similarity with the matching row of ``att2`` is computed.

    Returns:
        The average of all row similarities across layers, heads and tokens.
    """
    row_scores = [
        1 - cosine(
            reference[0, head, token_idx].cpu().numpy(),
            att2[layer_idx][0, head, token_idx].cpu().numpy(),
        )
        for layer_idx, reference in enumerate(att1)
        for head in range(reference.shape[1])
        for token_idx in range(reference.shape[2])
    ]
    return np.mean(row_scores)

In [None]:
# Reference sentence for the attention-consistency (SAC) comparison below
original_sentence = "The movie was amazing and well-directed."

# Generate perturbed variants of the reference sentence. This overwrites the
# global `perturbations` produced by the earlier smoke-test cell.
perturbations = robust_perturbation_final(original_sentence)
print(f"Original Sentence: {original_sentence}")
print(f"Generated Perturbations: {perturbations}")

In [None]:
# Extract attention for the original sentence (reference for all comparisons)
original_attention = extract_attention_weights(model, tokenizer, original_sentence)

# Compare each perturbation against the reference: one SAC score (mean
# cosine similarity of attention rows) per perturbed sentence
sac_scores = []
for perturbed_sentence in perturbations:
    perturbed_attention = extract_attention_weights(model, tokenizer, perturbed_sentence)
    score = compute_attention_similarity(original_attention, perturbed_attention)
    sac_scores.append(score)
    print(f"Perturbed Sentence: {perturbed_sentence} | SAC Score: {score:.4f}")

# Final SAC score (average across all perturbations)
# NOTE(review): with an empty `perturbations` list this is the mean of an
# empty list — confirm that case cannot occur or guard against it.
final_sac_score = np.mean(sac_scores)
print(f"Final SAC Score: {final_sac_score:.4f}")

In [None]:
# Function to extract attention maps layer-wise
def extract_layer_attention_weights(model, tokenizer, sentence):
    """Return the attention maps ``model`` produces for ``sentence``.

    The sentence is padded/truncated to 128 tokens, so padding positions are
    part of the returned maps.

    Args:
        model: ``torch.nn.Module`` accepting ``output_attentions=True`` and
            returning an object with an ``attentions`` attribute.
        tokenizer: Callable returning a mapping of tensors for ``sentence``.
        sentence: Text to analyse.

    Returns:
        ``outputs.attentions`` as produced by the model; the layer-wise
        similarity code indexes it as ``attentions[layer]``.
    """
    # Use the device the model actually lives on rather than the notebook
    # global `device`, which several cells reassign.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    inputs = {key: val.to(model_device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
        attention_weights = outputs.attentions

    return attention_weights  # One attention map per layer

In [None]:
# Function to compute layer-wise similarity
def compute_layer_wise_similarity(att1, att2):
    """Per-layer mean cosine similarity between two sets of attention maps.

    Both arguments are indexed as ``att[layer][0, head, token]``; the layer,
    head and token ranges are taken from ``att1``.

    Returns:
        A list with one averaged similarity per layer of ``att1``.
    """

    def _row_scores(layer_idx):
        # Cosine similarity of every (head, token) attention row of one layer
        reference = att1[layer_idx]
        return [
            1 - cosine(
                reference[0, head, token_idx].cpu().numpy(),
                att2[layer_idx][0, head, token_idx].cpu().numpy(),
            )
            for head in range(reference.shape[1])
            for token_idx in range(reference.shape[2])
        ]

    return [np.mean(_row_scores(layer_idx)) for layer_idx in range(len(att1))]

In [None]:
# Original sentence
original_sentence = "The movie was amazing and well-directed."

# Generate dynamic perturbations
perturbations = robust_perturbation_final(original_sentence)

# Extract attention for the original sentence. Use the same `model` as for
# the perturbed sentences below: no `fine_tuned_model` name is defined
# before this cell, so referencing it raises NameError.
original_attention = extract_layer_attention_weights(model, tokenizer, original_sentence)

# Initialize layer-wise scores
layer_wise_sac = np.zeros(len(original_attention))  # One score per layer

# Compare each perturbation
for perturbed_sentence in perturbations:
    perturbed_attention = extract_layer_attention_weights(model, tokenizer, perturbed_sentence)
    layer_scores = compute_layer_wise_similarity(original_attention, perturbed_attention)
    layer_wise_sac += np.array(layer_scores)

# Average SAC score per layer across all perturbations
layer_wise_sac /= len(perturbations)

# Print Layer-Wise SAC Results (layers are reported 1-indexed)
for layer_idx, score in enumerate(layer_wise_sac):
    print(f"Layer {layer_idx + 1}: SAC Score = {score:.4f}")

# Optionally visualize: one point per encoder layer, x axis 1-indexed to
# match the printed "Layer N" lines above
import matplotlib.pyplot as plt

plt.plot(range(1, len(layer_wise_sac) + 1), layer_wise_sac, marker='o')
plt.title("Layer-Wise SAC Scores")
plt.xlabel("Layer")
plt.ylabel("SAC Score")
plt.grid()
plt.show()

In [None]:
# Sentences to test
# Two example reviews used for the multi-sentence SAC analysis
sentences = [
    "This film was beautifully directed and emotionally engaging.",
    "The plot was predictable and the acting was uninspired."
]

# Generate perturbations for each sentence: maps original sentence -> list
# of perturbed variants
sentence_perturbations = {sentence: robust_perturbation_final(sentence) for sentence in sentences}

# Print perturbations for validation.
# Note: the loop variables rebind the globals `sentence` and `perturbations`
# used by earlier cells.
for sentence, perturbations in sentence_perturbations.items():
    print(f"Original Sentence: {sentence}")
    print(f"Perturbations: {perturbations}\n")

In [None]:
# Layer-wise SAC analysis for multiple sentences
# Maps each original sentence to its array of per-layer SAC scores
all_layer_sac = {}

for original_sentence, perturbations in sentence_perturbations.items():
    print(f"Analyzing sentence: {original_sentence}")

    # Extract attention for the original sentence
    original_attention = extract_layer_attention_weights(model, tokenizer, original_sentence)

    # Initialize layer-wise scores
    layer_wise_sac = np.zeros(len(original_attention))  # One score per layer

    # Compare each perturbation. The perturbed sentences must go through the
    # same `model` as the original one; no `finetuned_model` name is defined
    # before this cell, so referencing it raises NameError.
    for perturbed_sentence in perturbations:
        perturbed_attention = extract_layer_attention_weights(model, tokenizer, perturbed_sentence)
        layer_scores = compute_layer_wise_similarity(original_attention, perturbed_attention)
        layer_wise_sac += np.array(layer_scores)

    # Average SAC score per layer across all perturbations
    layer_wise_sac /= len(perturbations)
    all_layer_sac[original_sentence] = layer_wise_sac

# Print results (layers are reported 1-indexed)
for sentence, layer_sac in all_layer_sac.items():
    print(f"\nSentence: {sentence}")
    for layer_idx, score in enumerate(layer_sac):
        print(f"Layer {layer_idx + 1}: SAC Score = {score:.4f}")

In [None]:
import matplotlib.pyplot as plt

# One curve per analysed sentence; the legend shows the first 50 characters
# of the sentence followed by an ellipsis.
plt.figure(figsize=(10, 6))
for sentence, layer_sac in all_layer_sac.items():
    layer_numbers = range(1, len(layer_sac) + 1)
    plt.plot(layer_numbers, layer_sac, marker='o', label=f"{sentence[:50]}...")

plt.title("Layer-Wise SAC Scores Across Sentences")
plt.xlabel("Layer")
plt.ylabel("SAC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Freeze specific layers
def freeze_layers(model, layers_to_freeze):
    """Disable gradients for the parameters of selected encoder layers.

    A parameter belongs to layer ``i`` when its name contains
    ``"encoder.layer.<i>."``; parameters without that marker are treated as
    layer ``None``.

    Args:
        model: Object providing ``named_parameters()``.
        layers_to_freeze: Container of 0-based layer indices to freeze.
    """
    marker = "encoder.layer."
    for name, param in model.named_parameters():
        if marker in name:
            # The integer right after the marker is the layer index
            layer_number = int(name.split(marker)[1].split(".")[0])
        else:
            layer_number = None

        if layer_number in layers_to_freeze:
            param.requires_grad = False

# Fine-tune Layers 4–9 and freeze Layers 1–3 and 10–12.
# Layer numbers here are 1-indexed (as printed by the SAC cells), while
# `encoder.layer.<i>` parameter names are 0-indexed, so layers 10–12
# correspond to indices 9, 10 and 11; index 9 was missing from the list.
weak_layers = [0, 1, 2, 9, 10, 11]  # Layers to freeze (1–3 and 10–12)
freeze_layers(model, weak_layers)

In [None]:
from tqdm import tqdm
from torch.optim import AdamW
from transformers import get_scheduler

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_dataloader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Fine-tuning loop with tqdm progress bar
model.train()
for epoch in range(3):  # 3 epochs
    # Create a progress bar for each epoch
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        if 'label' in batch:  # Rename 'label' to 'labels'
            batch['labels'] = batch.pop('label')

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Advance the linear decay after every optimizer step; without this
        # call the scheduler is built but the learning rate never changes.
        lr_scheduler.step()

        # Update progress bar with loss
        progress_bar.set_postfix({"loss": loss.item()})

    # `loss` is the loss of the last batch of the epoch
    print(f"Epoch {epoch + 1} completed. Loss = {loss.item()}")

# Save the fine-tuned model as model3
model.save_pretrained("./model3")
tokenizer.save_pretrained("./model3")

print("SAC-driven fine-tuned model saved successfully as model3!")

In [None]:
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter environments
from torch.optim import AdamW
from transformers import get_scheduler

# NOTE(review): this cell repeats the fine-tuning of the previous cell with
# a notebook progress bar. Running both trains `model` for 3 + 3 epochs and
# overwrites ./model3 — confirm that only one of the two should be kept.

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_dataloader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Fine-tuning loop with tqdm
model.train()
for epoch in range(3):  # 3 epochs
    print(f"Starting Epoch {epoch + 1}")
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", dynamic_ncols=True)

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        if 'label' in batch:
            batch['labels'] = batch.pop('label')

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Advance the linear decay after every optimizer step; without this
        # call the scheduler is built but the learning rate never changes.
        lr_scheduler.step()

        # Update progress bar with loss
        progress_bar.set_postfix({"loss": loss.item()})

    # `loss` is the loss of the last batch of the epoch
    print(f"Epoch {epoch + 1} completed. Loss = {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./model3")
tokenizer.save_pretrained("./model3")

print("SAC-driven fine-tuned model saved successfully as model3!")

In [None]:
# Reload model3 for evaluation
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the fine-tuned model3 and its tokenizer from the directory written by
# the fine-tuning cell. This rebinds the global `tokenizer`.
model3 = BertForSequenceClassification.from_pretrained("./model3")
tokenizer = BertTokenizer.from_pretrained("./model3")

# Move to GPU if available and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3.to(device)
model3.eval()

print("model3 loaded successfully for SAC evaluation!")

In [None]:
# Recompute SAC scores for model3.
# Maps each original sentence to its array of per-layer SAC scores.
all_layer_sac_model3 = {}

for original_sentence, perturbations in sentence_perturbations.items():
    print(f"Analyzing sentence: {original_sentence} with model3")
    
    # Extract attention for the original sentence
    original_attention_model3 = extract_layer_attention_weights(model3, tokenizer, original_sentence)
    
    # Initialize layer-wise SAC scores
    layer_wise_sac_model3 = np.zeros(len(original_attention_model3))  # One score per layer

    # Compare each perturbation and accumulate the per-layer similarities
    for perturbed_sentence in perturbations:
        perturbed_attention_model3 = extract_layer_attention_weights(model3, tokenizer, perturbed_sentence)
        layer_scores_model3 = compute_layer_wise_similarity(original_attention_model3, perturbed_attention_model3)
        layer_wise_sac_model3 += np.array(layer_scores_model3)
    
    # Average SAC score per layer across all perturbations
    # NOTE(review): divides by len(perturbations); confirm the list is never
    # empty for any sentence.
    layer_wise_sac_model3 /= len(perturbations)
    all_layer_sac_model3[original_sentence] = layer_wise_sac_model3

# Print results (layers are reported 1-indexed)
for sentence, layer_sac in all_layer_sac_model3.items():
    print(f"\nSentence: {sentence}")
    for layer_idx, score in enumerate(layer_sac):
        print(f"Layer {layer_idx + 1}: SAC Score = {score:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Plot SAC scores for each sentence (before and after fine-tuning).
# For every sentence two curves are drawn: 'o' markers for the scores in
# `all_layer_sac`, 'x' markers for the scores in `all_layer_sac_model3`.
# Both dictionaries must contain the same sentence keys.
plt.figure(figsize=(10, 6))
for sentence, sac_original in all_layer_sac.items():
    sac_model3 = all_layer_sac_model3[sentence]
    plt.plot(range(1, len(sac_original) + 1), sac_original, marker='o', label=f"Original {sentence[:50]}...")
    plt.plot(range(1, len(sac_model3) + 1), sac_model3, marker='x', label=f"Model3 {sentence[:50]}...")

plt.title("Layer-Wise SAC Scores (Original vs Fine-Tuned)")
plt.xlabel("Layer")
plt.ylabel("SAC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
from transformers import BertConfig, BertForSequenceClassification

# Load model3's configuration and set both dropout probabilities.
# NOTE(review): 0.1 is presumably already the value stored in the model3
# config (the stock BERT default) — confirm that this really changes anything.
config = BertConfig.from_pretrained("./model3")
config.hidden_dropout_prob = 0.1
config.attention_probs_dropout_prob = 0.1

# Load the model3 weights with the modified dropout configuration
model3_with_dropout = BertForSequenceClassification.from_pretrained("./model3", config=config)
model3_with_dropout.to(device)

# Save the variant that was built with the modified config. Saving `model3`
# here would write the unmodified model under the variant's name.
model3_with_dropout.save_pretrained("./model3_variant_dropout")
tokenizer.save_pretrained("./model3_variant_dropout")

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Directory written by the dropout-variant cell above
variant_path = "./model3_variant_dropout"

# Load the model and tokenizer. This rebinds the global `tokenizer`.
model3_variant_dropout = BertForSequenceClassification.from_pretrained(variant_path)
tokenizer = BertTokenizer.from_pretrained(variant_path)

# Move the model to the appropriate device (GPU if available) and switch to
# inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3_variant_dropout.to(device)
model3_variant_dropout.eval()

print("model3_variant_dropout loaded successfully!")

In [None]:
# Maps each original sentence to its per-layer SAC scores for the dropout
# variant, mirroring `all_layer_sac` / `all_layer_sac_model3`, so the results
# stay available after the loop instead of only being printed.
all_layer_sac_variant_dropout = {}

for original_sentence, perturbations in sentence_perturbations.items():
    print(f"Analyzing sentence: {original_sentence} with model3_variant_dropout")

    # Extract attention weights for the original sentence
    original_attention = extract_layer_attention_weights(model3_variant_dropout, tokenizer, original_sentence)

    layer_wise_sac = np.zeros(len(original_attention))  # Initialize SAC scores for each layer

    # Compare each perturbation
    for perturbed_sentence in perturbations:
        perturbed_attention = extract_layer_attention_weights(model3_variant_dropout, tokenizer, perturbed_sentence)
        layer_scores = compute_layer_wise_similarity(original_attention, perturbed_attention)
        layer_wise_sac += np.array(layer_scores)

    # Average SAC scores across all perturbations
    layer_wise_sac /= len(perturbations)
    all_layer_sac_variant_dropout[original_sentence] = layer_wise_sac
    print(f"Layer-wise SAC scores for model3_variant_dropout: {layer_wise_sac}")

In [None]:
import matplotlib.pyplot as plt

# Plot SAC scores for each sentence (before and after fine-tuning).
# NOTE(review): this cell reads only `all_layer_sac` and
# `all_layer_sac_model3`, exactly like the earlier comparison plot; it does
# not show the dropout-variant scores computed in the preceding cell —
# confirm whether the variant was meant to be plotted here.
plt.figure(figsize=(10, 6))
for sentence, sac_original in all_layer_sac.items():
    sac_model3 = all_layer_sac_model3[sentence]
    plt.plot(range(1, len(sac_original) + 1), sac_original, marker='o', label=f"Original {sentence[:50]}...")
    plt.plot(range(1, len(sac_model3) + 1), sac_model3, marker='x', label=f"Model3 {sentence[:50]}...")

plt.title("Layer-Wise SAC Scores (Original vs Fine-Tuned)")
plt.xlabel("Layer")
plt.ylabel("SAC Score")
plt.legend()
plt.grid()
plt.show()

In [None]:
from transformers.models.bert.modeling_bert import BertLayer

class CustomBertLayer(BertLayer):
    """``BertLayer`` that applies a ReLU to the hidden states of mid-layers.

    Args:
        config: Model configuration used to build the layer (the parent
            constructor expects a config, not an existing layer).
        layer_index: 0-based position of the layer inside
            ``encoder.layer``. The ReLU is applied only for indices 4–9.
    """

    RELU_LAYER_INDICES = range(4, 10)  # Mid-layers

    def __init__(self, config, layer_index=None):
        super().__init__(config)
        self.layer_index = layer_index

    def forward(self, hidden_states, *args, **kwargs):
        layer_output = super().forward(hidden_states, *args, **kwargs)
        if self.layer_index not in self.RELU_LAYER_INDICES:
            return layer_output
        # The parent may return a tuple whose first element is the hidden
        # state (followed e.g. by attention maps); rectify only that tensor.
        if isinstance(layer_output, tuple):
            return (torch.nn.functional.relu(layer_output[0]),) + tuple(layer_output[1:])
        return torch.nn.functional.relu(layer_output)

# Replace the mid-layers in model3, carrying over the trained weights, the
# device and the train/eval mode of each layer that is swapped out
for idx in range(4, 10):
    pretrained_layer = model3.bert.encoder.layer[idx]
    custom_layer = CustomBertLayer(model3.config, layer_index=idx)
    custom_layer.load_state_dict(pretrained_layer.state_dict())
    custom_layer.to(next(pretrained_layer.parameters()).device)
    custom_layer.train(pretrained_layer.training)
    model3.bert.encoder.layer[idx] = custom_layer

# NOTE(review): save_pretrained stores weights and config only; reloading
# this directory with from_pretrained presumably rebuilds plain BertLayer
# modules without the ReLU — confirm before relying on the saved variant.
model3.save_pretrained("./model3_variant_relu")
tokenizer.save_pretrained("./model3_variant_relu")

In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer

# Load the fine-tuned model3 and tokenizer from disk, asking the model to
# return attention maps and hidden states. This rebinds the global `model3`
# (replacing the instance modified in the previous cell) and `tokenizer`.
model3 = BertForSequenceClassification.from_pretrained("./model3", output_attentions=True, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained("./model3")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model3.to(device)

class EnhancedBertModel(nn.Module):
    """Wrapper adding extra heads and mid-layer utilities to a base BERT model.

    The wrapped ``base_model`` is called with ``output_attentions=True`` and
    ``output_hidden_states=True``. On top of it the wrapper owns a fresh
    two-way task head, a two-way auxiliary head, six learnable scaling
    factors and a weight for the attention regularisation term.
    """

    def __init__(self, base_model):
        super().__init__()
        hidden_size = base_model.config.hidden_size

        self.base_model = base_model
        self.dropout = nn.Dropout(0.1)
        # Two output units: binary classification is assumed
        self.task_head = nn.Linear(hidden_size, 2)

        # One learnable scale per entry of hidden_states[4:10]
        self.scaling_factors = nn.Parameter(torch.ones(6))

        # Auxiliary head for intermediate supervision
        self.auxiliary_head = nn.Linear(hidden_size, 2)

        # Weight of the layer-wise attention regularisation term
        self.attention_reg_weight = 0.1

    def forward(self, input_ids, attention_mask, token_type_ids=None, auxiliary_labels=None):
        """Run the base model and compute task logits plus auxiliary outputs.

        Returns:
            Tuple ``(task_output, aux_loss, attentions, mid_layer_outputs)``.
            ``aux_loss`` is ``None`` unless ``auxiliary_labels`` is given.
        """
        encoder_out = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True,
            output_hidden_states=True
        )
        hidden_states = encoder_out.hidden_states
        attentions = encoder_out.attentions

        # Enhanced residuals: scaled hidden state plus the preceding hidden
        # state, for every entry of hidden_states[4:10]
        mid_layer_outputs = [
            self.scaling_factors[offset] * hidden_states[4 + offset] + hidden_states[3 + offset]
            for offset in range(len(hidden_states[4:10]))
        ]

        # The representation at sequence position 0 ([CLS]) of the last
        # hidden state feeds the classification heads
        cls_representation = hidden_states[-1][:, 0, :]
        task_output = self.task_head(self.dropout(cls_representation))

        # Auxiliary supervision loss (only when labels are provided)
        if auxiliary_labels is None:
            aux_loss = None
        else:
            aux_logits = self.auxiliary_head(cls_representation)
            aux_loss = nn.functional.cross_entropy(aux_logits, auxiliary_labels)

        return task_output, aux_loss, attentions, mid_layer_outputs


    def compute_attention_regularization(self, attentions):
        """Penalise differences between adjacent attention maps.

        Sums the mean squared difference of ``attentions[i]`` and
        ``attentions[i + 1]`` for ``i`` in 4..8 and scales the sum by
        ``attention_reg_weight``.
        """
        reg_loss = 0.0
        for lower, upper in zip(range(4, 9), range(5, 10)):
            reg_loss += torch.mean((attentions[lower] - attentions[upper]) ** 2)
        return self.attention_reg_weight * reg_loss

# Wrap model3 in the enhanced architecture. The wrapper's task and auxiliary
# heads are newly created nn.Linear layers (see EnhancedBertModel.__init__).
enhanced_model3 = EnhancedBertModel(model3)
enhanced_model3.to(device)

# Helper functions
def extract_layer_attention_weights(model, tokenizer, sentence):
    """Return the attention maps the wrapped model produces for one sentence.

    Args:
        model: callable that accepts the tokenizer output as keyword arguments
            and returns a tuple whose element at index 2 holds the attentions.
        tokenizer: callable returning an encoding that supports ``.to(device)``.
        sentence: text to encode.

    Returns:
        ``outputs[2]`` of the model call (the attention weights).

    Note:
        Uses the notebook-level ``device`` variable.
    """
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    # Inference only: without no_grad every returned attention map would keep
    # the full autograd graph of the forward pass alive.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs[2]  # Attention weights (index 2 in the tuple)

def compute_layer_wise_similarity(original_attention, perturbed_attention):
    """
    Compute one cosine similarity per layer between two sets of attention maps.

    Args:
        original_attention: iterable of tensors, one per layer, each with a
            leading batch axis of size 1 followed by (heads, seq, seq).
        perturbed_attention: same structure for the perturbed input; its
            sequence length may differ.

    Returns:
        List with one similarity value per layer pair. Maps of different size
        are truncated to the shorter sequence length before comparison.
    """
    layer_scores = []
    for orig, pert in zip(original_attention, perturbed_attention):
        # Remove only the batch axis. A bare squeeze() would also drop a
        # single-head or length-1 sequence axis and break the slicing below.
        orig = orig.squeeze(0).detach().cpu().numpy()
        pert = pert.squeeze(0).detach().cpu().numpy()

        # Align the two maps on the shorter sequence length.
        min_len = min(orig.shape[-1], pert.shape[-1])
        orig_flat = orig[..., :min_len, :min_len].ravel()
        pert_flat = pert[..., :min_len, :min_len].ravel()

        # Cosine similarity; the epsilon guards against all-zero maps.
        sim = np.dot(orig_flat, pert_flat) / (
            np.linalg.norm(orig_flat) * np.linalg.norm(pert_flat) + 1e-8
        )
        layer_scores.append(sim)
    return layer_scores

# Evaluate SAC scores
sentence_perturbations = {
    "The cat is on the mat.": [
        "The cat is over the mat.",
        "The cat's on the mat!",
        "The mat is under the cat.",
    ],
    # Add more sentence-perturbation pairs as needed
}

print("Evaluating SAC scores with Enhanced Model...")
enhanced_model3.eval()

for original_sentence, perturbations in sentence_perturbations.items():
    print(f"Analyzing sentence: {original_sentence}")

    # Attention maps of the unperturbed sentence are the reference.
    original_attention = extract_layer_attention_weights(enhanced_model3, tokenizer, original_sentence)

    # Running per-layer total of similarities, one contribution per perturbation.
    layer_wise_sac = np.zeros(len(original_attention))
    for perturbed_sentence in perturbations:
        perturbed_attention = extract_layer_attention_weights(enhanced_model3, tokenizer, perturbed_sentence)
        layer_scores = compute_layer_wise_similarity(original_attention, perturbed_attention)
        layer_wise_sac = layer_wise_sac + np.array(layer_scores)

    # Turn the totals into the mean over all perturbations.
    layer_wise_sac = layer_wise_sac / len(perturbations)
    print(f"Layer-wise SAC scores for Enhanced Model: {layer_wise_sac}")

In [None]:
# Inference loop for accuracy evaluation
correct = 0
total = 0

enhanced_model3.eval()  # Set the model to evaluation mode

with torch.no_grad():
    for batch in val_dataloader:
        # Move data to the appropriate device (GPU or CPU)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Rename 'label' to 'labels' if necessary
        if "label" in batch:
            batch["labels"] = batch.pop("label")

        # Forward pass through the enhanced wrapper that was put in eval mode
        # above. The wrapper's forward returns the 4-tuple unpacked here; the
        # bare model3 does not, so calling it made the unpacking fail.
        task_output, _, _, _ = enhanced_model3(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=batch.get("token_type_ids", None),  # Handle token_type_ids if present
        )

        # Get predictions from logits
        predictions = torch.argmax(task_output, dim=-1)

        # Update accuracy metrics
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

# Calculate accuracy
accuracy = correct / total
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer

# Select the compute device first: GPU if available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model3 (configured to return attentions and hidden
# states) together with its tokenizer, then move the model to the device.
model3 = BertForSequenceClassification.from_pretrained(
    "./model3",
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = BertTokenizer.from_pretrained("./model3")
model3.to(device)

class EnhancedBertModel(nn.Module):
    """Classifier head on top of a weighted blend of lower, mid and upper hidden states.

    The wrapped ``base_model`` is called with attentions and hidden states
    enabled; its own classification logits are not used.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.1)
        # Two output logits: binary classification is assumed.
        self.task_head = nn.Linear(base_model.config.hidden_size, 2)

        # Mixing coefficients for the (lower, mid, upper) representations,
        # equal at initialisation.
        self.cross_layer_weights = nn.Parameter(torch.tensor([0.33, 0.33, 0.33]))

        # One learnable scale per mid layer (Layers 4-9); forward() in this
        # class does not read them.
        self.scaling_factors = nn.Parameter(torch.ones(6))

        # Multiplier used by compute_attention_regularization().
        self.regularization_weight = 0.05

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return ``(task_output, attentions, hidden_states)`` for a batch.

        ``task_output`` are the logits computed from position 0 ([CLS]) of the
        blended representation; the other two items come straight from the
        base model.
        """
        base_out = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True,
            output_hidden_states=True,
        )
        hidden_states = base_out.hidden_states
        attentions = base_out.attentions

        # Three views of the network: one lower state, the average of
        # hidden_states[4:10], and the final state.
        lower_output = hidden_states[3]
        mid_output = torch.stack(hidden_states[4:10]).mean(dim=0)
        upper_output = hidden_states[-1]

        mix = self.cross_layer_weights
        cross_output = mix[0] * lower_output + mix[1] * mid_output + mix[2] * upper_output

        cls_representation = cross_output[:, 0, :]
        task_output = self.task_head(self.dropout(cls_representation))
        return task_output, attentions, hidden_states

    def compute_attention_regularization(self, attentions):
        """
        Penalise differences between neighbouring attention maps.

        Sums the mean squared difference of ``attentions[i]`` and
        ``attentions[i + 1]`` for i = 5, 6, 7 and scales the total by
        ``self.regularization_weight``.
        """
        adjacent_gaps = (
            torch.mean((attentions[idx] - attentions[idx + 1]) ** 2)
            for idx in range(5, 8)
        )
        return self.regularization_weight * sum(adjacent_gaps, 0.0)

# Wrap model3 in the enhanced architecture and place it on the active device
# (nn.Module.to returns the module itself, so the call can be chained).
enhanced_model3 = EnhancedBertModel(model3).to(device)

# Helper functions
def extract_layer_attention_weights(model, tokenizer, sentence):
    """Return the attention maps the wrapped model produces for one sentence.

    Args:
        model: callable that accepts the tokenizer output as keyword arguments
            and returns a tuple whose element at index 1 holds the attentions.
        tokenizer: callable returning an encoding that supports ``.to(device)``.
        sentence: text to encode.

    Returns:
        ``outputs[1]`` of the model call (the attention weights).

    Note:
        Uses the notebook-level ``device`` variable.
    """
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    # Inference only: without no_grad every returned attention map would keep
    # the full autograd graph of the forward pass alive.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs[1]  # Attention weights (index 1 in the tuple)

def compute_layer_wise_similarity(original_attention, perturbed_attention):
    """
    Compute one cosine similarity per layer between two sets of attention maps.

    Args:
        original_attention: iterable of tensors, one per layer, each with a
            leading batch axis of size 1 followed by (heads, seq, seq).
        perturbed_attention: same structure for the perturbed input; its
            sequence length may differ.

    Returns:
        List with one similarity value per layer pair. Maps of different size
        are truncated to the shorter sequence length before comparison.
    """
    layer_scores = []
    for orig, pert in zip(original_attention, perturbed_attention):
        # Remove only the batch axis. A bare squeeze() would also drop a
        # single-head or length-1 sequence axis and break the slicing below.
        orig = orig.squeeze(0).detach().cpu().numpy()
        pert = pert.squeeze(0).detach().cpu().numpy()

        # Align the two maps on the shorter sequence length.
        min_len = min(orig.shape[-1], pert.shape[-1])
        orig_flat = orig[..., :min_len, :min_len].ravel()
        pert_flat = pert[..., :min_len, :min_len].ravel()

        # Cosine similarity; the epsilon guards against all-zero maps.
        sim = np.dot(orig_flat, pert_flat) / (
            np.linalg.norm(orig_flat) * np.linalg.norm(pert_flat) + 1e-8
        )
        layer_scores.append(sim)
    return layer_scores

# Evaluate SAC scores
sentence_perturbations = {
    "The cat is on the mat.": [
        "The cat is over the mat.",
        "The cat's on the mat!",
        "The mat is under the cat.",
    ],
    # Add more sentence-perturbation pairs as needed
}

print("Evaluating SAC scores with Enhanced Model...")
enhanced_model3.eval()

for original_sentence, perturbations in sentence_perturbations.items():
    print(f"Analyzing sentence: {original_sentence}")

    # Attention maps of the unperturbed sentence are the reference.
    original_attention = extract_layer_attention_weights(enhanced_model3, tokenizer, original_sentence)

    # Running per-layer total of similarities, one contribution per perturbation.
    layer_wise_sac = np.zeros(len(original_attention))
    for perturbed_sentence in perturbations:
        perturbed_attention = extract_layer_attention_weights(enhanced_model3, tokenizer, perturbed_sentence)
        layer_scores = compute_layer_wise_similarity(original_attention, perturbed_attention)
        layer_wise_sac = layer_wise_sac + np.array(layer_scores)

    # Turn the totals into the mean over all perturbations.
    layer_wise_sac = layer_wise_sac / len(perturbations)
    print(f"Layer-wise SAC scores for Enhanced Model: {layer_wise_sac}")

In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader

# Select the compute device: GPU if available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model3 (configured to return attentions and hidden states) and its
# tokenizer. The model is not moved to the device here.
model3 = BertForSequenceClassification.from_pretrained(
    "./model3",
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = BertTokenizer.from_pretrained("./model3")

class Enhanced2BertModel(nn.Module):
    """Blended-layer classifier with an optional auxiliary supervision head.

    The wrapped ``base_model`` is called with attentions and hidden states
    enabled; a weighted mix of lower, mid and upper hidden states feeds both
    the task head and the auxiliary head.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.1)
        hidden_size = base_model.config.hidden_size
        self.task_head = nn.Linear(hidden_size, 2)  # Task-specific head
        self.auxiliary_head = nn.Linear(hidden_size, 2)  # Auxiliary head

        # Mixing coefficients for (lower, mid, upper); mid layers weigh most.
        self.cross_layer_weights = nn.Parameter(torch.tensor([0.2, 0.6, 0.2]))
        # Learnable per-mid-layer scales; forward() in this class does not read them.
        self.scaling_factors = nn.Parameter(torch.ones(6))
        # Multiplier used by compute_attention_regularization().
        self.regularization_weight = 0.01

    def forward(self, input_ids, attention_mask, token_type_ids=None, auxiliary_labels=None):
        """Return ``(task_output, aux_loss, attentions, hidden_states)``.

        ``aux_loss`` is the auxiliary head's cross-entropy against
        ``auxiliary_labels`` or ``None`` when no labels are given.
        """
        base_out = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=True,
            output_hidden_states=True,
        )
        hidden_states = base_out.hidden_states
        attentions = base_out.attentions

        # Three views of the network: one lower state, the average of
        # hidden_states[4:10], and the final state.
        lower_output = hidden_states[3]
        mid_output = torch.stack(hidden_states[4:10]).mean(dim=0)
        upper_output = hidden_states[-1]

        mix = self.cross_layer_weights
        cross_output = mix[0] * lower_output + mix[1] * mid_output + mix[2] * upper_output

        # Dropout acts on the whole blended sequence before [CLS] is taken.
        cross_output = self.dropout(cross_output)
        cls_representation = cross_output[:, 0, :]
        task_output = self.task_head(cls_representation)

        # Auxiliary supervision only when labels are supplied.
        if auxiliary_labels is None:
            aux_loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            aux_loss = criterion(self.auxiliary_head(cls_representation), auxiliary_labels)

        return task_output, aux_loss, attentions, hidden_states

    def compute_attention_regularization(self, attentions):
        """Scaled sum of mean squared differences of ``attentions[i]`` and
        ``attentions[i + 1]`` for i = 5, 6, 7."""
        adjacent_gaps = (
            torch.mean((attentions[idx] - attentions[idx + 1]) ** 2)
            for idx in range(5, 8)
        )
        return self.regularization_weight * sum(adjacent_gaps, 0.0)

# Instantiate the enhanced model
enhanced2_model3 = Enhanced2BertModel(model3)
enhanced2_model3.to(device)

# Optimizer with layer-specific learning rates.
# Only the parameters listed in a group are updated; anything not listed
# (for example the base model's embeddings) keeps its loaded values.
optimizer = torch.optim.AdamW([
    {"params": enhanced2_model3.base_model.bert.encoder.layer[:4].parameters(), "lr": 1e-5},  # Lower layers
    {"params": enhanced2_model3.base_model.bert.encoder.layer[4:10].parameters(), "lr": 2e-5},  # Mid-layers
    {"params": enhanced2_model3.base_model.bert.encoder.layer[10:].parameters(), "lr": 1e-5},  # Upper layers
    {"params": enhanced2_model3.task_head.parameters(), "lr": 2e-5},  # Task-specific head
    {"params": enhanced2_model3.auxiliary_head.parameters(), "lr": 2e-5},  # Auxiliary head
    # The cross-layer mixing weights are an nn.Parameter used in forward();
    # without a group of their own they would stay at their initial values.
    {"params": [enhanced2_model3.cross_layer_weights], "lr": 2e-5},  # Cross-layer mixing weights
])

# Training loop: fine-tunes enhanced2_model3 and reports validation accuracy
# after every epoch. tqdm.notebook renders the progress bars as widgets.
from tqdm.notebook import tqdm

# Training Loop with Progress Bar
num_epochs = 3
for epoch in range(num_epochs):
    enhanced2_model3.train()
    # Sum of the per-batch losses for this epoch (reported as a sum, not a mean).
    total_loss = 0

    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

    for batch in train_progress_bar:
        # Move every tensor of the batch to the active device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Normalise the target key: batches may carry 'label' instead of 'labels'.
        if "label" in batch:
            batch["labels"] = batch.pop("label")

        # Forward pass. auxiliary_labels is None unless the batch provides an
        # 'auxiliary_labels' entry, in which case aux_loss comes back as None.
        task_output, aux_loss, attentions, _ = enhanced2_model3(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=batch.get("token_type_ids", None),
            auxiliary_labels=batch.get("auxiliary_labels", None)
        )

        # Compute losses: task cross-entropy + attention regularisation
        # (+ auxiliary loss when one was produced).
        task_loss = nn.CrossEntropyLoss()(task_output, batch["labels"])
        reg_loss = enhanced2_model3.compute_attention_regularization(attentions)
        loss = task_loss + reg_loss + (aux_loss if aux_loss is not None else 0)

        # Backward pass: clear old gradients, backpropagate, update parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Update progress bar with current loss
        train_progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1}, Training Loss: {total_loss:.4f}")

    # Validation Loop with Progress Bar
    enhanced2_model3.eval()
    correct, total = 0, 0
    val_progress_bar = tqdm(val_dataloader, desc="Validating", leave=False)

    # No gradients are needed while measuring accuracy.
    with torch.no_grad():
        for batch in val_progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            if "label" in batch:
                batch["labels"] = batch.pop("label")

            # Forward pass (no auxiliary labels during validation)
            task_output, _, attentions, _ = enhanced2_model3(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch.get("token_type_ids", None)
            )

            # Predictions: index of the largest logit per example.
            predictions = torch.argmax(task_output, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

            # Update progress bar with running accuracy
            val_progress_bar.set_postfix(accuracy=(correct / total) * 100)

    accuracy = correct / total
    print(f"Epoch {epoch + 1}, Validation Accuracy: {accuracy * 100:.2f}%")

In [None]:
from torch.nn.functional import mse_loss

def compute_sac_scores(model, dataloader, tokenizer, device):
    """Average per-layer attention change caused by masking token position 1.

    For every batch the attentions of the original inputs are compared with
    those of a copy whose token at position 1 is replaced by ``[MASK]``. The
    per-layer score is the mean squared error between the two attention
    tensors, so lower values mean more stable attention.

    Args:
        model: module whose call ``model(input_ids, attention_mask)`` returns a
            4-tuple with the attentions at index 2.
        dataloader: iterable of dicts with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        tokenizer: object providing ``convert_tokens_to_ids``.
        device: device the tensors are moved to.

    Returns:
        Dict mapping layer index to the mean score over all batches. It has
        one entry per attention layer the model actually returned (empty when
        the dataloader yields nothing).
    """
    model.eval()
    # Filled on demand so the function works for any number of layers instead
    # of assuming exactly 12.
    sac_scores = {}
    mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

    with torch.no_grad():
        for batch in dataloader:
            # Prepare inputs
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Perturbation: overwrite position 1 of every sequence with [MASK].
            perturbed_ids = input_ids.clone()
            perturbed_ids[:, 1] = mask_token_id

            # Get attentions for original and perturbed inputs
            _, _, original_attentions, _ = model(input_ids, attention_mask)
            _, _, perturbed_attentions, _ = model(perturbed_ids, attention_mask)

            # Compute the score for each layer
            for layer, (original, perturbed) in enumerate(
                zip(original_attentions, perturbed_attentions)
            ):
                sac_scores.setdefault(layer, []).append(mse_loss(original, perturbed).item())

    # Aggregate scores per layer
    return {layer: np.mean(scores) for layer, scores in sac_scores.items()}

In [None]:
def evaluate_accuracy(model, dataloader, device):
    """Return the fraction of examples in ``dataloader`` the model classifies correctly.

    Args:
        model: module returning a 4-tuple whose first element holds the logits.
        dataloader: iterable of dicts of tensors with ``"input_ids"``,
            ``"attention_mask"``, optionally ``"token_type_ids"``, and the
            targets under ``"label"`` or ``"labels"``.
        device: device every batch tensor is moved to.

    Returns:
        ``correct / total`` as a float.
    """
    model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for raw_batch in dataloader:
            tensors = {name: value.to(device) for name, value in raw_batch.items()}
            # Normalise the target key to 'labels'.
            if "label" in tensors:
                tensors["labels"] = tensors.pop("label")

            logits, _aux, _attn, _hidden = model(
                input_ids=tensors["input_ids"],
                attention_mask=tensors["attention_mask"],
                token_type_ids=tensors.get("token_type_ids", None),
            )

            predicted = torch.argmax(logits, dim=-1)
            targets = tensors["labels"]
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    return n_correct / n_seen

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
from torch.nn.functional import cosine_similarity

def analyze_sentence(model, tokenizer, sentence, perturbed_sentence, device, max_length=128):
    """Print and return per-layer cosine similarity of attention maps for a sentence pair.

    Args:
        model: model whose output exposes ``.attentions``; its config is
            switched to return attentions and hidden states (this mutates
            ``model.config``).
        tokenizer: tokenizer used for both sentences.
        sentence: original text.
        perturbed_sentence: perturbed variant of ``sentence``.
        device: device the encodings are moved to.
        max_length: fixed length both encodings are padded/truncated to.

    Returns:
        List with one similarity score per attention layer.
    """
    # Tokenize and encode both sentences with a fixed max_length
    encoded_orig = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length).to(device)
    encoded_pert = tokenizer(perturbed_sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length).to(device)

    # Enable attentions and hidden states
    model.config.output_attentions = True
    model.config.output_hidden_states = True

    # Get attention weights for both
    with torch.no_grad():
        original_attentions = model(**encoded_orig).attentions
        perturbed_attentions = model(**encoded_pert).attentions

    # Compute SAC scores layer-wise
    sac_scores = []
    for original, perturbed in zip(original_attentions, perturbed_attentions):
        original = original.squeeze(0)  # Shape: (num_heads, seq_len, seq_len)
        perturbed = perturbed.squeeze(0)

        # Align sizes if necessary
        min_seq_len = min(original.shape[-1], perturbed.shape[-1])
        original = original[:, :min_seq_len, :min_seq_len]
        perturbed = perturbed[:, :min_seq_len, :min_seq_len]

        # A truncating slice is non-contiguous, and view(-1) raises on such
        # tensors; reshape(-1) copies only when it has to.
        original_flat = original.reshape(-1)
        perturbed_flat = perturbed.reshape(-1)
        sac_scores.append(cosine_similarity(original_flat, perturbed_flat, dim=0).item())

    # Print formatted results
    print(f"Sentence: {sentence}")
    for layer, score in enumerate(sac_scores, start=1):
        print(f"Layer {layer}: SAC Score = {score:.4f}")

    return sac_scores
# Example usage: reload model3 with attentions enabled and compare two
# original/perturbed sentence pairs.
model = BertForSequenceClassification.from_pretrained(
    "./model3",
    output_attentions=True,
    output_hidden_states=True,
).to(device)
tokenizer = BertTokenizer.from_pretrained("./model3")

sentence_1, perturbed_sentence_1 = (
    "This film was beautifully directed and emotionally engaging.",
    "This movie was beautifully directed and emotionally touching.",
)
sentence_2, perturbed_sentence_2 = (
    "The plot was predictable and the acting was uninspired.",
    "The storyline was obvious and the performance was dull.",
)

# Analyze both sentences
analyze_sentence(model, tokenizer, sentence_1, perturbed_sentence_1, device)
analyze_sentence(model, tokenizer, sentence_2, perturbed_sentence_2, device)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Define the device before anything is moved to it, so this cell does not
# depend on a `device` left over from an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned model and tokenizer with updated configuration
fine_tuned_model = BertForSequenceClassification.from_pretrained(
    "./fine_tuned_model",
    output_attentions=True,  # Ensure attentions are returned
    output_hidden_states=True
)
fine_tuned_tokenizer = BertTokenizer.from_pretrained("./fine_tuned_model")

# Place both models on the same device (one move per model is enough).
fine_tuned_model.to(device)
enhanced2_model3.to(device)

In [None]:
def evaluate_accuracy(model, tokenizer, dataloader, device, is_enhanced=False):
    """Return the fraction of examples in ``dataloader`` the model classifies correctly.

    Args:
        model: either a model whose output exposes ``.logits`` or, with
            ``is_enhanced=True``, a wrapper returning a 4-tuple whose first
            element holds the logits.
        tokenizer: unused; kept so existing calls keep working.
        dataloader: iterable of dicts of tensors. Targets are read from the
            ``"label"`` key, or from ``"labels"`` when ``"label"`` is absent.
        device: device every batch tensor is moved to.
        is_enhanced: selects how the logits are taken from the model output.

    Returns:
        ``correct / total`` as a float.

    Raises:
        KeyError: if a batch has neither ``"label"`` nor ``"labels"``.
        ZeroDivisionError: if the dataloader yields no examples.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            # Accept both target spellings used in this notebook. Both keys
            # are removed so the targets are never forwarded to the model.
            labels = batch.pop("labels", None)
            if "label" in batch:
                labels = batch.pop("label")
            if labels is None:
                raise KeyError("label")

            if is_enhanced:
                # Enhanced wrapper: logits are the first item of the tuple.
                task_output, _, _, _ = model(**batch)
                logits = task_output
            else:
                # Plain classification model: logits are an output attribute.
                outputs = model(**batch)
                logits = outputs.logits

            preds = torch.argmax(logits, dim=-1)

            # Compute accuracy
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

In [None]:
from torch.nn.functional import cosine_similarity

def compute_sac_scores(model, tokenizer, sentences, perturbed_sentences, device, is_enhanced=False):
    """Average per-layer cosine similarity of attention maps over sentence pairs.

    Args:
        model: model whose output exposes ``.attentions``, or a wrapper that
            returns a tuple with the attentions at index 2.
        tokenizer: tokenizer applied to each sentence separately.
        sentences: original sentences.
        perturbed_sentences: perturbed counterparts, paired by position.
        device: device the encodings are moved to.
        is_enhanced: set for tuple-returning wrappers. A plain tuple output is
            also handled this way when the flag is left at its default.

    Returns:
        Dict mapping layer index to the mean similarity over all pairs. It has
        one entry per attention layer the model returned (empty when no
        sentence pairs are given).

    Raises:
        ValueError: if the model returns no attention weights.
    """
    model.eval()
    # Filled on demand so any number of layers is supported.
    sac_scores = {}

    for orig_sent, pert_sent in zip(sentences, perturbed_sentences):
        # Tokenize sentences
        orig_inputs = tokenizer(orig_sent, return_tensors="pt", padding=True, truncation=True).to(device)
        pert_inputs = tokenizer(pert_sent, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            orig_outputs = model(**orig_inputs)
            pert_outputs = model(**pert_inputs)

        if is_enhanced or isinstance(orig_outputs, tuple):
            orig_attentions = orig_outputs[2]  # Attentions are the third item in the tuple
            pert_attentions = pert_outputs[2]
        else:
            orig_attentions = orig_outputs.attentions
            pert_attentions = pert_outputs.attentions

        if orig_attentions is None or pert_attentions is None:
            raise ValueError(
                "Model did not return attention weights. Ensure `output_attentions=True` in the model configuration."
            )

        # Compute SAC score per layer
        for layer, (orig_layer, pert_layer) in enumerate(zip(orig_attentions, pert_attentions)):
            orig_layer = orig_layer.squeeze(0)
            pert_layer = pert_layer.squeeze(0)

            # The two sentences are tokenized independently and may differ in
            # length; align on the shorter one, as the other SAC helpers in
            # this notebook do, so the flattened vectors have equal size.
            min_len = min(orig_layer.shape[-1], pert_layer.shape[-1])
            orig_flat = orig_layer[..., :min_len, :min_len].reshape(-1)
            pert_flat = pert_layer[..., :min_len, :min_len].reshape(-1)

            sac_score = cosine_similarity(orig_flat, pert_flat, dim=0).item()
            sac_scores.setdefault(layer, []).append(sac_score)

    # Average SAC scores per layer
    return {layer: sum(scores) / len(scores) for layer, scores in sac_scores.items()}

In [None]:
def generate_perturbed_sentences(sentences):
    """Return a synonym-substituted copy of every sentence.

    Each whitespace-separated word is looked up in a small synonym table.
    Punctuation attached to the start or end of a word (e.g. ``"engaging."``)
    is ignored for the lookup and kept in the output. Matching is
    case-sensitive, and words are re-joined with single spaces.

    Args:
        sentences: iterable of strings.

    Returns:
        List of perturbed sentences, in input order.
    """
    import string  # local import keeps this cell self-contained

    synonym_dict = {"film": "movie", "engaging": "captivating", "predictable": "obvious"}

    def perturb(sentence):
        perturbed = []
        for word in sentence.split():
            # Split the word into leading punctuation, core and trailing
            # punctuation so "engaging." still matches "engaging".
            without_lead = word.lstrip(string.punctuation)
            core = without_lead.rstrip(string.punctuation)
            if core in synonym_dict:
                lead = word[:len(word) - len(without_lead)]
                trail = without_lead[len(core):]
                word = lead + synonym_dict[core] + trail
            perturbed.append(word)
        return " ".join(perturbed)

    return [perturb(sentence) for sentence in sentences]

In [None]:
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one labelled sentence per item.

    Args:
        sentences: sequence of input texts.
        labels: sequence of integer labels, aligned with ``sentences``.
        tokenizer: callable used to encode each sentence.
        max_length: length every encoding is padded/truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_length=128):
        self.sentences, self.labels = sentences, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for item ``idx``."""
        text, target = self.sentences[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack the items itself.
        item = {field: encoded[field].squeeze(0) for field in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(target, dtype=torch.long)
        return item

# Define your original dataset
original_sentences = [
    "This film was beautifully directed and emotionally engaging.",
    "The plot was predictable and the acting was uninspired.",
]
original_labels = [1, 0]  # Replace with the actual labels for the sentences

# Create DataLoader for the original dataset (order preserved, no shuffling)
original_dataset = CustomDataset(
    sentences=original_sentences,
    labels=original_labels,
    tokenizer=fine_tuned_tokenizer,
)
original_dataloader = DataLoader(dataset=original_dataset, batch_size=8, shuffle=False)

In [None]:
# Generate perturbed sentences
perturbed_sentences = generate_perturbed_sentences(original_sentences)

# Create DataLoader for the perturbed dataset; it reuses the original labels
perturbed_dataset = CustomDataset(
    sentences=perturbed_sentences,
    labels=original_labels,
    tokenizer=fine_tuned_tokenizer,
)
perturbed_dataloader = DataLoader(dataset=perturbed_dataset, batch_size=8, shuffle=False)

In [None]:
# Evaluate accuracy
# Fine-tuned model accuracy
fine_tuned_acc_original = evaluate_accuracy(fine_tuned_model, fine_tuned_tokenizer, original_dataloader, device)
fine_tuned_acc_perturbed = evaluate_accuracy(fine_tuned_model, fine_tuned_tokenizer, perturbed_dataloader, device)

# Enhanced model accuracy
enhanced_acc_original = evaluate_accuracy(enhanced2_model3, fine_tuned_tokenizer, original_dataloader, device, is_enhanced=True)
enhanced_acc_perturbed = evaluate_accuracy(enhanced2_model3, fine_tuned_tokenizer, perturbed_dataloader, device, is_enhanced=True)

# Compute SAC scores
fine_tuned_sac = compute_sac_scores(fine_tuned_model, fine_tuned_tokenizer, original_sentences, perturbed_sentences, device)
# The enhanced wrapper returns a tuple, so the helper must be told to read the
# attentions by position rather than from an `.attentions` attribute.
enhanced_sac = compute_sac_scores(enhanced2_model3, fine_tuned_tokenizer, original_sentences, perturbed_sentences, device, is_enhanced=True)

# Print Results
print(f"Fine-Tuned Model: Original Accuracy = {fine_tuned_acc_original:.2f}, Perturbed Accuracy = {fine_tuned_acc_perturbed:.2f}")
print(f"Enhanced Model: Original Accuracy = {enhanced_acc_original:.2f}, Perturbed Accuracy = {enhanced_acc_perturbed:.2f}")
print("\nSAC Scores (Fine-Tuned Model):", fine_tuned_sac)
print("\nSAC Scores (Enhanced Model):", enhanced_sac)

In [None]:
# Test attention outputs: a single forward pass to confirm the fine-tuned
# model actually returns attention weights.
test_sentence = "This film was beautifully directed."
inputs = fine_tuned_tokenizer(
    test_sentence,
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(device)

with torch.no_grad():
    outputs = fine_tuned_model(**inputs)

# Fail loudly when attentions are missing; otherwise report the layer count.
if outputs.attentions is None:
    raise ValueError("Attention weights are still not being returned. Check the model configuration.")
print(f"Number of attention layers: {len(outputs.attentions)}")