In [None]:
import nltk
# POS-tagger model; presumably what nltk.pos_tag (used in find_least_changed_word) needs — TODO confirm
nltk.download('averaged_perceptron_tagger')

# WordNet corpus. NOTE(review): no WordNet call is visible in this notebook — confirm it is still required
nltk.download('wordnet')
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from datasets import load_dataset

In [None]:
# Punkt tokenizer data; presumably what nltk.tokenize.word_tokenize needs — TODO confirm
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/moni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# path to checkpoint
# NOTE(review): relative path to a local folder; it has to exist in the notebook's working directory
checkpoint_folder = "./New Folder With Items"

# loading model
# NOTE: evaluate_model_on_dataset and analyze_attention reload the checkpoint themselves,
# so they do not use this notebook-level `model`
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_folder)

# loading tokenizer
# NOTE: a later cell rebinds the notebook-level `tokenizer` to the bert-base-uncased BertTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint_folder)

In [None]:
def evaluate_model_on_dataset(checkpoint_folder, dataset_name, split='test', json=False):
    """Evaluate a sequence-classification checkpoint on a premise/hypothesis dataset.

    The model and tokenizer are loaded from ``checkpoint_folder``, the dataset is
    loaded, rows that cannot be scored are dropped, the premise/hypothesis pairs
    are tokenized, and predictions are produced with a default ``Trainer``.
    Overall and per-class accuracy are printed.

    Args:
        checkpoint_folder: Path or identifier passed to ``from_pretrained``.
        dataset_name: Dataset identifier, or a data file path when ``json`` is True.
        split: Split to load (default ``'test'``).
        json: When True, ``dataset_name`` is loaded through the ``'json'`` builder
            as ``data_files``. (The parameter name shadows the ``json`` module
            inside this function only; the module is not used here.)

    Returns:
        Tuple ``(comparison_table, overall_accuracy, accuracy_per_class)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_folder)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_folder)

    if json:
        dataset = load_dataset('json', data_files=dataset_name, split=split)
    else:
        dataset = load_dataset(dataset_name, split=split)

    def is_scorable(example):
        # SNLI marks examples without a gold label with -1. Scoring them counts
        # every one as an error in the overall accuracy while no per-class
        # accuracy ever sees them. Non-string text cannot be tokenized at all.
        label = example.get('label')
        return (
            isinstance(example['premise'], str)
            and isinstance(example['hypothesis'], str)
            and label is not None
            and label != -1
        )

    num_rows_before = len(dataset)
    dataset = dataset.filter(is_scorable)
    num_dropped = num_rows_before - len(dataset)
    if num_dropped:
        # Surface the drop so that a data problem upstream is not silently hidden.
        print(f"Skipped {num_dropped} of {num_rows_before} examples without a gold label or with non-string text")

    def preprocess_function(examples):
        return tokenizer(examples['premise'], examples['hypothesis'], truncation=True)

    dataset = dataset.map(preprocess_function, batched=True)

    trainer = Trainer(model=model, tokenizer=tokenizer)
    results = trainer.predict(dataset)

    preds, labels = get_predictions_and_labels(results)

    overall_accuracy = calculate_overall_accuracy(preds, labels)
    print(f"Overall accuracy: {overall_accuracy:.4f}")

    accuracy_per_class = calculate_class_accuracy(preds, labels)
    print(f"Accuracy per class: {accuracy_per_class}")

    # Indices in the table refer to the filtered dataset, which is the one that was scored.
    comparison_table = create_comparison_table(dataset, preds, labels)
    return comparison_table, overall_accuracy, accuracy_per_class

def get_predictions_and_labels(results):
    """Turn a prediction result into ``(predicted_class_ids, gold_label_ids)``.

    ``results`` must expose ``predictions`` (scores, arg-maxed along axis 1)
    and ``label_ids`` (returned unchanged).
    """
    predicted_classes = np.argmax(results.predictions, axis=1)
    return predicted_classes, results.label_ids

def calculate_overall_accuracy(predictions, labels):
    """Return the fraction of positions where ``predictions`` equals ``labels``."""
    num_correct = np.sum(predictions == labels)
    num_total = len(labels)
    return num_correct / num_total

def calculate_class_accuracy(predictions, labels, class_labels=(0, 1, 2)):
    """Compute the accuracy separately for each gold class.

    Args:
        predictions: Array-like of predicted class ids.
        labels: Array-like of gold class ids, same length as ``predictions``.
        class_labels: Classes to report. The default ``(0, 1, 2)`` is the
            notebook's NLI convention (0: entailment, 1: neutral, 2: contradiction).

    Returns:
        Dict mapping each class id to the fraction of its examples that were
        predicted correctly, as a plain ``float``. A class with no examples in
        ``labels`` maps to ``nan`` instead of triggering a 0/0 division warning.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    accuracies = {}
    for class_label in class_labels:
        class_mask = labels == class_label
        class_size = int(np.count_nonzero(class_mask))
        if class_size == 0:
            # Nothing to score for this class; make the "undefined" result explicit.
            accuracies[class_label] = float('nan')
            continue
        num_correct = int(np.count_nonzero(predictions[class_mask] == class_label))
        accuracies[class_label] = num_correct / class_size
    return accuracies

def create_comparison_table(dataset, predictions, labels, max_rows=10):
    """Build a table of misclassified examples.

    Args:
        dataset: Indexable collection whose items expose ``"premise"`` and
            ``"hypothesis"`` keys; positions must line up with ``predictions``.
        predictions: NumPy array of predicted class ids.
        labels: NumPy array of gold class ids.
        max_rows: Maximum number of misclassified examples to include
            (default 10; ``None`` includes all of them).

    Returns:
        ``pandas.DataFrame`` with the columns ``Premise``, ``Hypothesis``,
        ``Ground Truth`` and ``Prediction``, in dataset order.
    """
    wrong_indices = np.where(predictions != labels)[0][:max_rows]

    # Fetch every misclassified row once instead of once per output column.
    wrong_rows = [dataset[int(i)] for i in wrong_indices]

    comparison_data = {
        "Premise": [row["premise"] for row in wrong_rows],
        "Hypothesis": [row["hypothesis"] for row in wrong_rows],
        "Ground Truth": [labels[int(i)] for i in wrong_indices],
        "Prediction": [predictions[int(i)] for i in wrong_indices],
    }
    return pd.DataFrame(comparison_data)

def analyze_attention(model_checkpoint, premise, hypothesis, device='cpu'):
    """Plot the layer/head-averaged attention map for one premise/hypothesis pair.

    The checkpoint is loaded with ``output_attentions=True``, the pair is run
    through it on ``device``, and the averaged attention matrix is drawn as a
    heatmap labelled with the input tokens. Nothing is returned.
    """
    attn_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, output_attentions=True)
    attn_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # per-layer attention tensors plus the encoded inputs they were computed from
    per_layer_attention, encoded = get_attention_weights(
        attn_model, attn_tokenizer, premise, hypothesis, device=device
    )

    # token strings for the axis labels (first and only sequence of the batch)
    token_strings = attn_tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])

    # collapse heads and layers into one matrix and draw it
    plot_avg_attention_matrix(average_attention_for_each_token(per_layer_attention), token_strings)

# function to extract attention weights
def get_attention_weights(model, tokenizer, premise, hypothesis, device='cpu'):
    """Run one premise/hypothesis pair through ``model`` and return its attentions.

    Args:
        model: Model whose output exposes ``.attentions``; it is moved to ``device``.
        tokenizer: Callable that encodes the pair into PyTorch tensors.
        premise: First text of the pair.
        hypothesis: Second text of the pair.
        device: Device the inputs and the model are moved to (default ``'cpu'``).

    Returns:
        Tuple ``(attention_weights, inputs)``: the ``attentions`` attribute of the
        model output and the encoded inputs.
    """
    inputs = tokenizer(premise, hypothesis, return_tensors='pt', truncation=True).to(device)
    model = model.to(device)

    # Inference only: without no_grad every attention tensor would keep its
    # autograd graph alive for no benefit.
    with torch.no_grad():
        outputs = model(**inputs)

    # get attention weights from model outputs
    attention_weights = outputs.attentions
    return attention_weights, inputs

# function to calculate average attention for each token across all heads and layers
def average_attention_for_each_token(attention_weights):
    """Average attention over heads and layers for the first item of the batch.

    ``attention_weights`` is a sequence of per-layer tensors. Each is averaged
    over dim 1 (heads), index 0 of the leading dimension is taken, and the
    resulting square matrices are averaged over the layers.
    """
    first_layer = attention_weights[0]
    seq_len = first_layer.shape[-1]

    # accumulator created directly on the device of the attention tensors
    running_total = torch.zeros(seq_len, seq_len, device=first_layer.device)

    for layer_attention in attention_weights:
        head_mean = layer_attention.mean(dim=1)
        running_total += head_mean[0]

    # turn the per-layer sum into a mean
    running_total /= len(attention_weights)
    return running_total

def plot_avg_attention_matrix(avg_attention_matrix, tokens):
    """Draw an annotated heatmap of an attention matrix with ``tokens`` on both axes."""
    # matplotlib/seaborn need a detached NumPy array living on the CPU
    heat_values = avg_attention_matrix.detach().cpu().numpy()

    heatmap_options = dict(
        xticklabels=tokens,
        yticklabels=tokens,
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(heat_values, **heatmap_options)
    plt.title("Average attention across all layers and heads for each token")
    plt.show()

In [None]:
# Baseline run: score the checkpoint on the 'snli' dataset; no split is passed, so the default 'test' split is used
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, 'snli')

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.8733
Accuracy per class: {0: 0.9103325415676959, 1: 0.8462255358807083, 2: 0.9091751621872104}


In [None]:
# example of analyzing attention for a specific sentence pair
# analyze_attention(checkpoint_folder, "a man is riding a horse.", "a man is riding a bike.", device='cpu')

In [None]:
# Score the checkpoint on the local file ./set_1.jsonl, loaded through the 'json' builder with split='train'
# NOTE: the results land in the same *_snli variables as the baseline run and overwrite them
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_1.jsonl', split='train', json=True)

Map:   0%|          | 0/6215 [00:00<?, ? examples/s]

  0%|          | 0/777 [00:00<?, ?it/s]

Overall Accuracy: 0.5874
Accuracy per class: {0: 0.8254716981132075, 1: 0.42429682824655895, 2: 0.6387349953831948}


In [None]:
# example of analyzing attention for a specific sentence pair
# analyze_attention(checkpoint_folder, "a man is riding a horse.", "a man is riding a bike.", device='cpu')

In [None]:
# Score the checkpoint on the local file ./set_2.jsonl (json builder, split='train'); overwrites the *_snli result variables
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_2.jsonl', split='train', json=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.7896
Accuracy per class: {0: 0.786520190023753, 1: 0.7875116495806151, 2: 0.8378127896200185}


In [None]:
# Score the checkpoint on the local file ./set_3.jsonl (json builder, split='train'); overwrites the *_snli result variables
# NOTE(review): set_3.jsonl is written by a cell further down, so this cell depends on an earlier run of that cell
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_3.jsonl', split='train', json=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.8733
Accuracy per class: {0: 0.9103325415676959, 1: 0.8462255358807083, 2: 0.9091751621872104}


In [None]:
 # score the checkpoint on the local file ./set_4.jsonl (json builder, split='train'), not on the 'snli' hub dataset
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_4.jsonl', split='train', json=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.8692
Accuracy per class: {0: 0.9207221350078493, 1: 0.8096774193548387, 2: 0.8702912102591505}


In [None]:
# Score the checkpoint on the local file ./set_5.jsonl (json builder, split='train'); overwrites the *_snli result variables
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_5.jsonl', split='train', json=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.5791
Accuracy per class: {0: 0.09619952494061758, 1: 0.7651444547996272, 2: 0.9280197713932654}


In [None]:
# Score the checkpoint on the local file ./set_6.jsonl (json builder, split='train')
# NOTE(review): the recorded output of this cell is a tokenizer ValueError ("text input must be of type `str` ...");
# presumably at least one row of set_6.jsonl has a non-string premise or hypothesis (e.g. null) — verify the file
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_6.jsonl', split='train', json=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Score the checkpoint on the local file ./set_7.jsonl (json builder, split='train'); overwrites the *_snli result variables
comparison_table_snli, overall_accuracy_snli, accuracy_per_class_snli = evaluate_model_on_dataset(checkpoint_folder, './set_7.jsonl', split='train', json=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Overall Accuracy: 0.8701
Accuracy per class: {0: 0.9290060851926978, 1: 0.8206583427922814, 2: 0.8654644111075647}


In [None]:
# Load the SNLI test split once; every set below starts from it.
dataset = load_dataset('snli', split='test')

# Five separate Dataset objects, each selecting every row of the test split.
set_1, set_2, set_3, set_4, set_5 = (
    dataset.select(range(len(dataset))) for _ in range(5)
)

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# get bert tokenizer and models
# NOTE: this rebinds the notebook-level `tokenizer` that an earlier cell loaded from the fine-tuned checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# load models and move them to `device`
model_mlm = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device) # mlm model for word replacement
model_bert = BertModel.from_pretrained('bert-base-uncased').to(device) # bert model for sentence embeddings

def get_sentence_embedding(sentence):
    """Embed ``sentence`` as the mean of ``model_bert``'s last hidden states.

    The hidden states are averaged over dim 1 and returned as a NumPy array on
    the CPU (presumably of shape ``(1, hidden_size)`` — TODO confirm).
    """
    encoded = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        token_states = model_bert(**encoded).last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled.cpu().numpy()

# helper shared by the two functions below
def replace_first_whole_word(sentence, word, replacement):
    """Replace the first whole-word occurrence of ``word`` in ``sentence``.

    Unlike ``str.replace`` this never rewrites part of a longer word (e.g. the
    "man" inside "woman"). ``sentence`` is returned unchanged when ``word`` is
    empty or does not occur as a whole word.
    """
    import re  # local import so this cell does not depend on the import cell

    if not word:
        return sentence
    # Lookarounds instead of \b so tokens that start or end with punctuation still match.
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    # A callable replacement keeps backslashes in `replacement` literal.
    return re.sub(pattern, lambda _match: replacement, sentence, count=1)


# function to replace a word in a sentence
def replace_word_with_bert(sentence, word_to_replace, top_k=5):
    """Ask the masked-language model for a substitute for ``word_to_replace``.

    The first whole-word occurrence of the word is masked and the ``top_k``
    highest-scoring predictions for the mask are inspected in order.

    Args:
        sentence: Sentence containing the word.
        word_to_replace: Word to mask.
        top_k: Number of predictions to consider (default 5).

    Returns:
        The first predicted word that differs from ``word_to_replace``
        (case-insensitively) and is not a "##" word-piece fragment.
        ``word_to_replace`` itself is returned when the word cannot be masked
        or no prediction qualifies.
    """
    masked_sentence = replace_first_whole_word(sentence, word_to_replace, tokenizer.mask_token)
    if masked_sentence == sentence:
        # Nothing was masked, so there is nothing to predict.
        return word_to_replace

    inputs = tokenizer(masked_sentence, return_tensors='pt').to(device)
    mask_positions = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # The mask did not survive tokenization; indexing below would raise.
        return word_to_replace

    with torch.no_grad():
        logits = model_mlm(**inputs).logits

    top_tokens = torch.topk(logits[0, mask_positions[0]], top_k).indices.tolist()

    # get top replacement word (that is not the original word)
    for token in top_tokens:
        replacement_word = tokenizer.decode([token])
        if replacement_word.startswith('##'):
            # word-piece continuation, not a standalone word
            continue
        if replacement_word.lower() != word_to_replace.lower():
            return replacement_word
    return word_to_replace


# function to replace a word and calculate similarity
def replace_word_and_get_similarity(sentence, word_to_replace):
    """Replace a word via the masked-language model and score the change.

    Returns:
        Tuple ``(similarity, replaced_word, replaced_sentence)``. ``similarity``
        is the cosine similarity between the embeddings of the original and the
        rewritten sentence. When no replacement was found, ``replaced_sentence``
        equals ``sentence``.
    """
    replaced_word = replace_word_with_bert(sentence, word_to_replace)
    # Same whole-word rule as the masking step, so the word that was masked is the one replaced.
    replaced_sentence = replace_first_whole_word(sentence, word_to_replace, replaced_word)

    # get embeddings for original and replaced sentences
    original_embedding = get_sentence_embedding(sentence)
    replaced_embedding = get_sentence_embedding(replaced_sentence)

    # calculate cosine similarity between original and replaced sentence embeddings
    similarity = cosine_similarity(original_embedding, replaced_embedding)[0][0]
    return similarity, replaced_word, replaced_sentence

# find the word whose replacement least changes the meaning
def find_least_changed_word(sentence):
    """Pick the noun/adjective whose replacement keeps the sentence most similar.

    Every token tagged ``NN``, ``NNS`` or ``JJ`` is replaced in turn and the
    replacement with the highest similarity to the original sentence wins.
    Replacements that leave the sentence unchanged are ignored; they would
    otherwise always win with a similarity of 1.

    Returns:
        Tuple ``(best_word, best_replaced_word, best_replaced_sentence,
        best_similarity)``. When the sentence has no candidate, or no candidate
        could be replaced, the first three items are ``None`` and the
        similarity is ``-1``.
    """
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    # identify nouns, adjectives in the sentence; a repeated word is tried once,
    # because only its first occurrence is ever replaced
    candidates = list(dict.fromkeys(word for word, pos in pos_tags if pos in ('NN', 'NNS', 'JJ')))

    best_similarity = -1
    best_word = None
    best_replaced_word = None
    best_replaced_sentence = None

    # for each candidate word, replace it and calculate similarity
    for word in candidates:
        similarity, replaced_word, replaced_sentence = replace_word_and_get_similarity(sentence, word)
        if replaced_sentence == sentence:
            # no actual change was made, so this is not a replacement
            continue
        if similarity > best_similarity:
            best_similarity = similarity
            best_word = word
            best_replaced_word = replaced_word
            best_replaced_sentence = replaced_sentence

    return best_word, best_replaced_word, best_replaced_sentence, best_similarity


# apply the least-changed-word replacement to the hypothesis field in a dataset row
def replace_in_dataset(row):
    """Replace one word of ``row['hypothesis']`` and return the row.

    The hypothesis is left untouched when no replacement is available.
    Writing ``None`` into the field instead would leave a non-string value
    that a tokenizer cannot process later.
    """
    sentence = row['hypothesis']
    _, _, new_sentence, _ = find_least_changed_word(sentence)

    # replace the hypothesis only when a replacement was actually produced
    if new_sentence is not None:
        row['hypothesis'] = new_sentence
    return row

# apply the function to each row in the dataset (e.g., replacing in 'hypothesis' field)
# NOTE: set_3 is rebound to its own mapped result, so re-running this cell perturbs already-perturbed hypotheses again
set_3 = set_3.map(replace_in_dataset)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Write the mapped set to set_3.jsonl in the working directory — the file the ./set_3.jsonl evaluation cell reads
set_3.to_json("set_3.jsonl")

Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

1505966

In [None]:
# display the dataset summary (features and number of rows)
set_3

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 10000
})

In [None]:
# Print premise, hypothesis and label (one per line) for the first 31 rows of set_3.
total = 0
for total, e in enumerate(set_3, start=1):
    print(e['premise'], e['hypothesis'], e['label'], sep='\n')
    if total > 30:
        break

This church choir sings to the masses as they sing joyous songs from the book at a church.
The church has cracks in the walls.
1
This church choir sings to the masses as they sing joyous songs from the book at a church.
The church is filled with people.
0
This church choir sings to the masses as they sing joyous songs from the book at a church.
A choir singing at a football game.
2
A woman with a green headscarf, blue shirt and a very big grin.
The woman is dead.
1
A woman with a green headscarf, blue shirt and a very big grin.
The woman is very beautiful.
0
A woman with a green headscarf, blue shirt and a very big grin.
The film has been shot.
2
An old man with a package poses in front of an advertisement.
A woman poses in front of an ad.
0
An old man with a package poses in front of an advertisement.
A woman poses in front of an ad for beer.
1
An old man with a package poses in front of an advertisement.
A woman walks by an ad.
2
A statue at a museum that no seems to be looking at.
T

In [None]:
# display the dataset summary once more (same expression as the earlier summary cell)
set_3

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 10000
})