# In [1]:

from itertools import combinations

import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, DebertaModel, DebertaTokenizer

# Read the tab-separated corpus into a DataFrame
df = pd.read_csv("Consp-vs-Critical.csv", sep="\t")

# Pre-trained encoders, each loaded right after its matching tokenizer
bert_tokenizer, bert_model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

roberta_tokenizer, roberta_model = (
    RobertaTokenizer.from_pretrained('roberta-base'),
    RobertaModel.from_pretrained('roberta-base'),
)

deberta_tokenizer, deberta_model = (
    DebertaTokenizer.from_pretrained('microsoft/deberta-base'),
    DebertaModel.from_pretrained('microsoft/deberta-base'),
)

# Function to get the CLS representation of a document for a given model
def get_cls_vector(document, tokenizer, model):
    """Return the model output at sequence position 0 for ``document``.

    The document is tokenized with padding and truncation enabled and
    PyTorch tensors requested, then passed through ``model`` with gradient
    tracking disabled.

    Args:
        document: Text handed to ``tokenizer`` as its first argument.
        tokenizer: Callable returning a mapping of keyword arguments that
            ``model`` accepts.
        model: Callable whose result is indexable; element 0 is used.

    Returns:
        Element 0 of the model output, sliced at index 0 along its second
        axis (presumably the [CLS]/<s> position of the last hidden state
        for the Hugging Face models used in this file -- TODO confirm).
    """
    encoded = tokenizer(
        document,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded)
    hidden_states = model_output[0]
    return hidden_states[:, 0, :]

# Function to calculate cosine similarity between two vectors
def calculate_similarity(vector1, vector2):
    """Return the cosine similarity between two vectors as a scalar.

    Each input is reshaped to a single row before being passed to
    ``cosine_similarity``; the only entry of the resulting 1x1 matrix is
    returned.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Function to find the most similar pair for each document in a class
def find_most_similar_pair(df, tokenizer, model, class_label):
    """Find the two documents of one class whose CLS vectors are most similar.

    Args:
        df: DataFrame with a 'label' column (used to select the class) and
            a 'text' column (the documents to embed).
        tokenizer: Tokenizer forwarded to ``get_cls_vector``.
        model: Model forwarded to ``get_cls_vector``.
        class_label: Value of ``df['label']`` identifying the class.

    Returns:
        A tuple ``(index_a, index_b, similarity)`` where the first two
        items are index labels of ``df`` and ``similarity`` is the value
        returned by ``calculate_similarity`` for that pair. An empty tuple
        is returned when the class has fewer than two documents.
    """
    # Filter once: the class subset does not change during the search.
    class_rows = df[df['label'] == class_label]
    doc_ids = class_rows.index
    embeddings = [
        get_cls_vector(text, tokenizer, model) for text in class_rows['text']
    ]

    # Start below any real score so a valid pair is always reported.
    max_similarity = float('-inf')
    most_similar_pair = ()
    # Cosine similarity is symmetric, so each unordered pair is scored once.
    # combinations() yields pairs with i < j in the same order the nested
    # index loops would first meet them, so ties resolve identically.
    for (i, first), (j, second) in combinations(enumerate(embeddings), 2):
        similarity = calculate_similarity(first, second)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_pair = (doc_ids[i], doc_ids[j], max_similarity)
    return most_similar_pair

# Run the search for both classes with every encoder.
# Each generator is consumed in order, so 'CONSPIRACY' is processed before
# 'CRITICAL' for a given model, and models run BERT -> RoBERTa -> DeBERTa.
bert_conspiracy_pair, bert_critical_pair = (
    find_most_similar_pair(df, bert_tokenizer, bert_model, label)
    for label in ('CONSPIRACY', 'CRITICAL')
)

roberta_conspiracy_pair, roberta_critical_pair = (
    find_most_similar_pair(df, roberta_tokenizer, roberta_model, label)
    for label in ('CONSPIRACY', 'CRITICAL')
)

deberta_conspiracy_pair, deberta_critical_pair = (
    find_most_similar_pair(df, deberta_tokenizer, deberta_model, label)
    for label in ('CONSPIRACY', 'CRITICAL')
)

# Report one line per (class, encoder) combination, in the same order
# the results were computed.
for _label, _encoder, _pair in (
    ('CONSPIRACY', 'BERT', bert_conspiracy_pair),
    ('CRITICAL', 'BERT', bert_critical_pair),
    ('CONSPIRACY', 'RoBERTa', roberta_conspiracy_pair),
    ('CRITICAL', 'RoBERTa', roberta_critical_pair),
    ('CONSPIRACY', 'DeBERTa', deberta_conspiracy_pair),
    ('CRITICAL', 'DeBERTa', deberta_critical_pair),
):
    print(f"Most similar pair for '{_label}' using {_encoder}:", _pair)

# Cell output: ModuleNotFoundError: No module named 'torch'
# (PyTorch must be installed in the active environment before running this cell.)