In [2]:
import json

# Read the dataset through a context manager so the file handle is closed as
# soon as parsing finishes, and decode as UTF-8 explicitly instead of relying
# on the platform's default text encoding.
with open("sarcasm_data.json", encoding="utf-8") as f:
    sarcasm_data = json.load(f)


In [3]:
from transformers import BertTokenizer, BertModel
import torch

# Pre-trained BERT checkpoint used for feature extraction; change `model_name`
# to try a different checkpoint.
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the weights and switch straight to evaluation mode (inference only,
# no training-time behaviour such as dropout).
model = BertModel.from_pretrained(model_name).eval()

def extract_features(input_text):
    """Encode ``input_text`` with the module-level BERT model.

    The text is tokenized (with truncation enabled) and run through ``model``
    without gradient tracking.

    Returns:
        The last-layer hidden state at token position 0 of the first sequence
        in the batch, converted to a plain Python list.
    """
    with torch.no_grad():
        encoded = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
        hidden = model(**encoded).last_hidden_state
    # Batch item 0, token position 0 (the slot the original code calls "cls").
    return hidden[0, 0, :].tolist()

# Build a mapping with the same keys as `sarcasm_data`, each pointing at the
# feature vector of that entry's 'utterance' field.
result = {}
for key in sarcasm_data:
    data = sarcasm_data[key]
    result[key] = extract_features(data['utterance'])

# Serialize the whole mapping to disk as JSON.
with open("text_features_bert.json", "w") as f:
    f.write(json.dumps(result))

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import json

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# BGE checkpoint and its matching tokenizer.
model_name = 'BAAI/bge-large-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights, move them to the chosen device and switch to evaluation mode.
model = AutoModel.from_pretrained(model_name).to(device).eval()

def extract_features(input_text, instruction="Represent this text for retrieval: ", max_length=512):
    """Embed one text with the module-level BGE model.

    Note: this notebook defines ``extract_features`` in an earlier cell as
    well; running this cell rebinds the name to this BGE version.

    Args:
        input_text: Text to embed. It is interpolated into a single prompt
            string, so pass one string per call.
        instruction: Prefix placed directly in front of ``input_text``. The
            default reproduces the prompt this notebook has always used, so
            existing calls produce the same features. Pass ``""`` to embed the
            raw text.
            NOTE(review): the BGE model card appears to document a different
            query instruction ("Represent this sentence for searching relevant
            passages: ") -- confirm which prompt, if any, is wanted here.
        max_length: Token limit handed to the tokenizer; longer inputs are
            truncated.

    Returns:
        The L2-normalized hidden state at token position 0 of the first
        sequence in the batch, as a plain Python list.
    """
    prompted_text = f"{instruction}{input_text}"

    # Tokenization involves no autograd, so it does not need to sit inside
    # the no_grad block.
    inputs = tokenizer(
        prompted_text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

        # Token position 0 of every sequence in the batch (the CLS slot).
        embeddings = outputs.last_hidden_state[:, 0]

        # Scale each vector to unit L2 norm.
        normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    # Only the first batch item is returned; copy it to the CPU before
    # converting to a list.
    return normalized_embeddings[0].cpu().tolist()

# Recompute the feature mapping with the extractor defined in this cell:
# same keys as `sarcasm_data`, values are the vectors for each 'utterance'.
result = {}
for key in sarcasm_data:
    data = sarcasm_data[key]
    result[key] = extract_features(data['utterance'])

# Serialize the whole mapping to disk as JSON.
with open("text_features_bge.json", "w") as f:
    f.write(json.dumps(result))

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]