In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Step 1: Load and preprocess text
file_path = '<path_to_file>/Kiwi_VIS_Corpus.txt'

# Decode explicitly: without `encoding`, open() falls back to the platform
# locale, so the same file could be read differently on different machines.
# NOTE(review): assumes the corpus is stored as UTF-8 - adjust if it is not.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Clean text and standardize: hyphens and underscores are removed outright
# (so "over-ripe" becomes "overripe"), then everything is lowercased.
text = raw_text.replace('-', '').replace('_', '').lower()

In [None]:
# Show a bounded preview instead of dumping the whole corpus into the cell
# output; the total length is reported so the size is still visible.
PREVIEW_CHARS = 1000
print(text[:PREVIEW_CHARS])
if len(text) > PREVIEW_CHARS:
    print(f"... [{len(text) - PREVIEW_CHARS} more characters not shown; {len(text)} total]")

In [None]:
# Step 2: Load the pretrained RoBERTa tokenizer and encoder
# NOTE(review): the Roberta* classes used here expect a RoBERTa checkpoint.
# Switching to a BERT checkpoint such as 'bert-base-uncased' would presumably
# also require the matching Bert*/Auto* classes - confirm before changing.
MODEL_NAME = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaModel.from_pretrained(MODEL_NAME)

In [None]:
# Step 3: Tokenize text with proper handling
# With truncation=True the encoding is cut to at most MAX_LENGTH tokens;
# everything after that point never reaches the model.
MAX_LENGTH = 512
inputs = tokenizer(text, return_tensors='pt',
                   truncation=True,
                   padding=True,
                   max_length=MAX_LENGTH)

# Convert to tokens for verification
input_ids = inputs['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Truncation is silent, so flag it: words that only occur in the dropped part
# of the corpus cannot be located later and will use the standalone fallback.
if len(input_ids) >= MAX_LENGTH:
    print(f"Warning: input reached max_length={MAX_LENGTH} tokens; "
          "the text was probably truncated and the remainder is not embedded")

# Print first 200 tokens to check that the target words appear
print("Sample tokens:", tokens[:200])

In [None]:
# Step 4: Run the encoder once and keep the final-layer activations
with torch.no_grad():  # inference only, so no autograd graph is recorded
    outputs = model(**inputs)
    # Indexed later as [batch, token position, hidden dimension]
    embeddings = outputs.last_hidden_state

In [None]:
# Step 5: Improved embedding extraction with fallback
words = ["overripe", "ripe", "unripe"]
embeddings_dict = {}

# Plain Python lists keep the scan below cheap (no per-element tensor ops)
# and expose the token strings needed for the word-boundary checks.
corpus_ids = input_ids.tolist()
corpus_tokens = tokenizer.convert_ids_to_tokens(corpus_ids)


def _is_word_char(ch):
    """Return True if `ch` is an ASCII letter or digit.

    The byte-level BPE markers for space ('Ġ') and newline ('Ċ') are not
    ASCII, so they count as word boundaries here. An empty string (no
    neighbouring character) also yields False.
    """
    return ch.isascii() and ch.isalnum()


def _find_word_positions(word, ids, toks):
    """Return the token positions in `ids` that spell `word` as a whole word.

    RoBERTa's byte-level BPE folds the preceding space into the token, so a
    word inside running text ("Ġripe") has different ids from the bare string
    ("ripe"). Both forms are searched; a match is kept only when it is not
    glued to a neighbouring letter/digit, which rules out hits inside longer
    words (e.g. the tail of "unripe" when looking for "ripe").

    Args:
        word: target word as plain text.
        ids: token ids of the encoded corpus (list of int).
        toks: token strings aligned one-to-one with `ids`.

    Returns:
        List of positions; each occurrence contributes all of its sub-token
        positions. Empty if the word does not occur.
    """
    # Space-prefixed form first (mid-sentence), then the bare form (start of
    # text, after a newline or punctuation). Skip empty/duplicate encodings.
    variants = []
    for form in (" " + word, word):
        needle = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(form))
        if needle and needle not in variants:
            variants.append(needle)

    positions = []
    for needle in variants:
        span = len(needle)
        for start in range(len(ids) - span + 1):
            end = start + span
            if ids[start:end] != needle:
                continue
            # Left boundary: a match that does not begin with the space marker
            # and directly follows a letter/digit is a sub-word continuation.
            if (start > 0
                    and not toks[start].startswith("Ġ")
                    and _is_word_char(toks[start - 1][-1:])):
                continue
            # Right boundary: a following token that begins with a letter or
            # digit means the word keeps going (e.g. "ripe" + "ness").
            if end < len(toks) and _is_word_char(toks[end][:1]):
                continue
            positions.extend(range(start, end))
    return positions


for word in words:
    # Find positions in input_ids
    indices = _find_word_positions(word, corpus_ids, corpus_tokens)

    if indices:
        # Average over every sub-token of every occurrence of the word
        word_embedding = embeddings[0, indices, :].mean(dim=0)
    else:
        # Fallback: embed word in isolation
        print(f"Word '{word}' not found in text - using standalone embedding")
        word_inputs = tokenizer(word, return_tensors='pt')
        with torch.no_grad():
            word_outputs = model(**word_inputs)
        # Drop the leading <s> and trailing </s> so the mean covers only the
        # word's own sub-tokens, matching what the in-context branch averages.
        word_embedding = word_outputs.last_hidden_state[0, 1:-1, :].mean(dim=0)

    embeddings_dict[word] = word_embedding

# %%
# Sanity check: report the shape of every stored embedding vector
print("\nEmbedding shapes:")
for name in embeddings_dict:
    vector = embeddings_dict[name]
    print(f"{name}: {vector.shape}")

In [None]:
# Calculate similarity matrix
# Rows/columns follow the order of `words`. Building the input from `words`
# keeps this cell in sync with the extraction step; the previous hard-coded
# "perfect" key is not in `words` and raised a KeyError.
similarities = cosine_similarity(
    [embeddings_dict[word].numpy() for word in words]
)

In [None]:
# Print the matrix with labels taken from `words`, so the row/column names
# always describe the vectors that were actually compared.
print("\nCosine similarity matrix:")
col_width = max(len(label) for label in words) + 2
print(" " * col_width + "".join(f"{label:>{col_width}}" for label in words))
for label, row in zip(words, similarities):
    cells = "".join(f"{value:>{col_width}.3f}" for value in row)
    print(f"{label:<{col_width}}{cells}")


In [None]:
# Save embeddings
# The dict is written as a pickled object array, so reading it back needs
# np.load(path, allow_pickle=True).item().
# NOTE(review): the file name says "Avocado" while the corpus loaded above is
# the Kiwi one - confirm this is the intended output name.
embeddings_as_arrays = {}
for key, tensor in embeddings_dict.items():
    embeddings_as_arrays[key] = tensor.numpy()
np.save("Avocado_VIS_Embeddings_Roberta.npy", embeddings_as_arrays)