In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Step 1: Load the text from a .txt file
file_path = 'Blueberry_Corpus.txt'

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 — adjust if it is not.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Normalise the corpus: remove hyphens and underscores (the pieces they
# separated are joined together) and lowercase everything.
text = text.replace('-', '').replace('_', '').lower()

In [None]:
# Show only the beginning of the corpus: printing the whole file floods the
# notebook output and bloats the saved notebook.
PREVIEW_CHARS = 1000
print(f"Corpus length: {len(text)} characters")
print(text[:PREVIEW_CHARS])

In [None]:
# Step 2: Load pre-trained RoBERTa or bert tokenizer and model
# The checkpoint name lives in one variable so the tokenizer and the model
# are always loaded from the same pretrained weights.
checkpoint = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

In [None]:
# Sanity check: inspect how the tokenizer splits one of the target phrases.
sample_tokens = tokenizer.tokenize("normal blueberries")
print(sample_tokens)

In [None]:
# Step 3: Tokenize the text
# The whole corpus is encoded as ONE sequence. truncation=True cuts it at the
# tokenizer's maximum length, so any text beyond that point is dropped.
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

# Make truncation visible instead of silent: if the encoded sequence is as
# long as the tokenizer allows, the corpus was most likely cut off.
sequence_length = inputs['input_ids'].shape[1]
if sequence_length >= tokenizer.model_max_length:
    print(
        f"Warning: the encoded input is {sequence_length} tokens long, which is the "
        f"tokenizer limit ({tokenizer.model_max_length}). The corpus was probably "
        "truncated; text past this point will not contribute to the embeddings."
    )

# See tokens. The 1-D row gets its own name so it does not shadow the 2-D
# `input_ids` used by the later cells (re-running this cell out of order would
# otherwise break their `input_ids[0]` indexing).
preview_ids = inputs['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(preview_ids)

print("Tokens:", tokens)

In [None]:
# Step 4: Extract embeddings
# Pull the two tensors the model needs out of the tokenizer output.
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

# Forward pass only, with gradient tracking switched off.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# The last hidden states are the embeddings for each token.
embeddings = outputs.last_hidden_state

In [None]:
# List of words to extract embeddings for
words = ["normal blueberries", "defective blueberries"]

# Tokenize the individual words (bare form, kept for inspection)
words_ids = {word: tokenizer.encode(word, add_special_tokens=False) for word in words}


def _phrase_variants(phrase):
    """Return the distinct token-id sequences `phrase` can take in running text.

    RoBERTa's byte-level BPE encodes a word differently depending on whether it
    is preceded by a space, so the phrase is encoded both bare (as at the start
    of the text) and with a leading space (as in the middle of a sentence).
    """
    variants = []
    for candidate in (phrase, " " + phrase):
        ids = tokenizer.encode(candidate, add_special_tokens=False)
        if ids and ids not in variants:
            variants.append(ids)
    return variants


def _find_phrase_positions(sequence, variants):
    """Return the sorted positions in `sequence` covered by a contiguous match.

    `sequence` is a list of token ids and `variants` a list of token-id lists.
    A position is included only when it is part of a complete, in-order
    occurrence of one of the variants.
    """
    positions = set()
    for needle in variants:
        width = len(needle)
        for start in range(len(sequence) - width + 1):
            if sequence[start:start + width] == needle:
                positions.update(range(start, start + width))
    return sorted(positions)


# Plain Python ints make the subsequence comparison simple and fast.
sequence_ids = input_ids[0].tolist()

# Initialize a dictionary to store embeddings
embeddings_dict = {}

# Loop through each word in the list
for word in words_ids:

    # Find the token positions where the whole phrase occurs. Matching the
    # phrase as a contiguous run (rather than any token that shares an id with
    # it) keeps e.g. every stray "blueberries" from being pooled into both
    # phrases.
    indices = _find_phrase_positions(sequence_ids, _phrase_variants(word))

    # Average the token embeddings over all matched positions
    if indices:
        embedding = embeddings[0, indices, :].mean(dim=0)
    else:
        print(f"Warning: '{word}' was not found in the tokenized input; storing None.")
        embedding = None

    # Store the embedding in the dictionary
    embeddings_dict[word] = embedding

# Output the embeddings
for word, embedding in embeddings_dict.items():

    print(f"Embedding for {word}: {embedding}")

normal_blueberry_embedding = embeddings_dict["normal blueberries"]
defective_blueberry_embedding = embeddings_dict["defective blueberries"]

In [None]:
# Stop with a clear message if either phrase has no embedding (None), instead
# of failing below with an AttributeError on `.shape`.
for label, vector in (
    ("normal blueberries", normal_blueberry_embedding),
    ("defective blueberries", defective_blueberry_embedding),
):
    if vector is None:
        raise ValueError(
            f"No embedding available for '{label}': the phrase was not found in the tokenized text."
        )

print(normal_blueberry_embedding.shape, defective_blueberry_embedding.shape)

# scikit-learn operates on 2-D NumPy arrays, so convert explicitly rather than
# relying on implicit tensor-to-array conversion. Each vector becomes one row.
similarity = cosine_similarity(
    normal_blueberry_embedding.detach().cpu().numpy().reshape(1, -1),
    defective_blueberry_embedding.detach().cpu().numpy().reshape(1, -1),
)
print("Cosine Similarity:", similarity)

In [None]:
# Save the embeddings to a .npy file
# Convert the tensors to plain NumPy arrays first, so the saved file does not
# contain pickled torch objects and can be read back without PyTorch.
# Entries that are None (phrase not found) are kept as None.
embeddings_to_save = {
    word: (embedding.detach().cpu().numpy() if embedding is not None else None)
    for word, embedding in embeddings_dict.items()
}

# np.save stores a dict as a pickled object array, so reading it back needs
# allow_pickle=True followed by .item().
np.save("Blueberry_Embeddings_Roberta.npy", embeddings_to_save)

In [None]:
# Read the saved file back to confirm the round trip.
# The dict was stored as a pickled object array, hence allow_pickle=True and
# .item() to unwrap it. Only load pickled files that you created yourself.
embeddings_path = "Blueberry_Embeddings_Roberta.npy"
loaded_embeddings = np.load(embeddings_path, allow_pickle=True).item()

print(loaded_embeddings)