In [None]:
import csv
import os

import numpy as np
import torch
from transformers import BertTokenizer, BertModel

file_path = 'items.csv'

# newline='' hands newline handling to the csv module, so quoted fields that
# contain line breaks are parsed as a single field instead of being split.
with open(file_path, 'r', encoding='utf-8', newline='') as file:
    data_reader = csv.reader(file)
    # csv.reader yields [] for blank lines; drop them so the fixed column
    # indices used below never hit an empty row.
    data = [row for row in data_reader if row]

# Columns whose raw text is copied verbatim into each description.
# NOTE(review): what columns 5 and 29-32 contain depends on the layout of
# items.csv, which is not visible here -- confirm against the file.
ind1 = [5, 29, 30, 31, 32]
# Flag columns: for every one that holds '1', the column's header name
# (taken from the first CSV row) is appended to the description.
genre_columns = range(10, 28)

# The first row is the header; every following row becomes one description.
header = data[0] if data else []
moviedesc = []
for row in data[1:]:
    parts = [row[j] for j in ind1]
    # genres
    parts.extend(header[j] for j in genre_columns if row[j] == '1')
    # Each piece is followed by a single space, including the last one, so
    # the resulting text is identical to piecewise concatenation.
    moviedesc.append("".join(part + " " for part in parts))

print("len(moviedesc): ", len(moviedesc))
print(moviedesc)

# Load pre-trained BERT model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize all descriptions as one batch and return PyTorch tensors.
# Padding and truncation are enabled so the variable-length descriptions can
# be stacked into a single batch.
# NOTE: despite its name, `input_ids` holds the complete tokenizer output; it
# is unpacked as keyword arguments for the model below.
input_ids = tokenizer(moviedesc, padding=True, truncation=True, return_tensors="pt")
# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    outputs = model(**input_ids)

# Only take the output embeddings from the last layer
last_hidden_states = outputs.last_hidden_state
# Print the shape, as the label says, instead of dumping the full tensor.
print("last_hidden_states shape= ", last_hidden_states.shape)

# For sentence embeddings, you can use the [CLS] token representation
# (the first position along the sequence axis).
sentence_embeddings = last_hidden_states[:, 0, :]

print("BERT Sentence Embedding Shape:", sentence_embeddings.shape)
print(sentence_embeddings)

# # Save embeddings
# numpy_array = sentence_embeddings.detach().numpy()
# print("numpy_array shape = ", len(numpy_array))
# np.save('text_feat.npy', numpy_array)
# print("Saved text embeddings in text_feat.npy")

# Width of the saved text features and where they are written.
resized_dim = 384
output_path = 'data/ml100k/text_feat.npy'

# resizing text embeddings to 384
# interpolate expects (batch, channels, length) input for mode='linear', so a
# singleton channel axis is added, every row is resized in one call, and the
# channel axis is dropped again. Rows are interpolated independently, so this
# matches resizing each embedding on its own.
resized_text_embeds = torch.nn.functional.interpolate(
    sentence_embeddings.unsqueeze(1), size=(resized_dim,), mode='linear'
).squeeze(1)

print("len(resized_text_embeds): ", len(resized_text_embeds))
# saving text embeddings in text_feat.npy
# .cpu() is a no-op for CPU tensors and keeps the conversion valid if the
# embeddings were computed on another device.
numpy_array = resized_text_embeds.detach().cpu().numpy()
print("len(numpy_array): ", len(numpy_array))
# Already one (num_items, resized_dim) array, so no per-row stacking is needed.
stacked_resized_text_embeds = numpy_array
print("stacked_resized_text_embeds.shape: ", stacked_resized_text_embeds.shape)
# np.save does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, stacked_resized_text_embeds)
print("saved text embeddings in text_feat.npy")