In [1]:
import os

import pandas as pd
import torch
from sklearn.cluster import KMeans
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel


In [2]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained DistilBERT tokenizer and encoder; the encoder lives on `device`.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = model.to(device)

def get_bert_embeddings(sentences, batch_size=32):
    """Embed every sentence with the module-level `model` and `tokenizer`.

    Sentences are tokenized batch by batch (padded to the longest item in the
    batch, truncated to 512 tokens), passed through `model` with gradient
    tracking disabled, and the hidden state at sequence position 0 (the
    [CLS] token) is kept as the sentence embedding.

    Side effect: `model` is switched to eval mode and left that way.

    Args:
        sentences: The texts to embed. Lists and tuples are used as-is; any
            other iterable (e.g. a pandas Series) is materialised into a list
            so that batch slices are plain Python sequences.
        batch_size: Number of sentences per forward pass. Must be >= 1.

    Returns:
        A CPU tensor with one row per sentence. For empty input this is a
        tensor of shape (0, model.config.hidden_size) instead of an error.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Strings are deliberately left alone so they keep their slicing behaviour.
    if not isinstance(sentences, (list, tuple, str)):
        sentences = list(sentences)

    model.eval()

    # torch.cat rejects an empty list, so short-circuit when there is no input.
    if len(sentences) == 0:
        return torch.empty((0, model.config.hidden_size))

    embeddings = []
    progress_bar = tqdm(range(0, len(sentences), batch_size), desc="Generating Embeddings")

    with torch.no_grad():
        for i in progress_bar:
            batch_sentences = sentences[i:i+batch_size]
            inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :]  # Use the embedding of the [CLS] token
            # Keep results on the CPU so accelerator memory is not held across batches.
            embeddings.append(batch_embeddings.cpu())

    return torch.cat(embeddings, dim=0)



In [3]:
from utils import count_empty_strings, count_nan_values

# Load the pre-processed review and product samples.
df_reviews = pd.read_csv('data/reviews_sampled_processed.csv')
df_products = pd.read_csv('data/products_sampled_processed.csv')

# Disabled clean-up: drop rows whose text fields are missing.
#df_reviews = df_reviews.dropna(subset=['reviewText', 'summary'])
#df_products = df_products.dropna(subset=['description'])

# Report missing and empty values for the reviews table.
print('Nan values per feature: \n', count_nan_values(df_reviews))
print('Empty values per feature: \n', count_empty_strings(df_reviews))

# The same report for the products table is currently disabled.
#print('Nan values per feature: \n', count_nan_values(df_products))
#print('Empty values per feature: \n', count_empty_strings(df_products))

Nan values per feature: 
 Series([], dtype: int64)
Empty values per feature: 
 Series([], dtype: int64)


In [4]:
# Raw text columns to embed; each embedding tensor keeps the row order of its list.
reviews = df_reviews['reviewText'].tolist()
summaries = df_reviews['summary'].tolist()
descriptions = df_products['description'].tolist()
titles = df_products['title'].tolist()
features = df_products['feature'].tolist()

# NOTE: these lists are not checked for NaN or empty strings before embedding.

# Set to True to ignore any cached .pt files and regenerate every embedding.
FORCE_RECOMPUTE = False


def _load_or_compute_embeddings(texts, cache_path, force=False):
    """Return embeddings for `texts`, reusing `cache_path` when possible.

    A cached tensor is reused only if the file exists, `force` is false, and
    its number of rows equals `len(texts)`. Otherwise the embeddings are
    computed with `get_bert_embeddings` and written to `cache_path` right
    away, so an interruption during a later column does not discard them.

    Args:
        texts: List of texts to embed.
        cache_path: Path of the .pt file used to store the embeddings.
        force: When true, skip the cache and always recompute.

    Returns:
        The embedding tensor for `texts`.
    """
    if not force and os.path.exists(cache_path):
        cached = torch.load(cache_path)
        # A row-count mismatch means the cache was built from different data.
        if cached.shape[0] == len(texts):
            return cached
    embeddings = get_bert_embeddings(texts)
    torch.save(embeddings, cache_path)
    return embeddings


# Each column is saved as soon as it is finished; re-running the cell resumes
# from whatever is already on disk instead of starting over.
rev_embeddings = _load_or_compute_embeddings(reviews, 'data/review_embeddings.pt', FORCE_RECOMPUTE)
summ_embeddings = _load_or_compute_embeddings(summaries, 'data/summary_embeddings.pt', FORCE_RECOMPUTE)
desc_embeddings = _load_or_compute_embeddings(descriptions, 'data/description_embeddings.pt', FORCE_RECOMPUTE)
titles_embeddings = _load_or_compute_embeddings(titles, 'data/title_embeddings.pt', FORCE_RECOMPUTE)
features_embeddings = _load_or_compute_embeddings(features, 'data/feature_embeddings.pt', FORCE_RECOMPUTE)

Generating Embeddings:   0%|          | 0/1424 [00:03<?, ?it/s]


KeyboardInterrupt: 