In [2]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
tqdm.pandas()

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the pickled reviews table. It is used below as a DataFrame that must
# provide `review_text` and `review_id` columns.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
reviews = pd.read_pickle('../Pickle/reviews.pkl')

In [4]:
# Sentence-embedding model, built from the 'all-MiniLM-L6-v2' checkpoint name;
# its `encode` method is what turns each review text into an embedding below.
model = SentenceTransformer('all-MiniLM-L6-v2')

def _append_rows(embeddings_df, new_rows):
    """Return ``embeddings_df`` extended with ``new_rows`` (a list of row dicts)."""
    new_embeddings_df = pd.DataFrame(new_rows)
    if embeddings_df.empty:
        # Concatenating onto an empty placeholder frame is deprecated in recent
        # pandas and makes the result dtypes depend on the pandas version, so
        # the first batch simply becomes the frame.
        return new_embeddings_df
    return pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)


def _write_checkpoint(embeddings_df, embeddings_file):
    """Pickle ``embeddings_df`` to ``embeddings_file`` without a torn-write window."""
    # Keep the original extension on the temp name so pandas infers the same
    # compression for the temp file as it would for the final file.
    root, ext = os.path.splitext(embeddings_file)
    tmp_file = f"{root}.tmp{ext}"
    embeddings_df.to_pickle(tmp_file)
    # os.replace swaps the file in one step: an interrupt during the (slow)
    # pickle write can no longer corrupt the previous checkpoint.
    os.replace(tmp_file, embeddings_file)


def save_embeddings_incrementally(reviews_df, model, interval=100,
                                  embeddings_file='../Pickle/review_embeddings.pkl'):
    """Encode every review text and checkpoint the embeddings to a pickle file.

    Rows whose ``review_text`` is missing are dropped and the remaining rows are
    renumbered from 0. That position is stored in the ``index`` column and is
    the key used to skip rows that an earlier (possibly interrupted) run already
    saved, so a resumed run must be given the same ``reviews_df`` in the same
    row order.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain ``review_text`` and ``review_id`` columns.
    model : object
        Anything with an ``encode(text)`` method returning the embedding for a
        single text (e.g. a ``SentenceTransformer``).
    interval : int, default 100
        Number of newly encoded rows to accumulate between checkpoints.
        Must be at least 1.
    embeddings_file : str, default '../Pickle/review_embeddings.pkl'
        Checkpoint path. If it exists it is loaded and extended; otherwise it
        is created on the first checkpoint.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.

    Notes
    -----
    Texts are encoded one at a time so the stored vectors match earlier runs.
    TODO(review): passing a list of texts to ``model.encode`` is probably much
    faster for SentenceTransformer models -- confirm before switching.
    """
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    if os.path.exists(embeddings_file):
        # NOTE: unpickling can execute arbitrary code; only resume from a
        # checkpoint file that this function wrote.
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'review_id', 'embeddings'])

    reviews_df = reviews_df.dropna(subset=['review_text']).reset_index(drop=True)

    # Built once: a set makes the "already processed?" test O(1) per row
    # instead of a linear scan over every saved row. Rows added during this run
    # are never revisited, so the set does not need updating inside the loop.
    processed = set(embeddings_df['index'].tolist())

    new_embeddings = []

    for i in tqdm(range(len(reviews_df)), desc="Generating embeddings"):
        if i in processed:
            continue  # Skip if already processed

        embedding = model.encode(reviews_df.at[i, 'review_text'])
        new_embeddings.append(
            {'index': i, 'review_id': reviews_df.at[i, 'review_id'], 'embeddings': embedding}
        )

        # Save periodically
        if len(new_embeddings) >= interval:
            embeddings_df = _append_rows(embeddings_df, new_embeddings)
            _write_checkpoint(embeddings_df, embeddings_file)
            new_embeddings = []

    # Save any remaining rows that did not fill a whole interval
    if new_embeddings:
        embeddings_df = _append_rows(embeddings_df, new_embeddings)
        _write_checkpoint(embeddings_df, embeddings_file)

    print(f"Embeddings saved {embeddings_file}")


In [5]:
# Embed every review, checkpointing to disk after each 10,000 newly encoded
# rows. Long-running: the recorded run below took roughly 23 hours.
save_embeddings_incrementally(reviews, model, interval=10000)

Generating embeddings: 100%|██████████| 1001000/1001000 [22:59:22<00:00, 12.09it/s]   

Embeddings saved ../Pickle/review_embeddings.pkl



