In [None]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
# Register tqdm's pandas integration (provides `progress_apply` on pandas objects).
tqdm.pandas()

In [None]:
# Load the reviews DataFrame from its pickle file (path is relative to the working directory).
reviews = pd.read_pickle('../Pickle/reviews.pkl')

In [None]:
# Instantiate the SentenceTransformer model named 'all-MiniLM-L6-v2'; it is used to encode review texts.
model = SentenceTransformer('all-MiniLM-L6-v2')

def _append_rows(embeddings_df, rows):
    """Return `embeddings_df` extended with `rows` (a list of row dicts)."""
    new_df = pd.DataFrame(rows)
    if embeddings_df.empty:
        # Concatenating onto an empty placeholder frame is deprecated in pandas
        # and would force object dtypes; the new rows alone are the result.
        return new_df
    return pd.concat([embeddings_df, new_df], ignore_index=True)


def _write_checkpoint(embeddings_df, path):
    """Pickle `embeddings_df` to `path` without leaving a half-written file there."""
    # Keep the original extension on the temp file so pandas infers the same
    # compression for both the write here and the later read of `path`.
    root, ext = os.path.splitext(path)
    tmp_path = root + '.tmp' + ext
    embeddings_df.to_pickle(tmp_path)
    # os.replace swaps the file in one step, so an interruption during the
    # write above can only damage the temp file, never the last good checkpoint.
    os.replace(tmp_path, path)


def save_embeddings_incrementally(reviews_df, model, interval=100,
                                  embeddings_file='../Pickle/review_embeddings.pkl'):
    """
    Compute sentence embeddings for reviews and checkpoint them to a pickle file.

    Rows whose 'review_text' is missing are dropped and the remaining rows are
    renumbered from 0. Each row is encoded with `model.encode` and stored as a
    record with the columns 'index' (the renumbered position), 'review_id' and
    'embeddings'.

    If `embeddings_file` already exists it is loaded first, and every position
    already present in its 'index' column is skipped. Because the resume key is
    the position after dropping missing texts, a resumed run must be given the
    same `reviews_df` (same rows, same order) as the run that wrote the file.

    Parameters:
    - reviews_df (pd.DataFrame): DataFrame with the columns 'review_id' and 'review_text'.
    - model: Object exposing `encode(text)`, e.g. a SentenceTransformer.
    - interval (int): Number of newly encoded reviews between checkpoints. Default is 100.
    - embeddings_file (str): Path of the pickle file to resume from and write to.

    Returns:
    - None: The result is the updated pickle file.

    Raises:
    - ValueError: If `interval` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer, got %r" % (interval,))

    if os.path.exists(embeddings_file):
        # NOTE: unpickling executes code from the file; only point this at
        # checkpoints written by this function or another trusted source.
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'review_id', 'embeddings'])

    reviews_df = reviews_df.dropna(subset=['review_text']).reset_index(drop=True)

    # Build the lookup once: positions are visited in increasing order, so rows
    # added during this run are never checked again and a snapshot is enough.
    already_done = set(embeddings_df['index'].tolist())

    pending = []
    id_text_pairs = zip(reviews_df['review_id'], reviews_df['review_text'])
    progress = tqdm(enumerate(id_text_pairs), total=len(reviews_df), desc="processing")
    for i, (review_id, review_text) in progress:
        if i in already_done:
            continue

        pending.append({
            'index': i,
            'review_id': review_id,
            'embeddings': model.encode(review_text),
        })

        if len(pending) >= interval:
            embeddings_df = _append_rows(embeddings_df, pending)
            _write_checkpoint(embeddings_df, embeddings_file)
            pending = []

    # Flush whatever is left after the last full interval.
    if pending:
        embeddings_df = _append_rows(embeddings_df, pending)
        _write_checkpoint(embeddings_df, embeddings_file)

In [None]:
# Embed all reviews, writing the embeddings pickle after every 10,000 newly encoded reviews.
save_embeddings_incrementally(reviews, model, interval=10000)