In [6]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
tqdm.pandas()

In [7]:
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [8]:
reviews = reviews.sample(500000, random_state=42)

In [9]:
# Initialize your model
model = SentenceTransformer('all-MiniLM-L6-v2')

def save_embeddings_incrementally(reviews_df, model, interval=100):
    embeddings_file = 'Pickle/review_embeddings.pkl'
    
    # Load existing embeddings if they exist
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'review_id', 'embeddings'])
    
    # Ensure combined_features are non-null
    reviews_df = reviews_df.dropna(subset=['review_text']).reset_index(drop=True)

    new_embeddings = []

    for i in tqdm(range(len(reviews_df)), desc="Generating embeddings"):
        if i in embeddings_df['index'].values:
            continue  # Skip if already processed
        
        embedding = model.encode(reviews_df.at[i, 'review_text'])
        new_row = {'index': i, 'review_id': reviews_df.at[i, 'review_id'], 'embeddings': embedding}
        new_embeddings.append(new_row)
        
        # Save periodically
        if len(new_embeddings) % interval == 0:
            new_embeddings_df = pd.DataFrame(new_embeddings)
            embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
            embeddings_df.to_pickle(embeddings_file)
            new_embeddings = []  # Reset the list
    
    # Save any remaining new embeddings
    if new_embeddings:
        new_embeddings_df = pd.DataFrame(new_embeddings)
        embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
        embeddings_df.to_pickle(embeddings_file)

    print(f"Embeddings saved to {embeddings_file} successfully!")




In [10]:
# Save embeddings incrementally
save_embeddings_incrementally(reviews, model, interval=100)

Generating embeddings: 100%|██████████| 500000/500000 [7:02:16<00:00, 19.73it/s]    

Embeddings saved to Pickle/review_embeddings.pkl successfully!





In [None]:
embeddings_df = pd.read_pickle('Pickle/review_embeddings.pkl')

In [None]:
embeddings_df

In [None]:
embeddings_df.set_index('review_id', inplace=True)

def get_embedding(review_id):
    try:
        return embeddings_df.at[review_id, 'embeddings']
    except KeyError:
        return None

reviews['embeddings'] = reviews['review_id'].progress_apply(get_embedding)


In [None]:
#reviews.to_pickle('Pickle/reviews.pkl')