In [1]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load the pickled books table that every later cell works on.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
books = pd.read_pickle(filepath_or_buffer='Pickle/books.pkl')

  from tqdm.autonotebook import tqdm, trange


In [7]:
# Number of rows (books) currently loaded.
books.shape[0]

363312

In [3]:
# Build the single text string per book that is later passed to the sentence
# encoder: title/authors, description, shelves and genres, each segment
# separated by ", ".
#
# Fix: the original concatenation had no separator between the "Shelves" and
# "Genres" segments, so the last shelf token was fused with the "Genres:" label
# (e.g. "...fantasyGenres: ..."). A ", " is now emitted there like everywhere else.
#
# A zip-based comprehension replaces the row-wise apply(axis=1): the formatting
# of each value is the same f-string interpolation, but no per-row Series is
# built and an empty frame simply yields an empty column.
#
# NOTE: changing this text changes the resulting embeddings; remove any stale
# 'Pickle/embeddings.pkl' checkpoint before re-encoding so old and new vectors
# are not mixed.
books['combined_features'] = [
    f"{title} by {authors}, "
    f"Description: {description}, "
    f"Shelves: {shelves}, "
    f"Genres: {genres}"
    for title, authors, description, shelves, genres in zip(
        books['title'],
        books['authors'],
        books['description'],
        books['expanded_shelves'],
        books['filtered_genres'],
    )
]

In [4]:
# Show the first row as a sanity check of the new 'combined_features' column.
books.iloc[:1]

Unnamed: 0,description,authors,book_id,title,url,average_rating,expanded_shelves,filtered_genres,combined_features
0,The war against Voldemort is not going well: e...,"[J.K. Rowling, Mary GrandPre]",1,Harry Potter and the Half-Blood Prince (Harry ...,https://www.goodreads.com/book/show/1.Harry_Po...,4.54,fantasy fantasy fantasy fantasy fantasy fantas...,"fantasy, paranormal, young-adult, fiction, chi...",Harry Potter and the Half-Blood Prince (Harry ...


In [5]:
# Replace the current row labels with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a new column.
books = books.reset_index(drop=True)

In [8]:
# Instantiate the SentenceTransformer identified by the name 'all-MiniLM-L6-v2'.
# This object is passed to save_embeddings_incrementally, which calls its
# encode() method once per book.
model = SentenceTransformer('all-MiniLM-L6-v2')

def _write_embeddings_checkpoint(embeddings_df, embeddings_file):
    """Pickle ``embeddings_df`` to ``embeddings_file`` without leaving a half-written file."""
    directory, filename = os.path.split(os.fspath(embeddings_file))
    if directory:
        os.makedirs(directory, exist_ok=True)
    # The temp name keeps the original extension so pandas infers the same
    # compression for it as for the final file.
    temp_file = os.path.join(directory, f"tmp_{filename}")
    embeddings_df.to_pickle(temp_file)
    # os.replace swaps the file in one step, so an interrupt during the write
    # above cannot corrupt the previous checkpoint.
    os.replace(temp_file, embeddings_file)


def _append_embedding_rows(embeddings_df, rows, embeddings_file):
    """Append ``rows`` (list of dicts) to ``embeddings_df``, checkpoint it, and return the result."""
    fresh = pd.DataFrame(rows)
    if embeddings_df.empty:
        # Avoid concatenating with an empty, dtype-less frame.
        combined = fresh
    else:
        combined = pd.concat([embeddings_df, fresh], ignore_index=True)
    _write_embeddings_checkpoint(combined, embeddings_file)
    return combined


def save_embeddings_incrementally(books_df, model, interval=100,
                                  embeddings_file='Pickle/embeddings.pkl'):
    """Encode every book's ``combined_features`` and checkpoint the results to disk.

    The function is resumable: if ``embeddings_file`` already exists, rows whose
    positional index is recorded in its ``'index'`` column are skipped.

    Parameters
    ----------
    books_df : pandas.DataFrame
        Must contain the columns ``'combined_features'`` and ``'book_id'``.
        Rows with a null ``'combined_features'`` are dropped and the remaining
        rows are renumbered 0..n-1; that position is the resume key.
    model : object
        Anything exposing ``encode(text)``; called once per unprocessed row.
    interval : int, default 100
        Number of newly encoded rows between checkpoints. Must be >= 1.
    embeddings_file : str or path-like, default 'Pickle/embeddings.pkl'
        Pickle file holding a DataFrame with the columns ``'index'``,
        ``'book_id'`` and ``'embeddings'``. Its directory is created if missing.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.

    Notes
    -----
    The resume key is positional, so resuming is only meaningful when
    ``books_df`` has the same rows in the same order as in the earlier run.
    """
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    # Load existing embeddings if they exist.
    if os.path.exists(embeddings_file):
        # NOTE: unpickling can execute arbitrary code; only resume from
        # checkpoint files this function wrote.
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'book_id', 'embeddings'])

    # Ensure combined_features are non-null.
    books_df = books_df.dropna(subset=['combined_features']).reset_index(drop=True)

    # Built once: each position is visited a single time, so rows added during
    # this run never need to be looked up again. A set makes the check O(1)
    # instead of scanning the whole 'index' column for every row.
    processed = set(embeddings_df['index'].tolist())

    pending = []
    for i in tqdm(range(len(books_df)), desc="Generating embeddings"):
        if i in processed:
            continue  # Skip if already processed

        embedding = model.encode(books_df.at[i, 'combined_features'])
        pending.append(
            {'index': i, 'book_id': books_df.at[i, 'book_id'], 'embeddings': embedding}
        )

        # Save periodically.
        if len(pending) >= interval:
            embeddings_df = _append_embedding_rows(embeddings_df, pending, embeddings_file)
            pending = []

    # Save any remaining new embeddings.
    if pending:
        embeddings_df = _append_embedding_rows(embeddings_df, pending, embeddings_file)

    print(f"Embeddings saved to {embeddings_file} successfully!")




In [None]:
# Encode all books, checkpointing to disk after every 100 new embeddings.
save_embeddings_incrementally(books_df=books, model=model, interval=100)

Generating embeddings:  58%|█████▊    | 210599/363312 [4:13:47<2:43:49, 15.54it/s]  

In [60]:
# Read back the checkpoint written by save_embeddings_incrementally.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
embeddings_df = pd.read_pickle(filepath_or_buffer='Pickle/embeddings.pkl')

In [61]:
# 'index' is only the positional resume key stored alongside each embedding;
# it is not needed once the embeddings are loaded.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
embeddings_df = embeddings_df.drop(columns=['index'], errors='ignore')

In [None]:
# Register tqdm's pandas integration so Series.progress_apply is available.
tqdm.pandas()

# Index the embeddings by book_id so they can be looked up by label.
# Guarded and non-inplace so the cell can be re-run: a second in-place
# set_index('book_id') raises KeyError because the column has already been
# moved into the index.
if embeddings_df.index.name != 'book_id':
    embeddings_df = embeddings_df.set_index('book_id')

def get_embedding(book_id):
    """Return the 'embeddings' entry stored for ``book_id`` in the global
    ``embeddings_df``, or ``None`` when the lookup raises ``KeyError``.
    """
    try:
        vector = embeddings_df.at[book_id, 'embeddings']
    except KeyError:
        # No row labelled with this book_id.
        return None
    else:
        return vector

# Look up each book's embedding by its book_id, showing a progress bar
# (progress_apply is provided by the tqdm.pandas() call above). Books for
# which get_embedding finds no entry receive None.
books['embeddings'] = books['book_id'].progress_apply(get_embedding)


Merging embeddings into the 'books' DataFrame...


  0%|          | 0/120421 [00:00<?, ?it/s]

100%|██████████| 120421/120421 [00:01<00:00, 94554.65it/s] 


In [66]:
# Persist the books table, now carrying 'combined_features' and 'embeddings'.
# NOTE: this overwrites the same 'Pickle/books.pkl' that the first cell loads.
books.to_pickle(path='Pickle/books.pkl')