In [None]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pickle
from lingua import Language, LanguageDetectorBuilder
tqdm.pandas()


  from tqdm.autonotebook import tqdm, trange


In [None]:
books_list = []

with open('../Pickle/books.pkl', 'rb') as file:
    while True:
        try:
            chunk = pickle.load(file)
            books_list.append(chunk)
        except EOFError:
            break
books = pd.concat(books_list, ignore_index=True)


In [None]:
books = books[books['filtered_genres'].apply(lambda x: bool(x))]

In [None]:
include_language_codes = ['', ' ', 'eng', 'en-US', 'en-GB', '--', 'en-CA', 'en-IN']
books = books[books['language_code'].isin(include_language_codes)]


In [None]:
detector = LanguageDetectorBuilder.from_all_languages().build()
def detect_language(text):
    """
    Detects whether the input text is in English using the LanguageDetector.
    
    Parameters:
    - text (str): The text to detect language for.
    
    Returns:
    - bool: True if the text is in English, False otherwise.
    """
    if not text or pd.isna(text):
        return False
    text = text[:250]

    try:
        detected_lang = detector.detect_language_of(text)
        return detected_lang == Language.ENGLISH
    except Exception:
        return False


chunk_size = 5000
save_every = 10
output_pickle = "../Pickle/books_filtered.pkl"

if os.path.exists(output_pickle):
    books_filtered = pd.read_pickle(output_pickle)
    processed_books = set(books_filtered["book_id"])
else:
    books_filtered = pd.DataFrame()
    processed_books = set()

books_to_process = books[~books["book_id"].isin(processed_books)]

if books_to_process.empty:
    print("all books processed")
else:
    print(f"Processing {len(books_to_process)}")

    buffer = []
    for i, start in enumerate(tqdm(range(0, len(books_to_process), chunk_size), desc="processing")):
        end = min(start + chunk_size, len(books_to_process))
        books_chunk = books_to_process.iloc[start:end].copy()
        books_chunk["is_english"] = books_chunk["description"].progress_apply(detect_language)
        books_chunk = books_chunk[books_chunk["is_english"]].drop(columns=["is_english"])
        buffer.append(books_chunk)

        if (i + 1) % save_every == 0 or (i + 1) == len(range(0, len(books_to_process), chunk_size)):
            buffer_df = pd.concat(buffer, ignore_index=True)
            books_filtered = pd.concat([books_filtered, buffer_df], ignore_index=True)
            books_filtered.to_pickle(output_pickle)
            buffer = []


In [None]:
eng_books = pd.read_pickle("../Pickle/books_filtered.pkl")

In [5]:
books = eng_books

In [None]:
books['combined_features'] = books.apply(
    lambda row: f"{row['title']} by {row['authors']}, " +
                f"Description: {row['description']}, " +
                f"Shelves: {row['expanded_shelves']}" +
                f"Genres: {row['filtered_genres']}",
    axis=1
)

In [None]:
books = books.reset_index(drop=True)

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def save_embeddings_incrementally(books_df, model, interval=100):
    """
    Generates and saves sentence embeddings for book descriptions incrementally.
    
    Parameters:
    - books_df (pd.DataFrame): DataFrame containing book_id and combined_features columns.
    - model (SentenceTransformer): Preloaded SentenceTransformer model used for encoding.
    - interval (int): Number of new embeddings after which to save progress.
    
    Returns:
    - None. Saves embeddings incrementally to a pickle file.
    """
    embeddings_file = '../Pickle/embeddings.pkl'
    
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['book_id', 'embeddings'])
    
    books_df = books_df.dropna(subset=['combined_features']).reset_index(drop=True)
    new_embeddings = []

    for i in tqdm(range(len(books_df)), desc="processing"):
        book_id = books_df.at[i, 'book_id']
        if book_id in embeddings_df['book_id'].values:
            continue
        
        embedding = model.encode(books_df.at[i, 'combined_features'])
        new_row = {'book_id': book_id, 'embeddings': embedding}
        new_embeddings.append(new_row)
        
        if len(new_embeddings) % interval == 0:
            new_embeddings_df = pd.DataFrame(new_embeddings)
            embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
            embeddings_df.to_pickle(embeddings_file)
            new_embeddings = []
    
    if new_embeddings:
        new_embeddings_df = pd.DataFrame(new_embeddings)
        embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
        embeddings_df.to_pickle(embeddings_file)


In [None]:
save_embeddings_incrementally(books, model, interval=100)

In [6]:
embeddings_df = pd.read_pickle('../Pickle/embeddings.pkl')

In [8]:
embeddings_df=embeddings_df.drop(columns=['index'])

In [None]:
embeddings_df.set_index('book_id', inplace=True)

def get_embedding(book_id):
    try:
        return embeddings_df.at[book_id, 'embeddings']
    except KeyError:
        return None

books['embeddings'] = books['book_id'].progress_apply(get_embedding)


100%|██████████| 577082/577082 [00:02<00:00, 210510.98it/s]


In [None]:
chunk_size = 10000
num_chunks = len(books) // chunk_size + 1
progress_bar = tqdm(total=len(books))
with open('../Pickle/books.pkl', 'wb') as file:
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size
        chunk = books.iloc[start_idx:end_idx]
        for _, row in chunk.iterrows():
            progress_bar.update(1)
        if i == 0:
            pickle.dump(chunk, file)
        else:
            pickle.dump(chunk, file)
progress_bar.close()

100%|██████████| 577082/577082 [03:29<00:00, 2760.12it/s] 
