In [1]:
import pandas as pd
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
import pickle
import pandas as pd

# Load the pickled chunks and concatenate them
books_list = []

with open('../Pickle/books.pkl', 'rb') as file:
    while True:
        try:
            chunk = pickle.load(file)
            books_list.append(chunk)
        except EOFError:
            break  # Stop when end of file is reached

# Combine chunks into a single DataFrame
books = pd.concat(books_list, ignore_index=True)


In [3]:
books.head()

Unnamed: 0,language_code,description,authors,book_id,title,average_rating,title_without_series,filtered_genres,expanded_shelves,embeddings
0,eng,The war against Voldemort is not going well: e...,"[J.K. Rowling, Mary GrandPre]",1,Harry Potter and the Half-Blood Prince (Harry ...,4.54,Harry Potter and the Half-Blood Prince (Harry ...,"fantasy, paranormal, young-adult, fiction, chi...",fantasy fantasy fantasy fantasy fantasy fantas...,"[-0.07355614, -0.0043452834, 0.07326843, 0.010..."
1,en-US,"Bill Bryson's first travel book, The Lost Cont...",[Bill Bryson],27,Neither Here nor There: Travels in Europe,3.88,Neither Here nor There: Travels in Europe,"non-fiction, history, historical fiction, biog...",travel travel travel travel travel travel trav...,"[0.075617716, 0.049203802, -0.08219313, 0.0366..."
2,eng,"Gerald Samper, an effete English snob, has his...",[James Hamilton-Paterson],40,"Cooking with Fernet Branca (Gerald Samper, #1)",3.61,"Cooking with Fernet Branca (Gerald Samper, #1)",fiction,fiction fiction fiction fiction fiction fictio...,"[0.012406698, -0.08666229, -0.041537926, 0.068..."
3,eng,"Rails is a full-stack, open source web framewo...","[Dave Thomas, David Heinemeier Hansson, Leon B...",45,Agile Web Development with Rails: A Pragmatic ...,3.9,Agile Web Development with Rails: A Pragmatic ...,non-fiction,rails rails rails rails rails rails rails rail...,"[0.014417239, -0.079731256, -0.09433866, 0.043..."
4,,Wharton's final novel (completed by Marion Mai...,"[Edith Wharton, Marion Mainwaring]",48,The Bucaneers,3.89,The Bucaneers,"fiction, history, historical fiction, biograph...",classics classics classics classics classics c...,"[0.0069107036, -0.1147475, -0.013718575, 0.003..."


In [None]:
books = books[books['filtered_genres'].apply(lambda x: bool(x))]

In [None]:
include_language_codes = ['', ' ', 'eng', 'en-US', 'en-GB', '--', 'en-CA', 'en-IN']
books = books[books['language_code'].isin(include_language_codes)]


In [None]:
import pandas as pd
from lingua import Language, LanguageDetectorBuilder
from tqdm import tqdm
import os

# Initialize tqdm for Pandas apply
tqdm.pandas()

# Initialize the language detector
detector = LanguageDetectorBuilder.from_all_languages().build()

# Function to detect language
def detect_language(text):
    if not text or pd.isna(text):  # Handle empty or NaN values
        return False
    text = text[:250]  # Limit text length for faster processing

    try:
        detected_lang = detector.detect_language_of(text)
        return detected_lang == Language.ENGLISH  # Direct Enum comparison
    except Exception:
        return False  # Assume non-English in case of an error

# Define chunk size and output file
chunk_size = 5000
save_every = 10  # Save to pickle every 10 chunks
output_pickle = "../Pickle/books_filtered.pkl"

# Load existing processed books if the file exists
if os.path.exists(output_pickle):
    books_filtered = pd.read_pickle(output_pickle)
    processed_books = set(books_filtered["book_id"])  # Keep track of already processed book IDs
else:
    books_filtered = pd.DataFrame()  # Empty DataFrame to start with
    processed_books = set()  # No books processed yet

# Process only unprocessed books
books_to_process = books[~books["book_id"].isin(processed_books)]

if books_to_process.empty:
    print("All books are already processed. No new data to process.")
else:
    print(f"Processing {len(books_to_process)} new books...")

    buffer = []
    for i, start in enumerate(tqdm(range(0, len(books_to_process), chunk_size), desc="Processing in chunks")):
        end = min(start + chunk_size, len(books_to_process))
        books_chunk = books_to_process.iloc[start:end].copy()  # Ensure it's a proper DataFrame

        # Apply language detection
        books_chunk["is_english"] = books_chunk["description"].progress_apply(detect_language)

        # Filter for English descriptions
        books_chunk = books_chunk[books_chunk["is_english"]].drop(columns=["is_english"])

        # Add to buffer
        buffer.append(books_chunk)

        # Every `save_every` chunks, concatenate and save
        if (i + 1) % save_every == 0 or (i + 1) == len(range(0, len(books_to_process), chunk_size)): 
            buffer_df = pd.concat(buffer, ignore_index=True)
            books_filtered = pd.concat([books_filtered, buffer_df], ignore_index=True)
            books_filtered.to_pickle(output_pickle)  # Save to pickle
            buffer = []  # Clear buffer

In [None]:
eng_books = pd.read_pickle("../Pickle/books_filtered.pkl")

In [5]:
books = eng_books

In [None]:
books['combined_features'] = books.apply(
    lambda row: f"{row['title']} by {row['authors']}, " +
                f"Description: {row['description']}, " +
                f"Shelves: {row['expanded_shelves']}" +
                f"Genres: {row['filtered_genres']}",
    axis=1
)

In [None]:
books = books.reset_index(drop=True)

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Initialize your model
model = SentenceTransformer('all-MiniLM-L6-v2')

def save_embeddings_incrementally(books_df, model, interval=100):
    embeddings_file = '../Pickle/embeddings.pkl'
    
    # Load existing embeddings if they exist
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['book_id', 'embeddings'])
    
    # Ensure combined_features are non-null
    books_df = books_df.dropna(subset=['combined_features']).reset_index(drop=True)

    new_embeddings = []

    for i in tqdm(range(len(books_df)), desc="Generating embeddings"):
        book_id = books_df.at[i, 'book_id']
        
        # Skip if the book_id is already processed
        if book_id in embeddings_df['book_id'].values:
            continue
        
        # Generate embedding
        embedding = model.encode(books_df.at[i, 'combined_features'])
        
        # Prepare new row
        new_row = {'book_id': book_id, 'embeddings': embedding}
        new_embeddings.append(new_row)
        
        # Save periodically
        if len(new_embeddings) % interval == 0:
            new_embeddings_df = pd.DataFrame(new_embeddings)
            embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
            embeddings_df.to_pickle(embeddings_file)
            new_embeddings = []  # Reset the list
    
    # Save any remaining new embeddings
    if new_embeddings:
        new_embeddings_df = pd.DataFrame(new_embeddings)
        embeddings_df = pd.concat([embeddings_df, new_embeddings_df], ignore_index=True)
        embeddings_df.to_pickle(embeddings_file)

    print(f"Embeddings saved to {embeddings_file} successfully!")



In [None]:
save_embeddings_incrementally(books, model, interval=100)

In [6]:
embeddings_df = pd.read_pickle('../Pickle/embeddings.pkl')

In [8]:
embeddings_df=embeddings_df.drop(columns=['index'])

In [9]:
tqdm.pandas()
embeddings_df.set_index('book_id', inplace=True)

def get_embedding(book_id):
    try:
        return embeddings_df.at[book_id, 'embeddings']
    except KeyError:
        return None

books['embeddings'] = books['book_id'].progress_apply(get_embedding)


100%|██████████| 577082/577082 [00:02<00:00, 210510.98it/s]


In [19]:
import pandas as pd
import pickle
from tqdm import tqdm

# Ensure the progress_apply method from tqdm is used
tqdm.pandas()

# Split the DataFrame into chunks and save each chunk to a pickle file with a progress bar
chunk_size = 10000  # Adjust the chunk size as needed
num_chunks = len(books) // chunk_size + 1

# Initialize the progress bar for rows
progress_bar = tqdm(total=len(books))

# Open the pickle file once before the loop
with open('../Pickle/books.pkl', 'wb') as file:
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size
        chunk = books.iloc[start_idx:end_idx]
        
        # Update progress bar for each row
        for _, row in chunk.iterrows():
            progress_bar.update(1)
        
        # Append each chunk to the pickle file
        if i == 0:
            # For the first chunk, use "wb" (write binary) mode
            pickle.dump(chunk, file)
        else:
            # For subsequent chunks, use "ab" (append binary) mode
            pickle.dump(chunk, file)

# Close the progress bar
progress_bar.close()

100%|██████████| 577082/577082 [03:29<00:00, 2760.12it/s] 
