In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!pip install rapidfuzz gradio sentence-transformers datasets faiss-cpu

Collecting rapidfuzz
  Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting gradio
  Downloading gradio-5.13.1-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 k

In [7]:
import pandas as pd
from rapidfuzz import process
import numpy as np
import gradio as gr
from datasets import Dataset
import faiss
from itertools import combinations
from sentence_transformers import SentenceTransformer, util , InputExample
from sentence_transformers import losses
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

In [2]:
%cd '/content/drive/MyDrive/github/book_recommendation/book_recommender'

/content/drive/MyDrive/github/book_recommendation/book_recommender


In [71]:
data = pd.read_csv('books_summary.csv')
# Remove duplicates where both 'book_name' and 'categories' are identical
data = data.drop_duplicates(subset=['book_name'], keep='first').reset_index(drop=True)
# Drop rows with missing values in 'book_name' and 'summaries'
missing_or_empty_rows = data[
(data['book_name'].isna() | (data['book_name'].str.strip() == "") | (data['book_name'].str.strip() == ".")) |
(data['summaries'].isna() | (data['summaries'].str.strip() == "") | (data['summaries'].str.strip() == "."))]

# Drop these rows
data_cleaned = data.drop(missing_or_empty_rows.index)



In [72]:
missing_or_empty_rows.shape

(6, 4)

In [73]:
missing_or_empty_rows

Unnamed: 0.1,Unnamed: 0,book_name,summaries,categories
12,22,This Is Your Mind On Plants,,science
412,598,The Year of Magical Thinking,,relationships
595,829,The Power Of Full Engagement,,relationships
869,1403,The Little Book That (Still) Beats The Market ...,.,money
998,1796,Think Like A Freak teaches you how to reject c...,.,productivity
1003,1802,The Eureka Factor lays out the history of so-c...,.,productivity


In [74]:
# Reset the index after dropping rows (optional)
data_cleaned.reset_index(drop=True, inplace=True)


In [53]:
# Group categories for each book
data = data.groupby('book_name', as_index=False).agg({
    'summaries': 'first',  # Retain the first summary
    'categories': ', '.join  # Combine categories into a single string
})


In [55]:
# Split categories into a list
data['categories_list'] = data['categories'].str.split(', ')

data

Unnamed: 0,book_name,summaries,categories,categories_list
0,"Outer Order, Inner Calm",gives you advice to declutter your space and ...,"happiness, productivity, psychology, health, w...","[happiness, productivity, psychology, health, ..."
1,The Book,is a spiritual exploration of true human natur...,mindfulness,[mindfulness]
2,#GIRLBOSS,shows that even an unconventional life can le...,"motivation, business, creativity, work","[motivation, business, creativity, work]"
3,10 Days To Faster Reading,helps you bring your reading skills to the cu...,"productivity, psychology, education","[productivity, psychology, education]"
4,10% Happier,"gives skeptics an easy “in” to meditation, by...","happiness, psychology, mindfulness","[happiness, psychology, mindfulness]"
...,...,...,...,...
1226,Your Move: The Underdog’s Guide to Building Yo...,is Ramit Sethi’s no-BS guide to starting your...,"money, marketing, business, work","[money, marketing, business, work]"
1227,You’ll See It When You Believe It,"shows you how to discover your true, best sel...","productivity, psychology, motivation, work","[productivity, psychology, motivation, work]"
1228,You’re Not Listening,is a book that will improve your communicatio...,"relationships, happiness, productivity, manage...","[relationships, happiness, productivity, manag..."
1229,Zero To One,is an inside look at Peter Thiel’s philosophy...,"productivity, motivation, marketing, managemen...","[productivity, motivation, marketing, manageme..."


In [80]:
def load_data(file_path):
    """
    Load the dataset from a CSV file.
    Args:
        file_path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: Loaded dataset.
    """
    try:
        data = pd.read_csv(file_path)
        if not all(col in data.columns for col in ['book_name', 'summaries', 'categories']):
            raise ValueError("Dataset must contain 'book_name', 'summaries', and 'categories' columns.")
        return data
    except Exception as e:
        raise FileNotFoundError(f"Error loading file: {e}")

def preprocess_data(data):
    """
    Preprocess the dataset by:
    - Dropping rows with missing values in critical columns.
    - Removing duplicates based on 'book_name' and 'categories'.
    - Grouping categories for each book.
    - Adding a combined text column with summaries and categories.

    Args:
        data (pd.DataFrame): Raw dataset.

    Returns:
        pd.DataFrame: Preprocessed dataset.
    """
    # Remove duplicates where both 'book_name' and 'categories' are identical
    data = data.drop_duplicates(subset=['book_name'], keep='first').reset_index(drop=True)
    # Drop rows with missing values in 'book_name' and 'summaries'
    missing_or_empty_rows = data[
    (data['book_name'].isna() | (data['book_name'].str.strip() == "") | (data['book_name'].str.strip() == ".")) |
    (data['summaries'].isna() | (data['summaries'].str.strip() == "") | (data['summaries'].str.strip() == "."))]

    # Drop these rows
    data = data.drop(missing_or_empty_rows.index)

    # Reset the index after dropping rows (optional)
    data.reset_index(drop=True, inplace=True)


    # Group categories for each book
    data = data.groupby('book_name', as_index=False).agg({
        'summaries': 'first',  # Retain the first summary
        'categories': ', '.join  # Combine categories into a single string
    })

    # Split categories into a list
    data['categories_list'] = data['categories'].str.split(', ')

    return data
def generate_embeddings(data, model_name='all-MiniLM-L6-v2'):
    """
    Generate embeddings for book summaries using a pre-trained SBERT model.

    Args:
        data (pd.DataFrame): Preprocessed dataset with 'summaries' column.
        model_name (str): Name of the pre-trained SBERT model.

    Returns:
        np.ndarray: Array of embeddings.
    """
    model = SentenceTransformer(model_name)
    data['embeddings'] = data['summaries'].apply(lambda x: model.encode(x, convert_to_tensor=True))
    return model, data
def generate_pairs(data, num_samples=1000):
    pairs = []
    sampled_combinations = combinations(data.iterrows(), 2)
    for (idx1, row1), (idx2, row2) in sampled_combinations:
        # Compute Jaccard similarity
        common_categories = len(set(row1['categories_list']) & set(row2['categories_list']))
        total_categories = len(set(row1['categories_list']) | set(row2['categories_list']))
        jaccard_similarity = common_categories / total_categories


        # Reuse precomputed embeddings
        semantic_similarity = util.pytorch_cos_sim(row1['embeddings'], row2['embeddings']).item()

        # Final similarity (weighted average)
        combined_similarity = 0.9 * semantic_similarity + 0.1 * jaccard_similarity

        # Append the pair with book names
        pairs.append({
            "book1": row1['book_name'],  # Book name for text1
            "book2": row2['book_name'],  # Book name for text2
            "text1": row1['summaries'] ,
            "text2": row2['summaries'] ,
            "similarity": combined_similarity
        })

    return pd.DataFrame(pairs)
# Define bins for similarity scores
def stratify_data(pairs_df, high_threshold=0.5, low_threshold=0.3, samples_per_bin=5000):
    """
    Stratifies pairs_df into bins based on similarity scores and samples equally from each bin.

    Args:
        pairs_df (pd.DataFrame): DataFrame containing the similarity scores.
        high_threshold (float): Threshold for high similarity.
        low_threshold (float): Threshold for low similarity.
        samples_per_bin (int): Number of samples to draw from each bin.

    Returns:
        pd.DataFrame: Stratified and sampled DataFrame.
    """
    # Define bins
    high_similarity = pairs_df[pairs_df['similarity'] >= high_threshold]
    moderate_similarity = pairs_df[(pairs_df['similarity'] < high_threshold) & (pairs_df['similarity'] >= low_threshold)]
    low_similarity = pairs_df[pairs_df['similarity'] < low_threshold]

    # Sample equally from each bin
    high_sample = high_similarity.sample(min(len(high_similarity), samples_per_bin), random_state=42)
    moderate_sample = moderate_similarity.sample(min(len(moderate_similarity), samples_per_bin), random_state=42)
    low_sample = low_similarity.sample(min(len(low_similarity), samples_per_bin), random_state=42)

    # Combine samples and shuffle
    stratified_data = pd.concat([high_sample, moderate_sample, low_sample])
    return shuffle(stratified_data, random_state=42)



# Pre-processing

In [81]:
data = load_data('books_summary.csv')
data = preprocess_data(data)
data.to_csv('preprocessed_books_data.csv', index=False)

#Training data prepration

In [85]:
# Create emnedding from pre-trained model for fine-tuning
model,data = generate_embeddings(data,model_name='all-MiniLM-L6-v2')
# Generate pairs
pairs_df = generate_pairs(data)

# Apply stratification
stratified_pairs_df = stratify_data(pairs_df, samples_per_bin=5000)

# Convert stratified pairs to InputExamples
train_examples = [
    InputExample(texts=[row['text1'], row['text2']], label=float(row['similarity']))
    for _, row in stratified_pairs_df.iterrows()
]

In [86]:
stratified_pairs_df.sort_values(by = 'similarity', ascending = False)

Unnamed: 0,book1,book2,text1,text2,similarity
489344,Love Warrior,Untamed,"delves into the life of Glennon Doyle, a woma...","is an inspiring memoir of Glennon Doyle, a wo...",0.850252
572135,Radical Acceptance,The Miracle of Mindfulness,teaches how you can become more content and h...,teaches the ancient Buddhist practice of mind...,0.833465
206647,Chasing Excellence,Mind Gym,breaks down how world-class athletes achieve ...,explains why the performance of world-class a...,0.814327
409265,How To Be An Antiracist,So You Want To Talk About Race,"will make you a better, kinder, and more fair...","will help you make the world a better, fairer...",0.810699
169198,Brainfluence,Brandwashed,will help you get more sales by revealing peo...,will help you make better buying decisions by...,0.801191
...,...,...,...,...,...
476518,Letters From A Stoic,Team Of Teams,is a collection of moral epistles famous Roma...,reveals the incredible power that small teams...,-0.059970
611974,Spy the Lie,The Selfish Gene,is a collection of professional tips on how t...,explains the process of evolution in biology ...,-0.063253
660296,The Automatic Millionaire,The Sunflower,"is an actionable, step-by-step plan for build...",recounts an experience of holocaust survivor ...,-0.066321
337062,First They Killed My Father,Growth Hacker Marketing,is Loung Ung’s account of the horrific events...,explains the 4-step framework today’s startup...,-0.076454


In [88]:
# Check stratification
print("High Similarity Samples:", len(stratified_pairs_df[stratified_pairs_df['similarity'] >= 0.5]))
print("Moderate Similarity Samples:", len(stratified_pairs_df[(stratified_pairs_df['similarity'] < 0.5) & (stratified_pairs_df['similarity'] >= 0.3)]))
print("Low Similarity Samples:", len(stratified_pairs_df[stratified_pairs_df['similarity'] < 0.3]))


High Similarity Samples: 5000
Moderate Similarity Samples: 5000
Low Similarity Samples: 5000


# Fine-tuning model

In [90]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Create DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
# Define the loss function
#model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=int(0.01 * len(train_dataloader)),  # 10% warmup
    output_path=f'fine_tuned_sbert'
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


Step,Training Loss
500,0.0021


High Similarity Samples: 5000
Moderate Similarity Samples: 5000
Low Similarity Samples: 5000


# Fine-tuned model embeddings

In [20]:
# Obtain embedding from fine-tuned model on preprocess data and

def build_faiss_index(embeddings, index_path="faiss_index.bin"):
    """
    Build a Faiss index for fast nearest-neighbor searches and save it to a file.

    Args:
        embeddings (np.ndarray): Array of book embeddings.
        index_path (str): Path to save the Faiss index.

    Returns:
        faiss.Index: The built Faiss index.
    """
    # Convert embeddings to float32 (required by Faiss)
    embeddings = embeddings.astype('float32')

    # Create a Faiss index
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance (Euclidean)
    index.add(embeddings)  # Add embeddings to the index

    # Save the index
    faiss.write_index(index, index_path)
    return index

data['book_name'] = data['book_name'].str.lower()
model = SentenceTransformer('fine_tuned_sbert')
embeddings = model.encode(data['summaries'], batch_size=16, show_progress_bar=True)

# Build and save the Faiss index
faiss_index = build_faiss_index(embeddings, "faiss_index.bin")
# Save the new embeddings
np.save('book_embeddings.npy', embeddings)


Batches:   0%|          | 0/77 [00:00<?, ?it/s]

# Recommendations

In [29]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
from rapidfuzz import fuzz, process

### Step 1: Load Processed Data ###
data = pd.read_csv('preprocessed_books_data.csv')  # Load preprocessed data
data['categories_list'] = data['categories_list'].apply(eval).apply(set)  # Convert categories to sets

### Step 2: Embedding Loading ###
def load_embeddings(embedding_path):
    """
    Load embeddings from a file.
    """
    try:
        embeddings = np.load(embedding_path).astype('float32')  # Ensure float32 for Faiss compatibility
        return embeddings
    except Exception as e:
        raise FileNotFoundError(f"Error loading embeddings: {e}")

embeddings = load_embeddings('book_embeddings.npy')

### Step 3: Build or Load Faiss Index ###
def build_faiss_index(embeddings, index_path="faiss_index.bin"):
    """
    Build a Faiss index for fast nearest-neighbor searches and save it to a file.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
    index.add(embeddings)  # Add embeddings to the index
    faiss.write_index(index, index_path)  # Save the index
    return index

def load_faiss_index(index_path="faiss_index.bin"):
    """
    Load a prebuilt Faiss index from a file.
    """
    return faiss.read_index(index_path)

try:
    faiss_index = load_faiss_index("faiss_index.bin")
except FileNotFoundError:
    faiss_index = build_faiss_index(embeddings, "faiss_index.bin")

### Step 4: Recommendation Function ###
def recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5, min_similarity=60):
    """
    Recommend books similar to the input book using Faiss for nearest-neighbor search.
    """
    # Normalize book titles to lowercase
    book_title = book_title.lower()

    # Fuzzy matching for book title
    if book_title not in data['book_name'].values:
        closest_match = process.extractOne(
            book_title,
            data['book_name'].values,
            scorer=fuzz.token_sort_ratio
        )
        if closest_match is None or closest_match[1] < min_similarity:
            return [f"No close match found for '{book_title}'. Please try another title."], book_title

        book_title = closest_match[0]
        print(f"Giving results for: {book_title}")

    # Find the index of the input book
    input_idx = data[data['book_name'] == book_title].index[0]
    input_embedding = embeddings[input_idx].reshape(1, -1)  # Reshape for Faiss compatibility

    # Use Faiss to find the nearest neighbors
    distances, indices = faiss_index.search(input_embedding, top_n + 1)  # +1 to exclude itself
    indices = indices.flatten()
    distances = distances.flatten()

    # Exclude the input book itself
    indices = indices[1:]
    distances = distances[1:]

    # Convert distances to cosine similarity
    cosine_similarities = 1 - (distances / 2)

    # Filter by categories
    input_categories = data.loc[input_idx, 'categories_list']
    filtered_books = []
    for idx, sim in zip(indices, cosine_similarities):
        if len(input_categories & data.loc[idx, 'categories_list']) > 0:  # Category overlap
            filtered_books.append((data.loc[idx, 'book_name'], sim))
        if len(filtered_books) >= top_n:
            break

    # Fallback: Add recommendations without category filtering
    if len(filtered_books) < top_n:
        remaining_indices = [idx for idx in indices if idx not in [rec[0] for rec in filtered_books]]
        for idx, sim in zip(remaining_indices, cosine_similarities[len(filtered_books):]):
            filtered_books.append((data.loc[idx, 'book_name'], sim))
            if len(filtered_books) >= top_n:
                break

    return filtered_books[:top_n], book_title

### Step 5: Recommendation UI ###
def recommend_ui(book_title):
    recommendations, book_name = recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5)

    if len(recommendations) < 2:
        return "Book not found in the dataset. Please try another title."

    output_message = f"Giving results for: {book_name}\n\n" + "\n".join([f"{rec[0]} (Similarity: {rec[1]:.4f})" for rec in recommendations])
    return output_message



In [30]:
recommend_ui(book_title='1984')

'Giving results for: 1984\n\nBrave New World (Similarity: 0.5129)\nAntifragile (Similarity: 0.4777)\nSiddhartha (Similarity: 0.4376)\nRich Dad’s Cashflow Quadrant (Similarity: 0.4268)\nBrave New World (Similarity: 0.4268)'

In [26]:
data[data['book_name'] == 'Brave New World']


Unnamed: 0,book_name,summaries,categories,categories_list,combined_text
151,Brave New World,presents a futuristic society engineered perf...,"science, politics, economics, relationships, h...","{politics, fiction, religion, relationships, s...",presents a futuristic society engineered perf...


In [28]:
data.shape

(1230, 5)