In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!pip install rapidfuzz gradio sentence-transformers datasets

Collecting rapidfuzz
  Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting gradio
  Downloading gradio-5.13.1-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.1

In [7]:
import pandas as pd
from rapidfuzz import process
import numpy as np
import gradio as gr

In [8]:
# Increase display width for columns
pd.set_option('display.max_colwidth', None)

In [9]:
%cd '/content/drive/MyDrive/github/book_recommendation/book_recommender'

/content/drive/MyDrive/github/book_recommendation/book_recommender


In [10]:
# Load the raw book summaries from the current working directory.
# (Plain string literal: the original f-string had no placeholders.)
data = pd.read_csv('books_summary.csv')

In [11]:
# Drop rows that lack a title or a summary; both are required downstream.
data = data.dropna(subset=['book_name', 'summaries'])
# Remove duplicates where both book_name and categories are identical
data = data.drop_duplicates(subset=['book_name', 'categories'], keep='first')
# Group categories for each book. Missing (NaN) categories are skipped so the
# join does not raise TypeError on a float; a book without any category ends
# up with an empty string.
data = data.groupby('book_name', as_index=False).agg({
    'summaries': 'first',  # Retain the first summary
    'categories': lambda values: ', '.join(
        str(value) for value in values if pd.notna(value)
    ),  # Combine categories into a single string
})
# An empty joined string means "no categories", not a single empty category.
data['categories_list'] = data['categories'].apply(
    lambda joined: joined.split(', ') if joined else []
)
# Combine summaries with the space-separated categories for each row
# (vectorised; equivalent to " ".join(categories_list) per row).
data['combined_text'] = (
    data['summaries'] + " " + data['categories'].str.replace(', ', ' ', regex=False)
)

In [12]:
from datasets import Dataset
from sentence_transformers import SentenceTransformer

# Fine-tuning the Sentence-BERT (SBERT) model on book summaries

In [None]:
# Load the pre-trained SBERT checkpoint here: no earlier cell defines `model`,
# so without this line the cell fails with NameError on a fresh kernel.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every summary once and keep the tensor next to its book so the
# pair-generation step can reuse it.
data['embeddings'] = data['summaries'].apply(lambda x: model.encode(x, convert_to_tensor=True))

In [None]:
from itertools import combinations
from sentence_transformers import SentenceTransformer, util

# Load pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_pairs(data, num_samples=1000):
    """
    Build a training table with one row per unordered pair of books.

    The target score of a pair is a weighted blend of the cosine similarity of
    the two precomputed summary embeddings (weight 0.9) and the Jaccard overlap
    of the two category lists (weight 0.1).

    Args:
        data (pd.DataFrame): Must provide the columns 'book_name', 'summaries',
            'categories_list' and 'embeddings'.
        num_samples (int): Accepted for backward compatibility but currently
            ignored; every pair is generated (the early-exit cap is disabled).

    Returns:
        pd.DataFrame: Columns 'book1', 'book2', 'text1', 'text2', 'similarity'.
        The columns are present even when fewer than two books are supplied.
    """
    columns = ["book1", "book2", "text1", "text2", "similarity"]

    # Materialise each book once so its category set is built a single time
    # instead of once for every pair it takes part in.
    books = [
        (row['book_name'], row['summaries'], set(row['categories_list']), row['embeddings'])
        for _, row in data.iterrows()
    ]

    pairs = []
    for (name1, text1, cats1, emb1), (name2, text2, cats2, emb2) in combinations(books, 2):
        # Jaccard similarity; two empty category sets count as "no overlap"
        # rather than dividing by zero.
        union_size = len(cats1 | cats2)
        jaccard_similarity = len(cats1 & cats2) / union_size if union_size else 0.0

        # Reuse precomputed embeddings
        semantic_similarity = util.pytorch_cos_sim(emb1, emb2).item()

        # Final similarity (weighted average)
        combined_similarity = 0.9 * semantic_similarity + 0.1 * jaccard_similarity

        pairs.append({
            "book1": name1,
            "book2": name2,
            "text1": text1,
            "text2": text2,
            "similarity": combined_similarity
        })

    # Explicit columns keep the schema stable when there are no pairs at all.
    return pd.DataFrame(pairs, columns=columns)

# Generate pairs
pairs_df = generate_pairs(data)

# Display the first few rows
#print(pairs_df.head())


In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader

# Define bins for similarity scores
def stratify_data(pairs_df, high_threshold=0.5, low_threshold=0.3, samples_per_bin=5000):
    """
    Balance the training pairs across three similarity bands.

    Rows are split by their 'similarity' value into high (>= high_threshold),
    moderate (>= low_threshold and < high_threshold) and low (< low_threshold).
    Up to `samples_per_bin` rows are drawn from each band with a fixed seed,
    and the union is shuffled with the same seed.

    Args:
        pairs_df (pd.DataFrame): Pairs table with a 'similarity' column.
        high_threshold (float): Lower bound of the high band.
        low_threshold (float): Lower bound of the moderate band.
        samples_per_bin (int): Maximum number of rows taken from each band.

    Returns:
        pd.DataFrame: The sampled rows in shuffled order.
    """
    scores = pairs_df['similarity']

    # Band membership masks, kept in high / moderate / low order.
    is_high = scores >= high_threshold
    is_moderate = (scores < high_threshold) & (scores >= low_threshold)
    is_low = scores < low_threshold

    drawn = []
    for band_mask in (is_high, is_moderate, is_low):
        band = pairs_df[band_mask]
        # A band smaller than the quota is taken in full.
        quota = min(len(band), samples_per_bin)
        drawn.append(band.sample(quota, random_state=42))

    return shuffle(pd.concat(drawn), random_state=42)

# Balance the pairs across the three similarity bands
stratified_pairs_df = stratify_data(pairs_df, samples_per_bin=5000)

# Wrap every pair as an InputExample: two summaries plus the target score
train_examples = [
    InputExample(texts=[text_a, text_b], label=float(score))
    for text_a, text_b, score in zip(
        stratified_pairs_df['text1'],
        stratified_pairs_df['text2'],
        stratified_pairs_df['similarity'],
    )
]

# Batches of 128, reshuffled by the loader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)

# Report how many sampled pairs landed in each band
_band_scores = stratified_pairs_df['similarity']
print("High Similarity Samples:", int((_band_scores >= 0.5).sum()))
print("Moderate Similarity Samples:", int(((_band_scores < 0.5) & (_band_scores >= 0.3)).sum()))
print("Low Similarity Samples:", int((_band_scores < 0.3).sum()))


In [None]:
# Re-instantiate the 'all-MiniLM-L6-v2' checkpoint and rebind `model` to it,
# so the training cell below works on this fresh instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
import os
# Sets the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably this turns off Weights & Biases logging during
# training — confirm against the trainer in use.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from sentence_transformers import losses

# Define the loss function: regress the cosine similarity of the two encoded
# texts onto the pair's target score.
train_loss = losses.CosineSimilarityLoss(model)

num_epochs = 10
# Warm the learning rate up over the first 10% of ALL optimisation steps.
# The previous expression, int(0.01 * len(train_dataloader)), was labelled
# "10% warmup" but actually yielded 1% of a single epoch.
warmup_steps = int(0.1 * len(train_dataloader) * num_epochs)

# Fine-tune the model and write the result to ./fine_tuned_sbert
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='fine_tuned_sbert'
)


# Recommendation function

In [13]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [14]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from rapidfuzz import process


### Step 1: Data Loading ###
def load_data(file_path):
    """
    Read the books CSV and verify that it has the expected columns.

    Args:
        file_path (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame: The loaded table.

    Raises:
        FileNotFoundError: If the file cannot be read (any read failure is
            reported this way, with the original error chained as the cause).
        ValueError: If 'book_name', 'summaries' or 'categories' is missing.
    """
    required_columns = ['book_name', 'summaries', 'categories']

    # Only the read itself is wrapped; previously the schema ValueError below
    # was raised inside the try block and re-labelled as FileNotFoundError.
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error loading file: {e}") from e

    if any(col not in data.columns for col in required_columns):
        raise ValueError("Dataset must contain 'book_name', 'summaries', and 'categories' columns.")
    return data


### Step 2: Preprocessing ###
def preprocess_data(data):
    """
    Collapse the raw table to one row per book.

    Steps: drop rows without a title or summary, drop exact
    (book_name, categories) duplicates, group by title keeping the first
    summary and joining the categories with ', ', then derive
    'categories_list' and 'combined_text'.

    Args:
        data (pd.DataFrame): Raw table with 'book_name', 'summaries' and
            'categories' columns.

    Returns:
        pd.DataFrame: One row per book with the columns 'book_name',
        'summaries', 'categories', 'categories_list' and 'combined_text'.
    """
    def _join_categories(values):
        # Skip missing categories: ', '.join would raise TypeError on NaN.
        return ', '.join(str(value) for value in values if pd.notna(value))

    data = data.dropna(subset=['book_name', 'summaries'])
    data = data.drop_duplicates(subset=['book_name', 'categories'], keep='first')
    data = data.groupby('book_name', as_index=False).agg({
        'summaries': 'first',
        'categories': _join_categories,
    })
    # An empty joined string means "no categories", not one empty category.
    data['categories_list'] = data['categories'].apply(
        lambda joined: joined.split(', ') if joined else []
    )
    # Vectorised equivalent of summaries + " " + " ".join(categories_list).
    data['combined_text'] = (
        data['summaries'] + " " + data['categories'].str.replace(', ', ' ', regex=False)
    )
    return data


### Step 3: Embedding Loading ###
def load_embeddings(embedding_path):
    """
    Read a saved NumPy array of embeddings from disk.

    Args:
        embedding_path (str): Path of the .npy file.

    Returns:
        np.ndarray: The stored array.

    Raises:
        FileNotFoundError: On any failure while loading; the message carries
            the text of the underlying error.
    """
    try:
        stored = np.load(embedding_path)
    except Exception as load_error:
        raise FileNotFoundError(f"Error loading embeddings: {load_error}")
    else:
        return stored


In [15]:
# Load and preprocess the books, lower-casing titles for case-insensitive lookup
data = preprocess_data(load_data('books_summary.csv'))
data['book_name'] = data['book_name'].str.lower()
# Encode every summary with the fine-tuned model saved by the training step
model = SentenceTransformer('fine_tuned_sbert')
# Pass a plain list of strings: handing over the Series only works while its
# index happens to be 0..n-1, because the texts are then looked up by label.
embeddings = model.encode(data['summaries'].tolist(), batch_size=16, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/77 [00:00<?, ?it/s]

In [16]:
import faiss

def build_faiss_index(embeddings, index_path="faiss_index.bin"):
    """
    Create a flat L2 Faiss index over the book embeddings and write it to disk.

    Args:
        embeddings (np.ndarray): Book embeddings, one row per book.
        index_path (str): Destination file for the serialized index.

    Returns:
        faiss.Index: The populated index.
    """
    # Faiss works on float32 vectors
    vectors = embeddings.astype('float32')
    dimension = vectors.shape[1]

    # Exact (brute-force) Euclidean index holding every vector
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)

    # Persist it so inference can load the index instead of rebuilding it
    faiss.write_index(flat_index, index_path)
    return flat_index

# Build and save the Faiss index from the embeddings computed in the previous
# cell. The earlier np.load('book_embeddings.npy') is dropped on purpose: that
# file is only written by a later cell, so on a fresh run it either does not
# exist or is stale, and loading it discarded the freshly encoded vectors.
faiss_index = build_faiss_index(embeddings, "faiss_index.bin")


In [None]:
# Save data and embeddings
# Both files are written to the current working directory; the inference cell
# reads 'preprocessed_books_data.csv' and 'book_embeddings.npy' back from there
# and re-parses the 'categories_list' column after loading the CSV.
data.to_csv('preprocessed_books_data.csv', index=False)  # Save as CSV
np.save('book_embeddings.npy', embeddings)

In [90]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def precompute_similarity_matrix(embeddings, output_path="similarity_matrix.npy"):
    """
    Compute the all-pairs cosine similarity of the embeddings and store it.

    Args:
        embeddings (np.ndarray): Embeddings of all books, one row per book.
        output_path (str): File the resulting matrix is written to.

    Returns:
        np.ndarray: The pairwise similarity matrix that was saved.
    """
    pairwise = cosine_similarity(embeddings)
    # Persist so later sessions can load the matrix instead of recomputing it
    np.save(output_path, pairwise)
    return pairwise


In [91]:
# Precompute and save similarity matrix
similarity_matrix = precompute_similarity_matrix(embeddings, "similarity_matrix.npy")


# Inference

In [17]:
def load_faiss_index(index_path="faiss_index.bin"):
    """
    Read a previously saved Faiss index back from disk.

    Args:
        index_path (str): Location of the serialized index file.

    Returns:
        faiss.Index: The deserialized index.
    """
    loaded_index = faiss.read_index(index_path)
    return loaded_index

# Load the Faiss index
faiss_index = load_faiss_index("faiss_index.bin")


In [18]:
def recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5, min_similarity=60):
    """
    Recommend the books whose embeddings are nearest to the given title.

    The title is lower-cased and matched exactly against data['book_name'];
    if that fails, the closest fuzzy match is used provided its score reaches
    `min_similarity`.

    Args:
        book_title (str): Title typed by the user.
        data (pd.DataFrame): Books table with a 'book_name' column whose row
            order matches `embeddings` and the vectors in `faiss_index`.
        faiss_index: Object exposing search(queries, k) -> (distances, indices).
        embeddings (np.ndarray): One embedding per row of `data`.
        top_n (int): Maximum number of recommendations.
        min_similarity (int): Minimum fuzzy-match score to accept a title.

    Returns:
        tuple: (recommendations, resolved_title). `recommendations` is a list
        of (book_name, score) tuples, where score is 1 - distance / largest
        returned distance. When no title matches, it is a list holding a
        single message string instead.
    """
    # Normalize book titles to lowercase
    book_title = book_title.lower()

    # Fuzzy matching for book title
    if book_title not in data['book_name'].values:
        # Imported here because the import cells that run before this one only
        # bring in `process`; without `fuzz` this branch raised NameError.
        from rapidfuzz import fuzz, process

        closest_match = process.extractOne(
            book_title,
            data['book_name'].values,
            scorer=fuzz.token_sort_ratio
        )
        if closest_match is None or closest_match[1] < min_similarity:
            return [f"No close match found for '{book_title}'. Please try another title."], book_title

        book_title = closest_match[0]
        print(f"Giving results for: {book_title}")

    # Positional index of the input book (embeddings and index are positional)
    input_pos = int(np.flatnonzero((data['book_name'] == book_title).to_numpy())[0])
    input_embedding = np.asarray(embeddings[input_pos], dtype='float32')  # Ensure float32 for Faiss

    # Ask for one extra neighbour because the query book is itself indexed
    distances, indices = faiss_index.search(np.array([input_embedding]), top_n + 1)

    # Drop the query book by index (it is not guaranteed to be the first hit
    # when another book has an identical embedding) and drop the -1 labels
    # Faiss uses as padding when fewer than k vectors are available.
    neighbours = [
        (int(idx), float(dist))
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx >= 0 and idx != input_pos
    ][:top_n]

    if not neighbours:
        return [], book_title

    # Scale distances into a 0..1 score; guard against all-zero distances
    max_distance = max(dist for _, dist in neighbours)
    recommended_books = [
        (
            data.iloc[idx]['book_name'],
            1 - (dist / max_distance) if max_distance > 0 else 1.0,
        )
        for idx, dist in neighbours
    ]

    return recommended_books, book_title


In [19]:
def recommend_ui(book_title):
    """
    Format Faiss-based recommendations for display.

    Reads the notebook-level `data`, `faiss_index` and `embeddings`.

    Args:
        book_title (str): Title typed by the user.

    Returns:
        str: The resolved title followed by one recommended title per line,
        or a "not found" message when the lookup produced no recommendations.
    """
    print('The book you entered is:', book_title)  # Console log for debugging
    recommendations, book_name = recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5)

    # A failed lookup comes back as a list holding one message string. Test for
    # that shape (or an empty list) instead of `len < 2`, which also rejected a
    # genuine single recommendation.
    if not recommendations or isinstance(recommendations[0], str):
        return "Book not found in the dataset. Please try another title."

    output_message = f"Giving results for: {book_name}\n\nRecommended Books:\n"
    recommendations_list = "\n".join(f"{rec[0]}" for rec in recommendations)
    return output_message + recommendations_list


In [20]:
recommend_ui(book_title='1984')

The book you entered is: 1984


'Giving results for: 1984\n\nRecommended Books:\nbrave new world\nantifragile\nthe sovereign individual\nthe education of a value investor\ncommon sense'

In [None]:
Giving results for: 1984\n\nRecommended Books:\nbrave new world\nantifragile\nthe sovereign individual\ncommon sense\nreal help

In [102]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import gradio as gr
from rapidfuzz import fuzz, process
from joblib import Parallel, delayed

### Step 1: Load processed data ###
import ast

data = pd.read_csv('preprocessed_books_data.csv')  # Load from CSV

# The CSV stores each category list as its Python literal text. Parse it with
# ast.literal_eval, which accepts literals only, rather than eval, which would
# execute arbitrary code found in the file. Convert to sets for faster filtering.
data['categories_list'] = data['categories_list'].apply(ast.literal_eval).apply(set)

### Step 2: Embedding Loading ###
def load_embeddings(embedding_path):
    """
    Read a saved NumPy array of embeddings from disk.

    Args:
        embedding_path (str): Path of the .npy file.

    Returns:
        np.ndarray: The stored array.

    Raises:
        FileNotFoundError: On any failure while loading; the message carries
            the text of the underlying error.
    """
    try:
        stored = np.load(embedding_path)
    except Exception as load_error:
        raise FileNotFoundError(f"Error loading embeddings: {load_error}")
    else:
        return stored

# Step 3: Load precomputed similarity matrix ###
def load_similarity_matrix(similarity_path, embeddings, reduce_dim=True):
    """
    Load the saved similarity matrix, computing and saving it if it is missing.

    Args:
        similarity_path (str): Path of the .npy similarity matrix.
        embeddings (np.ndarray): Book embeddings, used only when the matrix
            file does not exist.
        reduce_dim (bool): When recomputing, first project the embeddings with
            PCA (up to 128 components) and save them to
            'reduced_embeddings.npy'. Note the resulting similarities are then
            based on the reduced vectors, not the raw embeddings.

    Returns:
        np.ndarray: Pairwise cosine similarity matrix.
    """
    try:
        return np.load(similarity_path)
    except FileNotFoundError:
        # If similarity matrix is missing, compute it
        if reduce_dim:
            print("Reducing embedding dimensions using PCA...")
            # PCA cannot extract more components than min(n_samples,
            # n_features); cap the target so small inputs do not raise.
            n_components = min(128, *embeddings.shape)
            pca = PCA(n_components=n_components)
            embeddings = pca.fit_transform(embeddings)
            np.save("reduced_embeddings.npy", embeddings)

        print("Computing similarity matrix...")
        similarity_matrix = cosine_similarity(embeddings)
        np.save(similarity_path, similarity_matrix)
        return similarity_matrix

embeddings = load_embeddings('book_embeddings.npy')
similarity_matrix = load_similarity_matrix("similarity_matrix.npy", embeddings)

### Step 4: Recommendation Cache ###
# Maps (normalized title, top_n, min_similarity) -> (recommendations, resolved title)
recommendation_cache = {}

def recommend_books_with_category_filter(book_title, data, similarity_matrix, top_n=5, min_similarity=60):
    """
    Recommend the most similar books that share a category with the input book.

    Args:
        book_title (str): Title typed by the user; matched case-insensitively,
            with a fuzzy fallback when there is no exact match.
        data (pd.DataFrame): Books table with 'book_name' and a set-valued
            'categories_list' column, in the same row order as
            `similarity_matrix`.
        similarity_matrix (np.ndarray): Precomputed pairwise similarities.
            It is only read, never modified.
        top_n (int): Maximum number of recommendations.
        min_similarity (int): Minimum fuzzy-match score to accept a title
            (all-digit titles always use 50).

    Returns:
        tuple: (recommendations, resolved_title). `recommendations` is a list
        of [book_name, similarity] pairs, possibly empty. When no title
        matches, it is a list holding a single message string instead.
    """
    # Normalize before touching the cache so differently-cased queries share
    # an entry, and key on top_n / min_similarity so a cached answer is never
    # returned for a request with different settings.
    book_title = book_title.lower()
    query_key = (book_title, top_n, min_similarity)
    if query_key in recommendation_cache:
        return recommendation_cache[query_key]

    # Adjust the similarity threshold for numeric titles
    effective_min_similarity = 50 if book_title.isdigit() else min_similarity

    book_names = data['book_name']

    # Fuzzy matching
    if book_title not in book_names.values:
        # Narrow down candidates with substring filtering
        candidates = [name for name in book_names if book_title in name]
        if not candidates:
            candidates = book_names.values

        closest_match = process.extractOne(
            book_title,
            candidates,
            scorer=fuzz.token_sort_ratio
        )

        if closest_match is None or closest_match[1] < effective_min_similarity:
            return [f"No close match found for '{book_title}'. Please try another title."], book_title

        book_title = closest_match[0]
        print(f"Giving results for: {book_title}")

    # Positional index of the input book (the matrix is positional)
    input_pos = int(np.flatnonzero((book_names == book_title).to_numpy())[0])

    # Keep only books sharing at least one category with the input book, and
    # exclude the input book itself so it can never be recommended.
    input_categories = data['categories_list'].iloc[input_pos]
    category_filter = np.fromiter(
        (not input_categories.isdisjoint(categories) for categories in data['categories_list']),
        dtype=bool,
        count=len(data),
    )
    category_filter[input_pos] = False

    data_filtered = data.loc[category_filter].copy()
    # Boolean indexing copies the scores, so the shared matrix is left intact
    # (the old code wrote -1 into its diagonal through a row view).
    data_filtered['similarity'] = similarity_matrix[input_pos][category_filter]

    # Get top recommendations
    recommended_books = data_filtered.sort_values(by='similarity', ascending=False).head(top_n)
    recommendations = recommended_books[['book_name', 'similarity']].values.tolist()

    # Cache under both the typed query and the resolved title
    result = (recommendations, book_title)
    recommendation_cache[query_key] = result
    recommendation_cache[(book_title, top_n, min_similarity)] = result
    return result

### Step 5: Recommendation UI ###
def recommend_ui(book_title):
    """
    Format category-filtered recommendations for display.

    Reads the notebook-level `data` and `similarity_matrix`.

    Args:
        book_title (str): Title typed by the user.

    Returns:
        str: The resolved title followed by one recommended title per line,
        or an explanatory message when nothing can be recommended.
    """
    recommendations, book_name = recommend_books_with_category_filter(book_title, data, similarity_matrix, top_n=5)

    # A failed title lookup comes back as a list holding one message string;
    # test for that shape rather than for the length of the list.
    if recommendations and isinstance(recommendations[0], str):
        return "Book not found in the dataset. Please try another title."
    if not recommendations:
        return f"No recommendations found for: {book_name}"

    output_message = f"Giving results for: {book_name}\n\nRecommended Books:\n"
    recommendations_list = "\n".join(f"{rec[0]}" for rec in recommendations)
    return output_message + recommendations_list



In [104]:
recommend_ui(book_title='1984')

'Giving results for: 1984\n\nRecommended Books:\nbrave new world\nantifragile\nthe sovereign individual\ncommon sense\nreal help'