In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install rapidfuzz gradio sentence-transformers datasets faiss-cpu

In [None]:
import pandas as pd
from rapidfuzz import process
import numpy as np
import gradio as gr
from datasets import Dataset
import faiss
from itertools import combinations
from sentence_transformers import SentenceTransformer, util , InputExample
from sentence_transformers import losses
from sklearn.utils import shuffle
from torch.utils.data import DataLoader

In [None]:
# Increase display width for columns
pd.set_option('display.max_colwidth', None)

In [None]:
%cd '/content/drive/MyDrive/github/book_recommendation/book_recommender'

/content/drive/MyDrive/github/book_recommendation/book_recommender


In [None]:
def load_data(file_path):
    """
    Load the dataset from a CSV file.
    Args:
        file_path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: Loaded dataset.
    """
    try:
        data = pd.read_csv(file_path)
        if not all(col in data.columns for col in ['book_name', 'summaries', 'categories']):
            raise ValueError("Dataset must contain 'book_name', 'summaries', and 'categories' columns.")
        return data
    except Exception as e:
        raise FileNotFoundError(f"Error loading file: {e}")

def preprocess_data(data):
    """
    Preprocess the dataset by:
    - Dropping rows with missing values in critical columns.
    - Removing duplicates based on 'book_name' and 'categories'.
    - Grouping categories for each book.
    - Adding a combined text column with summaries and categories.

    Args:
        data (pd.DataFrame): Raw dataset.

    Returns:
        pd.DataFrame: Preprocessed dataset.
    """
    # Drop rows with missing values in 'book_name' and 'summaries'
    data = data.dropna(subset=['book_name', 'summaries']).reset_index(drop=True)

    # Remove duplicates where both 'book_name' and 'categories' are identical
    data = data.drop_duplicates(subset=['book_name', 'categories'], keep='first').reset_index(drop=True)

    # Group categories for each book
    data = data.groupby('book_name', as_index=False).agg({
        'summaries': 'first',  # Retain the first summary
        'categories': ', '.join  # Combine categories into a single string
    })

    # Split categories into a list
    data['categories_list'] = data['categories'].str.split(', ')

    # Combine summaries with joined categories for each row
    data['combined_text'] = data.apply(
        lambda row: row['summaries'] + " " + " ".join(row['categories_list']),
        axis=1
    )
    return data
def generate_embeddings(data, model_name='all-MiniLM-L6-v2'):
    """
    Generate embeddings for book summaries using a pre-trained SBERT model.

    Args:
        data (pd.DataFrame): Preprocessed dataset with 'summaries' column.
        model_name (str): Name of the pre-trained SBERT model.

    Returns:
        np.ndarray: Array of embeddings.
    """
    model = SentenceTransformer(model_name)
    data['embeddings'] = data['summaries'].apply(lambda x: model.encode(x, convert_to_tensor=True))
    return model, data
def generate_pairs(data, num_samples=1000):
    pairs = []
    sampled_combinations = combinations(data.iterrows(), 2)
    for (idx1, row1), (idx2, row2) in sampled_combinations:
        # Compute Jaccard similarity
        common_categories = len(set(row1['categories_list']) & set(row2['categories_list']))
        total_categories = len(set(row1['categories_list']) | set(row2['categories_list']))
        jaccard_similarity = common_categories / total_categories


        # Reuse precomputed embeddings
        semantic_similarity = util.pytorch_cos_sim(row1['embeddings'], row2['embeddings']).item()

        # Final similarity (weighted average)
        combined_similarity = 0.9 * semantic_similarity + 0.1 * jaccard_similarity

        # Append the pair with book names
        pairs.append({
            "book1": row1['book_name'],  # Book name for text1
            "book2": row2['book_name'],  # Book name for text2
            "text1": row1['summaries'] ,
            "text2": row2['summaries'] ,
            "similarity": combined_similarity
        })

    return pd.DataFrame(pairs)
# Define bins for similarity scores
def stratify_data(pairs_df, high_threshold=0.5, low_threshold=0.3, samples_per_bin=5000):
    """
    Stratifies pairs_df into bins based on similarity scores and samples equally from each bin.

    Args:
        pairs_df (pd.DataFrame): DataFrame containing the similarity scores.
        high_threshold (float): Threshold for high similarity.
        low_threshold (float): Threshold for low similarity.
        samples_per_bin (int): Number of samples to draw from each bin.

    Returns:
        pd.DataFrame: Stratified and sampled DataFrame.
    """
    # Define bins
    high_similarity = pairs_df[pairs_df['similarity'] >= high_threshold]
    moderate_similarity = pairs_df[(pairs_df['similarity'] < high_threshold) & (pairs_df['similarity'] >= low_threshold)]
    low_similarity = pairs_df[pairs_df['similarity'] < low_threshold]

    # Sample equally from each bin
    high_sample = high_similarity.sample(min(len(high_similarity), samples_per_bin), random_state=42)
    moderate_sample = moderate_similarity.sample(min(len(moderate_similarity), samples_per_bin), random_state=42)
    low_sample = low_similarity.sample(min(len(low_similarity), samples_per_bin), random_state=42)

    # Combine samples and shuffle
    stratified_data = pd.concat([high_sample, moderate_sample, low_sample])
    return shuffle(stratified_data, random_state=42)

data = load_data('books_summary.csv')
data = preprocess_data(data)
# Create emnedding from pre-trained model for fine-tuning
model,data = generate_embeddings(data)
# Generate pairs
pairs_df = generate_pairs(data)

# Apply stratification
stratified_pairs_df = stratify_data(pairs_df, samples_per_bin=5000)

# Convert stratified pairs to InputExamples
train_examples = [
    InputExample(texts=[row['text1'], row['text2']], label=float(row['similarity']))
    for _, row in stratified_pairs_df.iterrows()
]

# Create DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
# Define the loss function
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=int(0.01 * len(train_dataloader)),  # 10% warmup
    output_path=f'fine_tuned_sbert'
)

# Fine-tuning the sentence Bert model on book summaries

In [None]:
# Obtain embedding from fine-tuned model on preprocess data and

def build_faiss_index(embeddings, index_path="faiss_index.bin"):
    """
    Build a Faiss index for fast nearest-neighbor searches and save it to a file.

    Args:
        embeddings (np.ndarray): Array of book embeddings.
        index_path (str): Path to save the Faiss index.

    Returns:
        faiss.Index: The built Faiss index.
    """
    # Convert embeddings to float32 (required by Faiss)
    embeddings = embeddings.astype('float32')

    # Create a Faiss index
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance (Euclidean)
    index.add(embeddings)  # Add embeddings to the index

    # Save the index
    faiss.write_index(index, index_path)
    return index

data['book_name'] = data['book_name'].str.lower()
model = SentenceTransformer('fine_tuned_sbert')
embeddings = model.encode(data['summaries'], batch_size=16, show_progress_bar=True)

# Build and save the Faiss index
faiss_index = build_faiss_index(embeddings, "faiss_index.bin")
# Save the new embeddings
np.save('book_embeddings.npy', embeddings)
# Save the pre-processed data
data.to_csv('preprocessed_books_data.csv', index=False)  #Save as CSV


# Recommendation function

# Recommendations

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
from rapidfuzz import fuzz, process

### Step 1: Load Processed Data ###
data = pd.read_csv('preprocessed_books_data.csv')  # Load preprocessed data
data['categories_list'] = data['categories_list'].apply(eval).apply(set)  # Convert categories to sets

### Step 2: Embedding Loading ###
def load_embeddings(embedding_path):
    """
    Load embeddings from a file.
    """
    try:
        embeddings = np.load(embedding_path).astype('float32')  # Ensure float32 for Faiss compatibility
        return embeddings
    except Exception as e:
        raise FileNotFoundError(f"Error loading embeddings: {e}")

embeddings = load_embeddings('book_embeddings.npy')

### Step 3: Build or Load Faiss Index ###
def build_faiss_index(embeddings, index_path="faiss_index.bin"):
    """
    Build a Faiss index for fast nearest-neighbor searches and save it to a file.
    """
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
    index.add(embeddings)  # Add embeddings to the index
    faiss.write_index(index, index_path)  # Save the index
    return index

def load_faiss_index(index_path="faiss_index.bin"):
    """
    Load a prebuilt Faiss index from a file.
    """
    return faiss.read_index(index_path)

try:
    faiss_index = load_faiss_index("faiss_index.bin")
except FileNotFoundError:
    faiss_index = build_faiss_index(embeddings, "faiss_index.bin")

### Step 4: Recommendation Function ###
def recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5, min_similarity=60):
    """
    Recommend books similar to the input book using Faiss for nearest-neighbor search.
    """
    # Normalize book titles to lowercase
    book_title = book_title.lower()

    # Fuzzy matching for book title
    if book_title not in data['book_name'].values:
        closest_match = process.extractOne(
            book_title,
            data['book_name'].values,
            scorer=fuzz.token_sort_ratio
        )
        if closest_match is None or closest_match[1] < min_similarity:
            return [f"No close match found for '{book_title}'. Please try another title."], book_title

        book_title = closest_match[0]
        print(f"Giving results for: {book_title}")

    # Find the index of the input book
    input_idx = data[data['book_name'] == book_title].index[0]
    input_embedding = embeddings[input_idx].reshape(1, -1)  # Reshape for Faiss compatibility

    # Use Faiss to find the nearest neighbors
    distances, indices = faiss_index.search(input_embedding, top_n + 1)  # +1 to exclude itself
    indices = indices.flatten()
    distances = distances.flatten()

    # Exclude the input book itself
    indices = indices[1:]
    distances = distances[1:]

    # Convert distances to cosine similarity
    cosine_similarities = 1 - (distances / 2)

    # Filter by categories
    input_categories = data.loc[input_idx, 'categories_list']
    filtered_books = []
    for idx, sim in zip(indices, cosine_similarities):
        if len(input_categories & data.loc[idx, 'categories_list']) > 0:  # Category overlap
            filtered_books.append((data.loc[idx, 'book_name'], sim))
        if len(filtered_books) >= top_n:
            break

    # Fallback: Add recommendations without category filtering
    if len(filtered_books) < top_n:
        remaining_indices = [idx for idx in indices if idx not in [rec[0] for rec in filtered_books]]
        for idx, sim in zip(remaining_indices, cosine_similarities[len(filtered_books):]):
            filtered_books.append((data.loc[idx, 'book_name'], sim))
            if len(filtered_books) >= top_n:
                break

    return filtered_books[:top_n], book_title

### Step 5: Recommendation UI ###
def recommend_ui(book_title):
    recommendations, book_name = recommend_books_with_faiss(book_title, data, faiss_index, embeddings, top_n=5)

    if len(recommendations) < 2:
        return "Book not found in the dataset. Please try another title."

    output_message = f"Giving results for: {book_name}\n\nRecommended Books:\n"
    recommendations_list = "\n".join([f"{rec[0]}" for rec in recommendations])
    return output_message + recommendations_list



In [None]:
recommend_ui(book_title='1984')

'Giving results for: 1984\n\nRecommended Books:\nbrave new world\nantifragile\nthe sovereign individual\ncommon sense\nbrave new world'