In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored on Drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Show long text cells (e.g. summaries) in full: a max_colwidth of None
# removes pandas' per-cell truncation when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [9]:
%cd '/content/drive/MyDrive/github/book_recommendation/book_recommender'

/content/drive/MyDrive/github/book_recommendation/book_recommender


In [10]:
# Load the raw book table from the current working directory.
# (The former f-string prefix had no placeholders, so a plain literal is used.)
data = pd.read_csv('books_summary.csv')

In [11]:
# Drop rows that lack a title or a summary
data = data.dropna(subset=['book_name', 'summaries'])
# Remove duplicates where both book_name and categories are identical
data = data.drop_duplicates(subset=['book_name', 'categories'], keep='first')
# Collapse to one row per book: keep the first summary and merge the categories.
# Missing categories are skipped while joining; a bare ', '.join would raise a
# TypeError as soon as one group contains NaN.
data = data.groupby('book_name', as_index=False).agg({
    'summaries': 'first',
    'categories': lambda cats: ', '.join(cats.dropna().astype(str)),
})
# Split the merged string back into a list; books without any category get []
data['categories_list'] = data['categories'].apply(
    lambda joined: [category for category in joined.split(', ') if category]
)
# Combine summaries with the space-joined categories for each row
data['combined_text'] = data['summaries'] + " " + data['categories_list'].map(" ".join)

In [12]:
!pip install sentence-transformers datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [14]:
from datasets import Dataset
from sentence_transformers import SentenceTransformer

# Fine-tuning the Sentence-BERT (SBERT) model on book summaries

In [None]:
# Load the base SBERT model in this cell: nothing earlier in the notebook defines
# `model`, so encoding would fail with a NameError on a fresh kernel.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every summary once; each row stores the tensor returned by model.encode
data['embeddings'] = data['summaries'].apply(lambda x: model.encode(x, convert_to_tensor=True))

In [None]:
from itertools import combinations
from sentence_transformers import SentenceTransformer, util

# Load pre-trained SBERT model (the 'all-MiniLM-L6-v2' checkpoint, referenced by name)
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_pairs(data, num_samples=1000):
    """Build a labelled training pair for every unordered pair of books.

    The label blends two signals: 0.9 * cosine similarity of the precomputed
    embeddings + 0.1 * Jaccard similarity of the category lists.

    Args:
        data (pd.DataFrame): Needs the columns 'book_name', 'summaries',
            'categories_list' (iterable of category names) and 'embeddings'
            (one precomputed embedding per row).
        num_samples (int): Currently unused. The early-stop it controlled is
            disabled, so every pair is always generated; the parameter is kept
            so existing calls keep working.

    Returns:
        pd.DataFrame: One row per pair with the columns 'book1', 'book2',
        'text1', 'text2' and 'similarity'. The columns are present even when
        `data` has fewer than two rows.
    """
    columns = ["book1", "book2", "text1", "text2", "similarity"]

    # Pull out each row's fields once (and build its category set once) instead
    # of rebuilding them for every pair the row takes part in.
    records = [
        (name, summary, set(categories), embedding)
        for name, summary, categories, embedding in zip(
            data['book_name'], data['summaries'], data['categories_list'], data['embeddings']
        )
    ]

    pairs = []
    for (name1, text1, cats1, emb1), (name2, text2, cats2, emb2) in combinations(records, 2):
        # Jaccard similarity; two books without any category share nothing,
        # so score 0 rather than dividing by an empty union.
        union_size = len(cats1 | cats2)
        jaccard_similarity = len(cats1 & cats2) / union_size if union_size else 0.0

        # Reuse precomputed embeddings
        semantic_similarity = util.pytorch_cos_sim(emb1, emb2).item()

        # Final similarity (weighted average)
        combined_similarity = 0.9 * semantic_similarity + 0.1 * jaccard_similarity

        pairs.append({
            "book1": name1,
            "book2": name2,
            "text1": text1,
            "text2": text2,
            "similarity": combined_similarity
        })

    return pd.DataFrame(pairs, columns=columns)

# Generate pairs
# NOTE: generate_pairs ignores num_samples (its early-stop is commented out),
# so this builds a row for every pair of books in `data`.
pairs_df = generate_pairs(data)

# Display the first few rows
#print(pairs_df.head())


In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader

# Define bins for similarity scores
def stratify_data(pairs_df, high_threshold=0.5, low_threshold=0.3, samples_per_bin=5000):
    """
    Split pairs_df into high / moderate / low similarity bins, draw up to
    samples_per_bin rows from each bin, and return the shuffled union.

    Args:
        pairs_df (pd.DataFrame): Frame with a 'similarity' column.
        high_threshold (float): Scores at or above this value are "high".
        low_threshold (float): Scores below this value are "low"; scores from
            low_threshold up to (but excluding) high_threshold are "moderate".
        samples_per_bin (int): Upper bound on rows drawn per bin; a smaller bin
            contributes all of its rows.

    Returns:
        pd.DataFrame: The sampled rows of all three bins, shuffled with a fixed seed.
    """
    scores = pairs_df['similarity']

    # Boolean masks in the order the bins are concatenated: high, moderate, low
    bin_masks = (
        scores >= high_threshold,
        (scores < high_threshold) & (scores >= low_threshold),
        scores < low_threshold,
    )

    sampled_bins = []
    for mask in bin_masks:
        bin_rows = pairs_df[mask]
        take = min(len(bin_rows), samples_per_bin)
        sampled_bins.append(bin_rows.sample(take, random_state=42))

    combined = pd.concat(sampled_bins)
    return shuffle(combined, random_state=42)

# Balance the training set across the three similarity bins
stratified_pairs_df = stratify_data(pairs_df, samples_per_bin=5000)

# One InputExample per sampled pair: the two summaries plus the similarity label
train_examples = [
    InputExample(texts=[first_text, second_text], label=float(score))
    for first_text, second_text, score in zip(
        stratified_pairs_df['text1'],
        stratified_pairs_df['text2'],
        stratified_pairs_df['similarity'],
    )
]

# Batches of 128 examples, reshuffled on every pass
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)

# Report how many sampled pairs landed in each bin (same 0.5 / 0.3 cut-offs as the stratify_data defaults)
sampled_scores = stratified_pairs_df['similarity']
print("High Similarity Samples:", int((sampled_scores >= 0.5).sum()))
print("Moderate Similarity Samples:", int(((sampled_scores < 0.5) & (sampled_scores >= 0.3)).sum()))
print("Low Similarity Samples:", int((sampled_scores < 0.3).sum()))


In [None]:
# Re-create the model from the 'all-MiniLM-L6-v2' checkpoint before training.
model = SentenceTransformer('all-MiniLM-L6-v2')
import os
# Sets the WANDB_DISABLED environment variable for this process
# (presumably to keep the training run from logging to Weights & Biases — TODO confirm).
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from sentence_transformers import losses

# Define the loss function: regress the cosine similarity of the two summary
# embeddings onto the pair's similarity label
train_loss = losses.CosineSimilarityLoss(model)

NUM_EPOCHS = 10
# 10% of all optimisation steps. The previous value, int(0.01 * len(train_dataloader)),
# was 1% of a single epoch and contradicted its own "10% warmup" comment.
WARMUP_STEPS = int(0.1 * len(train_dataloader) * NUM_EPOCHS)
# No visible cell defines `path`; fall back to the current working directory
# so a fresh kernel does not fail with a NameError.
output_prefix = globals().get('path', '')

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    output_path=f'{output_prefix}fine_tuned_sbert'
)


# Recommendation function

In [29]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from rapidfuzz import process


### Step 1: Data Loading ###
def load_data(file_path):
    """Read the book dataset from a CSV file and check its schema.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed file.

    Raises:
        FileNotFoundError: If the file cannot be read or parsed (any error from
            pd.read_csv is wrapped, with the original error attached as the cause).
        ValueError: If one of the columns 'book_name', 'summaries' or
            'categories' is missing.
    """
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise FileNotFoundError(f"Error loading file: {e}") from e

    # Validate outside the try block: the schema error used to be caught by the
    # blanket handler and re-raised as a misleading FileNotFoundError.
    required_columns = ['book_name', 'summaries', 'categories']
    if not all(col in data.columns for col in required_columns):
        raise ValueError("Dataset must contain 'book_name', 'summaries', and 'categories' columns.")
    return data


### Step 2: Preprocessing ###
def preprocess_data(data):
    """Collapse the raw table to one row per book.

    Steps: drop rows without a title or summary, drop exact
    (book_name, categories) duplicates, then group by title keeping the first
    summary and merging the categories.

    Args:
        data (pd.DataFrame): Needs the columns 'book_name', 'summaries' and
            'categories'. The input frame is not modified.

    Returns:
        pd.DataFrame: Columns 'book_name', 'summaries', 'categories'
        (', '-joined string), 'categories_list' (list of category names, empty
        when the book has none) and 'combined_text' (summary followed by the
        space-joined categories).
    """
    def join_categories(categories):
        # Skip missing values: a bare ', '.join raises TypeError on NaN
        return ', '.join(categories.dropna().astype(str))

    cleaned = (
        data.dropna(subset=['book_name', 'summaries'])
        .drop_duplicates(subset=['book_name', 'categories'], keep='first')
    )
    books = cleaned.groupby('book_name', as_index=False).agg({
        'summaries': 'first',
        'categories': join_categories
    })
    # Drop empty fragments so a book without categories gets [] instead of ['']
    books['categories_list'] = books['categories'].apply(
        lambda joined: [category for category in joined.split(', ') if category]
    )
    books['combined_text'] = books['summaries'] + " " + books['categories_list'].map(" ".join)
    return books


### Step 3: Embedding Loading ###
def load_embeddings(embedding_path):
    """Load an embedding array with np.load.

    Args:
        embedding_path: Location of the saved array.

    Returns:
        Whatever np.load returns for that file.

    Raises:
        FileNotFoundError: For any failure while loading; the message carries
            the text of the underlying error.
    """
    try:
        return np.load(embedding_path)
    except Exception as load_error:
        raise FileNotFoundError(f"Error loading embeddings: {load_error}")


### Step 4: Recommendation Generation ###

from rapidfuzz import process

def recommend_books_with_category_filter(book_title, data, embeddings, top_n=5):
    """Recommend the books most similar to `book_title` that share a category with it.

    Titles are compared case-insensitively. When there is no exact match, the
    closest title according to rapidfuzz's default scorer is used, provided its
    score is at least 70.

    Args:
        book_title (str): Title to look up.
        data (pd.DataFrame): Needs the columns 'book_name' and
            'categories_list'. It is not modified.
        embeddings: One embedding per row of `data`, in the same row order.
        top_n (int): Maximum number of recommendations.

    Returns:
        list: [lower-cased title, cosine similarity] pairs sorted by descending
        similarity, or ["Book not found in the dataset."] when no title
        matches well enough.
    """
    query = book_title.lower()
    # Compare against a lower-cased copy of the titles rather than overwriting
    # the caller's 'book_name' column.
    titles = data['book_name'].str.lower().to_numpy()

    # Check for exact match
    if query not in titles:
        # Use fuzzy matching to find the closest match
        closest_match = process.extractOne(query, titles)
        if closest_match is None or closest_match[1] < 70:  # Set a threshold for similarity
            return ["Book not found in the dataset."]
        query = closest_match[0]  # Use the closest matching book name
        print(f"Giving results for: {query}")

    # Row *position* of the input book. Embeddings are positional, so an index
    # label would be wrong whenever `data` does not carry a default RangeIndex.
    input_pos = int(np.flatnonzero(titles == query)[0])
    input_embedding = embeddings[input_pos]
    input_categories = set(data['categories_list'].iloc[input_pos])

    # Compute cosine similarity of the input book against every book
    similarity_scores = cosine_similarity([input_embedding], embeddings).flatten()

    # Keep books that share at least one category with the input book
    keep = np.array(
        [len(set(categories) & input_categories) > 0 for categories in data['categories_list']],
        dtype=bool,
    )
    # Exclude the input book itself; merely scoring it -1 let it reappear
    # whenever fewer than top_n other books qualified.
    keep[input_pos] = False

    candidates = pd.DataFrame({
        'book_name': titles[keep],
        'similarity': similarity_scores[keep],
    })

    # Sort by similarity score and select top_n recommendations
    recommended_books = candidates.sort_values(by='similarity', ascending=False).head(top_n)

    return recommended_books[['book_name', 'similarity']].values.tolist()








In [67]:

### Main Workflow ###
# Load data and embeddings
data = preprocess_data(load_data('books_summary.csv'))
# Titles are matched in lower case by the recommender
data['book_name'] = data['book_name'].str.lower()
embeddings = load_embeddings('book_embeddings.npy')
# Embeddings are looked up by row position, so the file must hold exactly one
# vector per preprocessed book; fail early with a clear message otherwise.
assert len(embeddings) == len(data), (
    f"book_embeddings.npy has {len(embeddings)} rows but the preprocessed data has {len(data)} books"
)

In [78]:
def recommend_books_with_category_filter(book_title, data, embeddings, top_n=5, min_similarity=60):
    """Recommend the books most similar to `book_title` that share a category with it.

    The title is lower-cased and looked up in data['book_name'] (which the
    caller is expected to have lower-cased already). Without an exact match,
    the closest title by rapidfuzz's token_sort_ratio is used if its score
    reaches `min_similarity`.

    Args:
        book_title (str): Title to look up.
        data (pd.DataFrame): Needs the columns 'book_name' and
            'categories_list'. It is not modified.
        embeddings: One embedding per row of `data`, in the same row order.
        top_n (int): Maximum number of recommendations.
        min_similarity (int | float): Minimum fuzzy-match score; capped at 50
            for purely numeric titles.

    Returns:
        list: [title, cosine similarity] pairs sorted by descending similarity,
        or a one-element list holding a "no close match" message.
    """
    # Normalize book titles to lowercase
    book_title = book_title.lower()

    # Numeric titles get a more lenient threshold; min() keeps an even lower
    # threshold the caller may have passed explicitly.
    if book_title.isdigit():
        min_similarity = min(min_similarity, 50)

    titles = data['book_name'].to_numpy()

    if book_title not in titles:
        # `fuzz` is not imported anywhere else in the notebook, so import it
        # where it is needed instead of relying on leftover kernel state.
        from rapidfuzz import fuzz

        closest_match = process.extractOne(
            book_title,
            titles,
            scorer=fuzz.token_sort_ratio
        )

        if closest_match is None or closest_match[1] < min_similarity:
            return [f"No close match found for '{book_title}'. Please try another title."]

        book_title = closest_match[0]  # Use the closest matching book name
        print(f"Giving results for: {book_title}")

    # Row *position* of the input book. Embeddings are positional, so an index
    # label would be wrong whenever `data` does not carry a default RangeIndex.
    input_pos = int(np.flatnonzero(titles == book_title)[0])
    input_embedding = embeddings[input_pos]
    input_categories = set(data['categories_list'].iloc[input_pos])

    # Compute cosine similarity of the input book against every book
    similarity_scores = cosine_similarity([input_embedding], embeddings).flatten()

    # Keep books that share at least one category with the input book
    keep = np.array(
        [len(set(categories) & input_categories) > 0 for categories in data['categories_list']],
        dtype=bool,
    )
    # Exclude the input book itself; merely scoring it -1 let it reappear
    # whenever fewer than top_n other books qualified.
    keep[input_pos] = False

    candidates = pd.DataFrame({
        'book_name': titles[keep],
        'similarity': similarity_scores[keep],
    })
    recommended_books = candidates.sort_values(by='similarity', ascending=False).head(top_n)

    return recommended_books[['book_name', 'similarity']].values.tolist()

### Main Workflow ###



In [87]:
book_title = ' a tale of 5  cities'
recommendations = recommend_books_with_category_filter(book_title, data, embeddings, top_n=5)

# The recommender signals "no match" with a single message string. Detect that
# by type: the former `len(recommendations) < 2` test also fired when exactly
# one genuine recommendation came back.
no_match = len(recommendations) == 1 and isinstance(recommendations[0], str)
if no_match:
    print("Book not found in the dataset. Please try another title.")
elif not recommendations:
    print("No other book shares a category with this title.")
else:
    print([f"{rec[0]} (Similarity: {rec[1]:.4f})" for rec in recommendations])

Giving results for: a tale of two cities
['happy together (Similarity: 0.5046)', 'say nothing (Similarity: 0.4844)', 'the social animal (Similarity: 0.4583)', 'the drama of the gifted child (Similarity: 0.4539)', 'the great gatsby is an american classic following jay gatsby’s quest to win back his long-lost love by faking a successful life,\xa0 (Similarity: 0.4533)']


In [83]:
# Take the last two titles of the frame (as an array slice) for a quick spot check
books = data['book_name'].values[-2:]

In [84]:
# Print each selected title on its own line
for book in books:
  print(book)

zero to one
iwoz
