In [1]:
!git clone https://github.com/marvalkrystof/Book-Recommender-System.git
%cd Book-Recommender-System
!pip install -r requirements.txt -q
!pip install sentence-transformers -q

Cloning into 'Book-Recommender-System'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 79 (delta 31), reused 19 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (79/79), 28.72 MiB | 17.66 MiB/s, done.
Resolving deltas: 100% (31/31), done.
Updating files: 100% (8/8), done.
/content/Book-Recommender-System
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m142.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import json
import ast
import re
import os
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
def find_file(filename):
    for root, _, files in os.walk("."):
        if filename in files: return os.path.join(root, filename)
    return None

DRIVE_FOLDER = "/content/drive/MyDrive/TA2_semeatralka"
CSV_PATH = os.path.join(DRIVE_FOLDER, "processed_data_2.3.csv")
NPY_PATH = os.path.join(DRIVE_FOLDER, "book_embeddings.npy")
QUERY_NB_PATH = find_file("feautre_extraction_test.ipynb")

In [19]:
# Load CSV
if not os.path.exists(CSV_PATH): raise FileNotFoundError(f"Missing CSV at {CSV_PATH}")
df = pd.read_csv(CSV_PATH)
# Parse lists
for col in ['Authors', 'Category']:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Load Embeddings
if not os.path.exists(NPY_PATH): raise FileNotFoundError(f"Missing NPY at {NPY_PATH}")
doc_embeddings = np.load(NPY_PATH)

# Load Queries
queries = []
with open(QUERY_NB_PATH, 'r') as f:
    nb = json.load(f)
for cell in nb['cells']:
    if 'outputs' in cell:
        for out in cell['outputs']:
            if 'text' in out:
                text = "".join(out['text'])
                matches = re.findall(r"Query (\d+): (.*?)\nFilters: (\{.*?\})", text, re.DOTALL)
                for m in matches:
                    try: queries.append({'id': m[0], 'text': m[1].strip(), 'filters': ast.literal_eval(m[2])})
                    except: pass
queries = list({q['id']: q for q in queries}.values())

print(f"Loaded: {len(df)} books, {doc_embeddings.shape} embeddings, {len(queries)} queries.")

Loaded: 67655 books, (26414, 1024) embeddings, 50 queries.


In [20]:
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
TOP_K_RETRIEVAL = 50
TOP_K_RERANK = 10

# Initialize Models
device = "cuda" if torch.cuda.is_available() else "cpu"
q_model = SentenceTransformer(EMBEDDING_MODEL, device=device)
reranker = CrossEncoder(RERANKER_MODEL)

# Encode Queries
q_embeddings = q_model.encode([q['text'] for q in queries], normalize_embeddings=True, show_progress_bar=True)

# Metric Helper
def calc_metrics(retrieved, relevant):
    if not relevant: return 0, 0, 0
    acc = 1 if retrieved[0] in relevant else 0
    p10 = sum(1 for x in retrieved[:10] if x in relevant) / 10.0
    score, hits = 0.0, 0.0
    for i, idx in enumerate(retrieved):
        if idx in relevant:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(relevant), len(retrieved)), acc, p10

# Evaluation Loop
res_base, res_rerank = {'ap': [], 'acc': [], 'p10': []}, {'ap': [], 'acc': [], 'p10': []}

# Stage 1: Vector Search
scores = np.dot(q_embeddings, doc_embeddings.T)

for i, q in enumerate(queries):
    # Ground Truth
    mask = pd.Series([True] * len(df))
    if q['filters'].get('author'):
        mask = mask & df['Authors'].apply(lambda x: any(q['filters']['author'].lower() in a.lower() for a in x))
    if q['filters'].get('categories'):
        targets = [c.lower() for c in q['filters']['categories']]
        mask = mask & df['Category'].apply(lambda cats: any(t in c.lower() for t in targets for c in cats))
    relevant = set(df[mask].index)
    if not relevant: continue

    # Stage 1: Retrieval
    top_50 = scores[i].argsort()[::-1][:TOP_K_RETRIEVAL]
    ap, acc, p10 = calc_metrics(top_50, relevant)
    res_base['ap'].append(ap); res_base['acc'].append(acc); res_base['p10'].append(p10)

    # Stage 2: Reranking
    pairs = []
    for idx in top_50:
        authors_str = ", ".join(df.iloc[idx]['Authors']) if isinstance(df.iloc[idx]['Authors'], list) else str(df.iloc[idx]['Authors'])
        book_text = f"Title: {df.iloc[idx]['Title']} Author: {authors_str} Description: {df.iloc[idx]['Description']}"
        pairs.append([q['text'], book_text])
    rerank_scores = reranker.predict(pairs)

    # Re-sort top 50
    sorted_local = np.argsort(rerank_scores)[::-1]
    top_reranked = [top_50[j] for j in sorted_local]

    ap_r, acc_r, p10_r = calc_metrics(top_reranked, relevant)
    res_rerank['ap'].append(ap_r); res_rerank['acc'].append(acc_r); res_rerank['p10'].append(p10_r)

# Results
metrics_df = pd.DataFrame({
    'Metric': ['MAP', 'Accuracy', 'Precision@10'],
    'Stage 1 (Embeddings)': [np.mean(res_base['ap']), np.mean(res_base['acc']), np.mean(res_base['p10'])],
    'Stage 2 (Reranking)': [np.mean(res_rerank['ap']), np.mean(res_rerank['acc']), np.mean(res_rerank['p10'])]
})
print("Evaluation Results:")
display(metrics_df.round(4))

lift = (np.mean(res_rerank['ap']) - np.mean(res_base['ap'])) / np.mean(res_base['ap']) * 100
print(f"Reranking MAP Lift: {lift:+.2f}%")

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluation Results:


Unnamed: 0,Metric,Stage 1 (Embeddings),Stage 2 (Reranking)
0,MAP,0.2182,0.2092
1,Accuracy,0.4762,0.4762
2,Precision@10,0.381,0.3143


Reranking MAP Lift: -4.11%


In [21]:
print("Based on the evaluation metrics, the Reranking stage (Stage 2) degrades performance.")
print(f"PRECISION DROP: {(0.3143 - 0.3810)/0.3810*100:.2f}%")

Based on the evaluation metrics, the Reranking stage (Stage 2) degrades performance.
PRECISION DROP: -17.51%


In [22]:
# DATA ALIGNMENT & HYBRID SEARCH
# Align Data with Embeddings
if len(df) != doc_embeddings.shape[0]:
    df = df.iloc[:doc_embeddings.shape[0]].copy()

df['search_text'] = df['Title'] + " " + df['Description'].fillna('')

# Hybrid Search (TF-IDF + Embeddings)
# TF-IDF
vec = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_mat = vec.fit_transform(df['search_text'])
q_vec_tfidf = vec.transform([q['text'] for q in queries])
scores_tfidf = cosine_similarity(q_vec_tfidf, tfidf_mat)

# Recalculate Embedding Scores (to match trimmed data)
device = "cuda" if torch.cuda.is_available() else "cpu"
q_model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)
q_embeddings = q_model.encode([q['text'] for q in queries], normalize_embeddings=True)
scores_emb = np.dot(q_embeddings, doc_embeddings.T)

# Combine Scores
hybrid_scores = (0.7 * scores_emb) + (0.3 * scores_tfidf)

# Evaluate Hybrid
hyb_res = {'ap': [], 'acc': [], 'p10': []}
for i, q in enumerate(queries):
    mask = pd.Series([True] * len(df))
    if q['filters'].get('author'):
        mask = mask & df['Authors'].apply(lambda x: any(q['filters']['author'].lower() in a.lower() for a in x))
    if q['filters'].get('categories'):
        targets = [c.lower() for c in q['filters']['categories']]
        mask = mask & df['Category'].apply(lambda cats: any(t in c.lower() for t in targets for c in cats))
    relevant = set(df[mask].index)
    if not relevant: continue

    top = hybrid_scores[i].argsort()[::-1][:50]

    # Metrics
    acc = 1 if top[0] in relevant else 0
    p10 = sum(1 for x in top[:10] if x in relevant) / 10.0
    score, hits = 0.0, 0.0
    for rank, idx in enumerate(top):
        if idx in relevant:
            hits += 1
            score += hits / (rank + 1)
    ap = score / min(len(relevant), 50)
    hyb_res['ap'].append(ap); hyb_res['acc'].append(acc); hyb_res['p10'].append(p10)

print("Hybrid Search Results:")
print(f"MAP: {np.mean(hyb_res['ap']):.4f}")
print(f"Accuracy: {np.mean(hyb_res['acc']):.4f}")
print(f"Precision@10: {np.mean(hyb_res['p10']):.4f}")

Hybrid Search Results:
MAP: 0.2218
Accuracy: 0.3684
Precision@10: 0.3263


In [35]:
# Alternative Reranking (BAAI/bge-reranker-base)
NEW_RERANKER = "BAAI/bge-reranker-v2-m3"
print(f"Loading Improved Reranker: {NEW_RERANKER}...")

reranker_bge = CrossEncoder(NEW_RERANKER, device=device)
bge_results = {'ap': [], 'acc': [], 'p10': []}

for i, q in enumerate(queries):
    # Ground Truth Logic
    mask = pd.Series([True] * len(df))
    if q['filters'].get('author'):
        mask = mask & df['Authors'].apply(lambda x: any(q['filters']['author'].lower() in a.lower() for a in x))
    if q['filters'].get('categories'):
        targets = [c.lower() for c in q['filters']['categories']]
        mask = mask & df['Category'].apply(lambda cats: any(t in c.lower() for t in targets for c in cats))

    relevant = set(df[mask].index)
    if not relevant: continue

    # Get Candidates (Top 50 from Hybrid Search)
    top_50 = hybrid_scores[i].argsort()[::-1][:50]

    # Prepare Pairs (Simplified format for v2 model)
    pairs = []
    for idx in top_50:
        # Simple format: Title + Description
        # Truncate to 1000 chars to prevent issues
        text = f"{df.iloc[idx]['Title']}: {df.iloc[idx]['Description']}"
        pairs.append([q['text'], text[:1000]])

    # Predict & Sort
    scores = reranker_bge.predict(pairs)
    sorted_indices = np.argsort(scores)[::-1]
    top_reranked = [top_50[j] for j in sorted_indices]

    # Metrics
    ap, acc, p10 = calc_metrics(top_reranked, relevant)
    bge_results['ap'].append(ap)
    bge_results['acc'].append(acc)
    bge_results['p10'].append(p10)

print("BGE Reranker Results:")
print(f"MAP: {np.mean(bge_results['ap']):.4f}")
print(f"Accuracy: {np.mean(bge_results['acc']):.4f}")
print(f"Precision@10: {np.mean(bge_results['p10']):.4f}")

Loading Improved Reranker: BAAI/bge-reranker-v2-m3...
BGE Reranker Results:
MAP: 0.2345
Accuracy: 0.4211
Precision@10: 0.3737


In [27]:
# Alternative Reranking (BAAI/bge-reranker-base)
NEW_RERANKER = "BAAI/bge-reranker-base"
print(f"Loading Reranker: {NEW_RERANKER}...")

reranker_bge = CrossEncoder(NEW_RERANKER, device=device)
bge_results = {'ap': [], 'acc': [], 'p10': []}

for i, q in enumerate(queries):
    # Ground Truth Logic
    mask = pd.Series([True] * len(df))
    if q['filters'].get('author'):
        mask = mask & df['Authors'].apply(lambda x: any(q['filters']['author'].lower() in a.lower() for a in x))
    if q['filters'].get('categories'):
        targets = [c.lower() for c in q['filters']['categories']]
        mask = mask & df['Category'].apply(lambda cats: any(t in c.lower() for t in cats for c in cats))

    relevant = set(df[mask].index)
    if not relevant: continue

    # Get Candidates (Top 50 from Hybrid Search)
    top_50 = hybrid_scores[i].argsort()[::-1][:50]

    # Prepare Pairs for Reranker
    pairs = []
    for idx in top_50:
        auth = ", ".join(df.iloc[idx]['Authors'])
        text = f"Title: {df.iloc[idx]['Title']} | Author: {auth} | Description: {df.iloc[idx]['Description']}"
        pairs.append([q['text'], text])

    # Predict & Sort
    bge_scores = reranker_bge.predict(pairs)
    sorted_indices = np.argsort(bge_scores)[::-1]
    top_reranked = [top_50[j] for j in sorted_indices]

    # Calculate Metrics
    ap, acc, p10 = calc_metrics(top_reranked, relevant)
    bge_results['ap'].append(ap)
    bge_results['acc'].append(acc)
    bge_results['p10'].append(p10)


print("BGE Reranker Results:")
print(f"MAP: {np.mean(bge_results['ap']):.4f}")
print(f"Accuracy: {np.mean(bge_results['acc']):.4f}")
print(f"Precision@10: {np.mean(bge_results['p10']):.4f}")

Loading Reranker: BAAI/bge-reranker-base...
BGE Reranker Results:
MAP: 0.0476
Accuracy: 0.0476
Precision@10: 0.0476


In [28]:
# Reranking 3 (BAAI/bge-reranker-large - State of the Art)
RERANKER_3 = "BAAI/bge-reranker-large"
print(f"Running Reranker 3 ({RERANKER_3})...")

reranker_large = CrossEncoder(RERANKER_3, device=device)
bge_large_results = {'ap': [], 'acc': [], 'p10': []}

for i, q in enumerate(queries):
    mask = pd.Series([True] * len(df))
    if q['filters'].get('author'):
        mask = mask & df['Authors'].apply(lambda x: any(q['filters']['author'].lower() in a.lower() for a in x))
    if q['filters'].get('categories'):
        targets = [c.lower() for c in q['filters']['categories']]
        mask = mask & df['Category'].apply(lambda cats: any(t in c.lower() for t in targets for c in cats))
    relevant = set(df[mask].index)
    if not relevant: continue

    # Top 50 from Embeddings
    top_50 = scores_emb[i].argsort()[::-1][:50]

    pairs = []
    for idx in top_50:
        auth_str = ", ".join(df.iloc[idx]['Authors'])
        text = f"Title: {df.iloc[idx]['Title']} | Author: {auth_str} | Description: {df.iloc[idx]['Description']}"
        pairs.append([q['text'], text])

    rerank_scores = reranker_large.predict(pairs)
    top_large = [top_50[j] for j in np.argsort(rerank_scores)[::-1]]

    ap_r, acc_r, p10_r = calc_metrics(top_large, relevant)
    bge_large_results['ap'].append(ap_r)
    bge_large_results['acc'].append(acc_r)
    bge_large_results['p10'].append(p10_r)

print("BGE Large Reranker Results:")
print(f"MAP: {np.mean(bge_large_results['ap']):.4f}")
print(f"Accuracy: {np.mean(bge_large_results['acc']):.4f}")
print(f"Precision@10: {np.mean(bge_large_results['p10']):.4f}")

Running Reranker 3 (BAAI/bge-reranker-large)...


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

BGE Large Reranker Results:
MAP: 0.2523
Accuracy: 0.5263
Precision@10: 0.3947


In [43]:
# Final Comprehensive Comparison
metrics = {
    'Embeddings': res_base,
    'MiniLM Reranker': res_rerank,
    'Hybrid': hyb_res,
    'BGE Reranker': bge_results,
    'BGE Large': bge_large_results
}

summary = pd.DataFrame({
    'Metric': ['MAP', 'Accuracy', 'Precision@10'],
    'Embeddings (Baseline)': [
        np.mean(metrics['Embeddings']['ap']), np.mean(metrics['Embeddings']['acc']), np.mean(metrics['Embeddings']['p10'])
    ],
    'MiniLM (Rerank 1)': [
        np.mean(metrics['MiniLM Reranker']['ap']), np.mean(metrics['MiniLM Reranker']['acc']), np.mean(metrics['MiniLM Reranker']['p10'])
    ],
    'Hybrid (TF-IDF + Embeddings)': [
        np.mean(metrics['Hybrid']['ap']), np.mean(metrics['Hybrid']['acc']), np.mean(metrics['Hybrid']['p10'])
    ],
    'BGE Base (Rerank 2)': [
        np.mean(metrics['BGE Reranker']['ap']), np.mean(metrics['BGE Reranker']['acc']), np.mean(metrics['BGE Reranker']['p10'])
    ],
    'BGE Large (Rerank 3)': [
        np.mean(metrics['BGE Large']['ap']), np.mean(metrics['BGE Large']['acc']), np.mean(metrics['BGE Large']['p10'])
    ]
})

# Formatting
print("\n  Final Model Performance Comparison")
print(summary.round(4).to_string(index=False))

# Winner Logic
best_model = summary.set_index('Metric').loc['MAP'].idxmax()
print(f"\n🏆 Best Performing Model: {best_model}")


  Final Model Performance Comparison
      Metric  Embeddings (Baseline)  MiniLM (Rerank 1)  Hybrid (TF-IDF + Embeddings)  BGE Base (Rerank 2)  BGE Large (Rerank 3)
         MAP                 0.2182             0.2092                        0.2218               0.2345                0.2523
    Accuracy                 0.4762             0.4762                        0.3684               0.4211                0.5263
Precision@10                 0.3810             0.3143                        0.3263               0.3737                0.3947

🏆 Best Performing Model: BGE Large (Rerank 3)
