## 2.3

In [None]:
import pandas as pd
# Processed book dataset; later cells read its Description, Topic_Tags,
# Topic_Label, Title and Authors columns.
data_df = pd.read_csv("processed_data_2.3.csv")

## Generating Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from tqdm import tqdm

# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print("Available device:", torch.cuda.get_device_name(0))
else:
    print("Available device:", "CPU")

# Load the BGE embedding model directly onto the selected device.
model = SentenceTransformer("BAAI/bge-large-en-v1.5", device=device)

In [None]:
# Move the model to the device selected in the previous cell; a hard-coded
# "cuda" raises on machines without a GPU.
model = model.to(device)

Each book's description is enriched with topic labels from BERTopic and keywords extracted with KeyBERT before it is embedded.

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Select the compute device: GPU if torch reports one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Load the BGE embedding model and move it onto the selected device.
model = SentenceTransformer("BAAI/bge-large-en-v1.5").to(device)

def combine_text(row):
    """Build the text that gets embedded for one book.

    The result has the form ``"<Description> Keywords: <tags> Topic: <label>"``.

    Args:
        row: Mapping-like record (DataFrame row or dict) with "Description"
            and "Topic_Label" entries and an optional "Topic_Tags" entry.
            "Topic_Tags" may be a list/tuple of tags, the string repr of such
            a list (what a list column looks like after a CSV round-trip),
            a plain string, or missing (None / NaN).

    Returns:
        str: The combined text. Missing values contribute an empty string.

    Raises:
        KeyError: If "Description" or "Topic_Label" is absent from ``row``.
    """
    import ast  # local import: only needed to decode stringified tag lists

    def _is_missing(value):
        # None, or a float NaN (NaN is the only float not equal to itself).
        return value is None or (isinstance(value, float) and value != value)

    def _as_text(value):
        return "" if _is_missing(value) else str(value)

    tags = row["Topic_Tags"] if "Topic_Tags" in row else None

    if isinstance(tags, str):
        # Joining a string directly would put a space between every
        # character, so decode "['a', 'b']"-style values back into a list.
        try:
            decoded = ast.literal_eval(tags)
        except (ValueError, SyntaxError):
            decoded = None
        if isinstance(decoded, (list, tuple)):
            tags = decoded

    if _is_missing(tags):
        kws = ""
    elif isinstance(tags, str):
        # Not a list literal: use the text as written.
        kws = tags
    else:
        kws = " ".join(str(tag) for tag in tags)

    base = _as_text(row["Description"])
    topic = _as_text(row["Topic_Label"])
    return f"{base} Keywords: {kws} Topic: {topic}"

# Compose the enriched text for every book, one string per row.
data_df["Final_Text"] = data_df.apply(combine_text, axis="columns")

# Encode all texts in batches of 64; normalize_embeddings=True asks the
# encoder to return normalized vectors.
embeddings = model.encode(
    list(data_df["Final_Text"]),
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=64,
    device=device,
)

# Persist the vectors so they can be reloaded without re-encoding.
np.save("book_embeddings.npy", embeddings)
print("Saved book_embeddings.npy successfully")


In [None]:
!pip install faiss-cpu

## Building the Semantic Search Index with FAISS

In [None]:
import faiss
import numpy as np

# Length of each embedding vector (second axis of the embeddings array).
dim = embeddings.shape[1]

# Flat inner-product index. The book vectors were encoded with
# normalize_embeddings=True, so the inner product acts as cosine similarity.
index = faiss.IndexFlatIP(dim)  # Cosine similarity (normalized vectors)
index.add(embeddings)

# Write the index to disk so it can be reloaded without re-indexing.
faiss.write_index(index, "book_faiss.index")


## Importing saved embeddings and FAISS index

In [None]:
import faiss
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

# Book metadata used to turn search hits back into titles and descriptions
data_df = pd.read_csv("processed_data_2.3.csv")
# Book embedding matrix saved by the embedding step
embeddings = np.load("book_embeddings.npy")
# Serialized FAISS index built from those embeddings
index = faiss.read_index("book_faiss.index")
# Encoder for search queries (same checkpoint name as used for the books)
query_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

In [None]:
# Sanity check: embedding matrix shape, then the number of vectors in the index.
print(embeddings.shape, index.ntotal, sep="\n")

## Testing book queries

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def search_books(query, index, data_df, top_k=10):
    """Return the books most similar to a free-text query.

    The query is embedded with the notebook-level ``query_model`` and looked
    up in ``index``; every returned id is used as a positional row number
    into ``data_df``.

    Args:
        query: Search text.
        index: Index exposing a FAISS-style ``search(vectors, k)`` method that
            returns ``(scores, ids)`` arrays with one row per query vector.
        data_df: DataFrame with "Title", "Authors" and "Description" columns
            ("Topic_Label" is optional). Assumed to be row-aligned with the
            vectors stored in ``index`` -- TODO confirm when swapping datasets.
        top_k: Number of neighbours requested from the index.

    Returns:
        pandas.DataFrame: Columns Book_ID, Title, Authors, Description, Topic
        and Score, one row per valid hit in the order returned by the index.
        May hold fewer than ``top_k`` rows when the index has fewer vectors.
    """
    columns = ["Book_ID", "Title", "Authors", "Description", "Topic", "Score"]

    # 1. Embed query
    q_emb = query_model.encode(
        [query],
        normalize_embeddings=True,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # 2. FAISS search
    scores, ids = index.search(q_emb, top_k)

    # 3. results DataFrame
    result_rows = []
    for score, idx in zip(scores[0], ids[0]):
        if idx < 0:
            # FAISS pads missing neighbours with id -1; iloc[-1] would
            # silently report the last book in the dataset as a hit.
            continue
        book = data_df.iloc[idx]  # look the row up once, not once per field
        result_rows.append({
            "Book_ID": int(idx),
            "Title": book["Title"],
            "Authors": book["Authors"],
            "Description": book["Description"],
            "Topic": book.get("Topic_Label", None),
            "Score": float(score)
        })
    # Passing columns keeps the result schema stable when there are no hits.
    return pd.DataFrame(result_rows, columns=columns)

In [None]:
# Show full description text instead of truncating long cells.
pd.set_option("display.max_colwidth", None)

# Queries tried: "a cozy magical story about friendship and grief", "a book about nationalism and bravery"
result = search_books(
    query="a book about nationalism and bravery",
    index=index,
    data_df=data_df,
    top_k=10,
)
result["Description"]

For the query "a cozy magical story about friendship and grief", out of the top 10 returned books:

- 7/10 were highly relevant, matching at least 3 of the 4 target themes
(cozy, magical, friendship, grief)
- 2/10 were moderately relevant (friendship + emotional themes, but not magical)
- 1/10 was an outlier (non-fiction friendship gift book)