### Load Dataframes

In [3]:
import pandas as pd
df_books = pd.read_csv('books_clean.csv')
df_authors = pd.read_csv('authors_clean.csv')
df_reviews = pd.read_csv('reviews_clean.csv')

In [4]:
print(df_books.columns) 
df_books.head()

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'main_author', 'genre',
       'review_count', 'avg_rating', 'is_indie'],
      dtype='object')


Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,main_author,genre,review_count,avg_rating,is_indie
0,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],Philip Nel,Biography & Autobiography,9,4.555556,False
1,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,self-published,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],David R Ray,Religion,4,5.0,True
2,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],Veronica Haddon,Fiction,32,3.71875,False
3,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],http://books.google.com/books/content?id=kVqRa...,http://books.google.nl/books?id=kVqRaiPlx88C&p...,Wm. B. Eerdmans Publishing,1996,http://books.google.nl/books?id=kVqRaiPlx88C&d...,['Religion'],Everett Ferguson,Religion,4,4.5,False
4,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,['Mary Fabyan Windeatt'],http://books.google.com/books/content?id=lmLqA...,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,Tan Books & Pub,2009-01-01,http://books.google.nl/books?id=lmLqAAAACAAJ&d...,['Biography & Autobiography'],Mary Fabyan Windeatt,Biography & Autobiography,2,4.5,False


In [5]:
print(df_authors.columns)
df_authors.head()

Index(['main_author', 'total_books', 'total_reviews', 'is_self_published',
       'is_indie'],
      dtype='object')


Unnamed: 0,main_author,total_books,total_reviews,is_self_published,is_indie
0,(Dr) Seuss,1,3,True,True
1,Augustine,1,1,False,False
2,Blizzard Entertainment,1,13,False,False
3,Deiss,1,9,False,False
4,Meystre-Sargent,1,2,False,False


In [6]:
print(df_reviews.columns)
df_reviews.head()

Index(['ISBN', 'Title', 'rating', 'review_text'], dtype='object')


Unnamed: 0,ISBN,Title,rating,review_text
0,826414346,Dr. Seuss: American Icon,5.0,I don't care much for Dr. Seuss but after read...
1,826414346,Dr. Seuss: American Icon,5.0,"If people become the books they read and if ""t..."
2,826414346,Dr. Seuss: American Icon,4.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
3,826414346,Dr. Seuss: American Icon,4.0,Philip Nel - Dr. Seuss: American IconThis is b...
4,826414346,Dr. Seuss: American Icon,4.0,"""Dr. Seuss: American Icon"" by Philip Nel is a ..."


## Indie book recommendations with TF‑IDF + cosine similarity

We’ll recommend indie books based on textual similarity of reviews.

High‑level steps:
- Normalize titles to improve joins.
- Aggregate all review texts per book (Title).
- Build a TF‑IDF matrix of these texts.
- Given 2–3 favorite titles, average their TF‑IDF vectors to form a “preference vector”.
- Compute cosine similarity between this vector and all books, then return the top indie titles only.

Notes:
- If a book has no reviews, we’ll optionally fall back to its description to avoid empty text.
- Favorites not found (or with no text) are ignored with a warning.
- We exclude the input favorites from the final results by default.

In [7]:
# Imports for modeling and utilities
import numpy as np
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.decomposition import TruncatedSVD
    SKLEARN_AVAILABLE = True
except Exception as e:
    SKLEARN_AVAILABLE = False
    print("scikit-learn is not available. Please install it to run the recommender.")
    print("Error:", e)

import re
from typing import List, Tuple

In [8]:
# Prepare aggregated review text per Title and join with books
def _normalize_title(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return re.sub(r"\s+", " ", s.strip().lower())

# Create normalized keys for joining
df_books['Title_norm'] = df_books['Title'].apply(_normalize_title)
df_reviews['Title_norm'] = df_reviews['Title'].apply(_normalize_title)

# Aggregate review texts by Title
agg_reviews = (
    df_reviews
    .dropna(subset=['review_text'])
    .groupby('Title_norm', as_index=False)
    .agg({
        'review_text': lambda s: " \n".join(map(str, s)),
        'rating': 'mean'
    })
    .rename(columns={'rating': 'avg_review_rating_from_reviews'})
)

# Merge with books; keep description and aggregated review text separate
df_books_text = df_books.merge(agg_reviews, on='Title_norm', how='left')
df_books_text['review_text_agg'] = df_books_text['review_text'].fillna("")
df_books_text['desc_text'] = df_books_text['description'].fillna("")

# Optional: enrich review text with description if reviews are sparse (so review side isn't empty)
mask_sparse_reviews = df_books_text['review_text_agg'].str.len() < 50
df_books_text.loc[mask_sparse_reviews, 'review_text_agg'] = (
    df_books_text.loc[mask_sparse_reviews, 'review_text_agg'] +
    " \n" + df_books_text.loc[mask_sparse_reviews, 'desc_text'].astype(str)
)

# Keep only rows with some text on at least description OR reviews
has_any_text = (df_books_text['desc_text'].str.strip().str.len() > 0) | (df_books_text['review_text_agg'].str.strip().str.len() > 0)
df_books_text = df_books_text[has_any_text].reset_index(drop=True)

# Helpers and info
is_indie_mask = df_books_text['is_indie'] == True
print(f"Prepared text for {len(df_books_text)} books; indie count in this set: {int(is_indie_mask.sum())}")

Prepared text for 141755 books; indie count in this set: 17667


In [9]:
import joblib
import json
from pathlib import Path

OUT_DIR = Path("models")
OUT_DIR.mkdir(exist_ok=True)

# Try to load precomputed reduced matrices and pipeline if available.
loaded = False
desc_reduced_path = OUT_DIR / "X_desc_reduced.npy"
rev_reduced_path = OUT_DIR / "X_review_reduced.npy"
npz_path = OUT_DIR / "reduced_matrices.npz"
tfidf_desc_path = OUT_DIR / "tfidf_desc.joblib"
svd_desc_path = OUT_DIR / "svd_desc.joblib"
tfidf_rev_path = OUT_DIR / "tfidf_rev.joblib"
svd_rev_path = OUT_DIR / "svd_rev.joblib"
title_map_path = OUT_DIR / "title_norm_to_row.json"
df_parquet_path = OUT_DIR / "df_books_text.parquet"

if npz_path.exists():
    print("Loading reduced matrices from compressed npz:", npz_path)
    with np.load(npz_path) as data:
        X_desc_reduced = data["X_desc"]
        X_review_reduced = data["X_review"]
    loaded = True
elif desc_reduced_path.exists() and rev_reduced_path.exists():
    print("Loading reduced matrices from .npy files")
    X_desc_reduced = np.load(desc_reduced_path)
    X_review_reduced = np.load(rev_reduced_path)
    loaded = True

if loaded:
    # attempt to load vectorizers + SVDs if present (optional but recommended)
    try:
        if tfidf_desc_path.exists(): tfidf_desc = joblib.load(tfidf_desc_path)
        else: tfidf_desc = None
        if svd_desc_path.exists(): svd_desc = joblib.load(svd_desc_path)
        else: svd_desc = None
        if tfidf_rev_path.exists(): tfidf_rev = joblib.load(tfidf_rev_path)
        else: tfidf_rev = None
        if svd_rev_path.exists(): svd_rev = joblib.load(svd_rev_path)
        else: svd_rev = None
    except Exception as e:
        print("Warning: failed to load some pipeline objects:", e)
        tfidf_desc = tfidf_desc if 'tfidf_desc' in globals() else None
        svd_desc = svd_desc if 'svd_desc' in globals() else None
        tfidf_rev = tfidf_rev if 'tfidf_rev' in globals() else None
        svd_rev = svd_rev if 'svd_rev' in globals() else None

    # load supporting assets if available
    if title_map_path.exists():
        with open(title_map_path, "r") as f:
            title_norm_to_row = json.load(f)
    if df_parquet_path.exists():
        try:
            import pandas as pd
            df_books_text = pd.read_parquet(df_parquet_path)
        except Exception as e:
            print("Warning: failed to load df_books_text.parquet:", e)

    print(f"SVD(desc) shape: {X_desc_reduced.shape}")
    print(f"SVD(rev) shape: {X_review_reduced.shape}")

Loading reduced matrices from .npy files
SVD(desc) shape: (141755, 300)
SVD(rev) shape: (141755, 300)


In [10]:
# Build TF-IDF matrices for description and reviews, then enforce SVD reduction
if not SKLEARN_AVAILABLE:
    raise RuntimeError("scikit-learn is required to build TF-IDF. Please install scikit-learn.")

# TF-IDF on description (heavier weight later)
tfidf_desc = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_df=0.8,
    min_df=5,
    ngram_range=(1,1),
    max_features=50000,
    dtype=np.float32,
)
X_desc = tfidf_desc.fit_transform(df_books_text['desc_text'])
print(f"TF-IDF(desc) shape: {X_desc.shape}")

# TF-IDF on reviews (supportive feature)
tfidf_rev = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_df=0.8,
    min_df=5,
    ngram_range=(1,1),
    max_features=30000,
    dtype=np.float32,
)
X_rev = tfidf_rev.fit_transform(df_books_text['review_text_agg'])
print(f"TF-IDF(rev) shape: {X_rev.shape}")

# Mapping: normalized title -> row index for robust lookups
title_norm_to_row = {t: i for i, t in enumerate(df_books_text['Title_norm'])}

# Enforce SVD dimensionality reduction before recommendations
from sklearn.decomposition import TruncatedSVD
USE_SVD = True
SVD_COMPONENTS = 300

# SVD reduce description
svd_desc = TruncatedSVD(n_components=max(2, min(SVD_COMPONENTS, X_desc.shape[1]-1)), random_state=42)
X_desc_reduced = svd_desc.fit_transform(X_desc)
X_desc_reduced = X_desc_reduced / (np.linalg.norm(X_desc_reduced, axis=1, keepdims=True) + 1e-12)
print(f"SVD(desc) shape: {X_desc_reduced.shape}")

# SVD reduce reviews
svd_rev = TruncatedSVD(n_components=max(2, min(SVD_COMPONENTS, X_rev.shape[1]-1)), random_state=42)
X_review_reduced = svd_rev.fit_transform(X_rev)
X_review_reduced = X_review_reduced / (np.linalg.norm(X_review_reduced, axis=1, keepdims=True) + 1e-12)
print(f"SVD(rev) shape: {X_review_reduced.shape}")

TF-IDF(desc) shape: (141755, 50000)
TF-IDF(rev) shape: (141755, 30000)
SVD(desc) shape: (141755, 300)
SVD(rev) shape: (141755, 300)


In [11]:
# Define recommender with genre constraint and indie boost
GENRE_WEIGHT = 0.6   # genre/description importance dominates
REVIEW_WEIGHT = 0.4  # reviews supportive
INDIE_BOOST = 0.1   # additive boost for indie titles

# Helper: normalize genre tokens
def _norm_genre_list(x):
    if isinstance(x, str):
        s = x.lower()
        parts = re.split(r"[;,]", s)
        return {p.strip() for p in parts if p.strip()}
    if isinstance(x, (list, tuple, set)):
        return {str(p).strip().lower() for p in x if str(p).strip()}
    return set()

def recommend_indie_books(favorite_titles: List[str], top_k: int = 10, exclude_favorites: bool = True) -> pd.DataFrame:
    """
    Recommend books prioritizing genre/description similarity, then reviews; enforce genre match; boost indie.
    If no overlapping genre exists with favorites, fall back to closest overall by combined score.
    """
    # Ensure reductions exist
    for var in ['X_desc_reduced', 'X_review_reduced']:
        if var not in globals() or globals()[var] is None:
            raise RuntimeError("SVD reduction required. Run the TF-IDF + SVD cell first.")

    norm_favs = [_normalize_title(t) for t in favorite_titles]
    indices = [title_norm_to_row.get(t) for t in norm_favs]
    fav_idx = [i for i in indices if i is not None]
    if not fav_idx:
        print("No favorite titles found in the corpus. Check spelling or availability.")
        return pd.DataFrame(columns=['Title','similarity'])

    # Build preference vectors (mean of favorites) and normalize
    pref_desc = X_desc_reduced[fav_idx].mean(axis=0, keepdims=True)
    pref_desc = pref_desc / (np.linalg.norm(pref_desc, axis=1, keepdims=True) + 1e-12)

    pref_rev = X_review_reduced[fav_idx].mean(axis=0, keepdims=True)
    pref_rev = pref_rev / (np.linalg.norm(pref_rev, axis=1, keepdims=True) + 1e-12)

    # Compute cosine similarities via dot
    sims_desc = (pref_desc @ X_desc_reduced.T).ravel()
    sims_rev = (pref_rev @ X_review_reduced.T).ravel()

    # Enforce genre match: only keep books sharing at least one genre term with ANY favorite
    fav_genres_sets = []
    for i in fav_idx:
        fav_genres_sets.append(_norm_genre_list(df_books_text.loc[i, 'categories'] if 'categories' in df_books_text.columns else ''))
    fav_genres_union = set().union(*fav_genres_sets) if fav_genres_sets else set()

    book_genres = df_books_text['categories'].fillna("").apply(_norm_genre_list)
    genre_match_mask = book_genres.apply(lambda g: len(g.intersection(fav_genres_union)) > 0)

    # Combine scores with weights and indie boost
    combined = GENRE_WEIGHT * sims_desc + REVIEW_WEIGHT * sims_rev
    indie_mask = (df_books_text['is_indie'] == True)
    combined = combined + INDIE_BOOST * indie_mask.values.astype(float)

    # Prepare results
    result = df_books_text.copy()
    result['similarity'] = combined

    # Align genre mask to current result and apply; if none matched, fall back to overall similarity
    aligned_mask = genre_match_mask.reindex(result.index, fill_value=False)
    if aligned_mask.any():
        result = result[aligned_mask.to_numpy()]
    else:
        print("No overlapping genres found; falling back to closest overall matches by score.")

    if exclude_favorites:
        result = result[~result['Title_norm'].isin(norm_favs)]

    # Sort by similarity and return top_k
    cols = ['Title','main_author','avg_rating','is_indie','genre','categories','similarity','previewLink','infoLink']
    existing_cols = [c for c in cols if c in result.columns]
    return result.sort_values(by='similarity', ascending=False)[existing_cols].head(top_k)

In [12]:
# Example usage: ensure genre match and indie boost are applied
example_favorites = [
    df_books_text['Title'].iloc[0] if len(df_books_text) > 0 else 'Unknown',
]
print('Favorites used for demo:', example_favorites)
try:
    out = recommend_indie_books(example_favorites, top_k=10)
    display(out)
    print("Indie in results:", int((out['is_indie'] == True).sum()))
except Exception as e:
    print("Recommendation failed:", e)

Favorites used for demo: ['Dr. Seuss: American Icon']


Unnamed: 0,Title,main_author,avg_rating,is_indie,genre,categories,similarity,previewLink,infoLink
99427,Dr Frau,Grace H Kaiser,4.333333,True,Biography & Autobiography,['Biography & Autobiography'],0.391327,http://books.google.com/books?id=5gnVw-m09VAC&...,http://books.google.com/books?id=5gnVw-m09VAC&...
11051,American Thunder : The Garth Brooks Story,Jo Sgammato,4.533333,True,Biography & Autobiography,['Biography & Autobiography'],0.386587,http://books.google.nl/books?id=czUr32l5PDsC&q...,http://books.google.nl/books?id=czUr32l5PDsC&d...
102558,The Fame of a Dead Man's Deeds: An Up-Close Po...,Robert S Griffin,4.769231,True,Biography & Autobiography,['Biography & Autobiography'],0.38636,http://books.google.com/books?id=9Se8wAEACAAJ&...,http://books.google.com/books?id=9Se8wAEACAAJ&...
48241,The Three Roosevelts: Patrician Leaders Who Tr...,James Macgregor Burns,4.0,True,Biography & Autobiography,['Biography & Autobiography'],0.384322,http://books.google.com/books?id=MxOUYEy6iykC&...,https://play.google.com/store/books/details?id...
49308,Satchmo - My Life in New Orleans,Louis Armstrong,4.857143,True,Biography & Autobiography,['Biography & Autobiography'],0.38147,http://books.google.com/books?id=S7TSDQAAQBAJ&...,http://books.google.com/books?id=S7TSDQAAQBAJ&...
128986,Satchmo: My life in new Orleans,Louis Armstrong,4.857143,True,Biography & Autobiography,['Biography & Autobiography'],0.38147,http://books.google.com/books?id=S7TSDQAAQBAJ&...,http://books.google.com/books?id=S7TSDQAAQBAJ&...
86758,America's political dynasties,Stephen Hess,5.0,True,Biography & Autobiography,['Biography & Autobiography'],0.369352,http://books.google.nl/books?id=_fV5rgEACAAJ&d...,http://books.google.nl/books?id=_fV5rgEACAAJ&d...
114421,Remembering Charles Kuralt,Ralph Grizzle,4.666667,True,Biography & Autobiography,['Biography & Autobiography'],0.368745,http://books.google.com/books?id=NP0aAQAAIAAJ&...,http://books.google.com/books?id=NP0aAQAAIAAJ&...
136899,Long March to Freedom: Tom Hargrove's Own Stor...,Thomas R Hargrove,4.0,True,Biography & Autobiography,['Biography & Autobiography'],0.368412,http://books.google.com/books?id=gF7RAAAACAAJ&...,http://books.google.com/books?id=gF7RAAAACAAJ&...
130292,World within world: The autobiography of Steph...,Stephen Spender,4.8,True,Biography & Autobiography,['Biography & Autobiography'],0.367292,http://books.google.com/books?id=wLqLtwEACAAJ&...,http://books.google.com/books?id=wLqLtwEACAAJ&...


Indie in results: 10


In [13]:
OUT_DIR.mkdir(exist_ok=True)

np.save(OUT_DIR / "X_desc_reduced.npy", X_desc_reduced)
np.save(OUT_DIR / "X_review_reduced.npy", X_review_reduced)
joblib.dump(tfidf_desc, OUT_DIR / "tfidf_desc.joblib")
joblib.dump(svd_desc, OUT_DIR / "svd_desc.joblib")
joblib.dump(tfidf_rev, OUT_DIR / "tfidf_rev.joblib")
joblib.dump(svd_rev, OUT_DIR / "svd_rev.joblib")
df_books_text.to_parquet(OUT_DIR / "df_books_text.parquet",
index=False)

title_norm_to_row = {t: i for i, t in
enumerate(df_books_text["Title_norm"])}
with open(OUT_DIR / "title_norm_to_row.json", "w") as f:
  json.dump({k: int(v) for k, v in title_norm_to_row.items()},
f)

OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device