In [1]:
from pathlib import Path
import time, random, re, requests, pandas as pd

PROJECT_DIR = Path.cwd()  
print("Project dir:", PROJECT_DIR)


GUTENBERG_CSV = PROJECT_DIR / "gutenberg_books.csv"
OPENLIB_CSV   = PROJECT_DIR / "openlibrary_books.csv"
MERGED_CSV    = PROJECT_DIR / "books_merged.csv"

pd.set_option("display.max_colwidth", 160)


Project dir: C:\Users\Lenovo\Documents\Cursos\DataAnalysis-IronHack\Week10\Day5\Book_recommender_project


In [2]:
from bs4 import BeautifulSoup

BASE = "https://www.gutenberg.org"
CATEGORIES_URL = f"{BASE}/ebooks/categories"
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; book-recs-class/1.0)"}

def get_category_map():
    """Return [{'main': 'Literature', 'sub': 'Adventure', 'url': 'https://.../bookshelf/###'}, ...]"""
    r = requests.get(CATEGORIES_URL, headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    cat_map = []
    for h2 in soup.select("h2"):
        main = h2.get_text(strip=True)
        ul = h2.find_next_sibling("ul")
        if not ul:
            continue
        for a in ul.select("a[href^='/ebooks/bookshelf/']"):
            sub = a.get_text(strip=True)
            href = a.get("href")
            if href:
                cat_map.append({"main": main, "sub": sub, "url": BASE + href})
    return cat_map

def iter_books_from_bookshelf(bookshelf_url, max_pages=5, pause=(0.6, 1.0)):
    """
    Yield {'title','author','book_url'} from a bookshelf, paginating by 25.
    Uses .booklink when present, else falls back to simple <li><a> lists.
    """
    start_index = 1
    pages_left = max_pages
    while pages_left > 0:
        url = f"{bookshelf_url}?sort_order=downloads&start_index={start_index}"
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        rows = []

        blocks = soup.select(".booklink")
        if blocks:
            for blk in blocks:
                a = blk.find("a", href=True)
                if not a: continue
                href = a["href"]
                title_el = blk.select_one(".title")
                author_el = blk.select_one(".subtitle")
                title = title_el.get_text(strip=True) if title_el else None
                author = author_el.get_text(strip=True) if author_el else None
                rows.append({"href": href, "title": title, "author": author})
        else:
            anchors = soup.select("li a[href^='/ebooks/']")
            for a in anchors:
                href = a.get("href", "")
                if not href: continue
                # keep only numeric /ebooks/{id}
                if not href.rstrip("/").split("/")[-1].isdigit():
                    continue
                text = a.get_text(" ", strip=True)
                title, author = text, None
                for sep in [" — ", " by "]:
                    if sep in text:
                        left, right = text.split(sep, 1)
                        title, author = left.strip(), right.strip()
                        break
                rows.append({"href": href, "title": title, "author": author})

        if not rows:
            break

        for b in rows:
            yield {
                "title": b["title"],
                "author": b["author"],
                "book_url": BASE + b["href"]
            }

        start_index += 25
        pages_left -= 1
        time.sleep(random.uniform(*pause))


In [3]:
random.seed(2025)

cat_map = get_category_map()
cats_shuffled = pd.DataFrame(cat_map).sample(frac=1.0, random_state=2025).to_dict(orient="records")

target = 650            
per_subcat_max = 20     
max_pages_per_subcat = 3

rows = []
seen_urls = set()

for cat in cats_shuffled:
    taken = 0
    try:
        for item in iter_books_from_bookshelf(cat["url"], max_pages=max_pages_per_subcat, pause=(0.7, 1.2)):
            u = item["book_url"]
            if u in seen_urls:
                continue
            rows.append({
                "main_category": cat["main"],
                "subcategory":  cat["sub"],
                "title":        item["title"],
                "author":       item["author"],
                "url":          u,
            })
            seen_urls.add(u)
            taken += 1
            if taken >= per_subcat_max or len(rows) >= target:
                break
    except Exception as e:
        print(f"Skipping {cat['main']} → {cat['sub']} due to error: {e}")
    if len(rows) >= target:
        break

gutenberg_df = pd.DataFrame(rows).drop_duplicates(subset=["url"]).reset_index(drop=True)


for c in ["title", "author", "main_category", "subcategory"]:
    if c in gutenberg_df:
        gutenberg_df[c] = (
            gutenberg_df[c].astype(str)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
            .replace({"None": None})
        )

print("Gutenberg unique:", len(gutenberg_df))
display(gutenberg_df.head(10))

# ensure ≥500
assert len(gutenberg_df) >= 500, "Gutenberg collection < 500. Increase target/max_pages/per_subcat_max."

gutenberg_df.to_csv(GUTENBERG_CSV, index=False)
print("Saved:", GUTENBERG_CSV)


Gutenberg unique: 650


Unnamed: 0,main_category,subcategory,title,author,url
0,Education & Reference,Teaching & Education,Self Knowledge and Guide to Sex Instruction: Vital Facts of Life for All Ages,Thomas W. Shannon,https://www.gutenberg.org/ebooks/47947
1,Education & Reference,Teaching & Education,Sex-education,Maurice A. Bigelow,https://www.gutenberg.org/ebooks/31352
2,Education & Reference,Teaching & Education,Confidential Chats with Girls,William Lee Howard,https://www.gutenberg.org/ebooks/57551
3,Education & Reference,Teaching & Education,Helps to Latin Translation at Sight,Edmund Luce,https://www.gutenberg.org/ebooks/28890
4,Education & Reference,Teaching & Education,The Sexual Life of the Child,Albert Moll,https://www.gutenberg.org/ebooks/28402
5,Education & Reference,Teaching & Education,"Sex in Education; or, A Fair Chance for Girls",Edward H. Clarke,https://www.gutenberg.org/ebooks/18504
6,Education & Reference,Teaching & Education,Heath's Modern Language Series: The Spanish American Reader (Spanish),Ernesto Nelson,https://www.gutenberg.org/ebooks/39647
7,Education & Reference,Teaching & Education,The reader's guide to the Encyclopaedia Britannica : A handbook containing sixty-six courses of systematic study or occasional reading,Inc. Encyclopaedia Britannica,https://www.gutenberg.org/ebooks/74039
8,Education & Reference,Teaching & Education,1001 задача для умственного счета (Russian),Sergei Aleksandrovich Rachinskii,https://www.gutenberg.org/ebooks/16527
9,Education & Reference,Teaching & Education,How We Think,John Dewey,https://www.gutenberg.org/ebooks/37423


Saved: C:\Users\Lenovo\Documents\Cursos\DataAnalysis-IronHack\Week10\Day5\Book_recommender_project\gutenberg_books.csv


In [4]:
OL_BASE  = "https://openlibrary.org"
OL_HEADERS = {"User-Agent": "BookRecsClass/1.0 (+educational use)"}

def subject_slug(s: str) -> str:
    s = (s or "").strip().lower()
    s = re.sub(r"[^a-z0-9]+", "-", s)
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s

def parse_subject_work(w: dict):
    title = w.get("title")
    authors = ", ".join([a.get("name","") for a in (w.get("authors") or []) if isinstance(a, dict)]) or None
    key = w.get("key")  # "/works/OLxxxW"
    url = f"{OL_BASE}{key}" if key else None
    return title, authors, url

def parse_search_doc(d: dict):
    title = d.get("title")
    authors = ", ".join(d.get("author_name") or []) or None
    key = d.get("key")
    if not key:
        for s in d.get("seed") or []:
            if isinstance(s, str) and s.startswith("/works/"):
                key = s
                break
    url = f"{OL_BASE}{key}" if key else None
    return title, authors, url

def request_json(url, params=None, pause=(0.5, 1.0)):
    r = requests.get(url, params=params or {}, headers=OL_HEADERS, timeout=30)
    r.raise_for_status()
    time.sleep(random.uniform(*pause))
    return r.json()

def fetch_by_subject_slug(slug: str, limit=50, offset=0):
    url = f"{OL_BASE}/subjects/{slug}.json"
    try:
        data = request_json(url, params={"limit": limit, "offset": offset})
    except Exception:
        return []
    out = []
    for w in data.get("works", []):
        t, a, u = parse_subject_work(w)
        if t and u:
            out.append((t, a, u))
    return out

def fetch_by_search_subject(text: str, limit=50, page=1):
    url = f"{OL_BASE}/search.json"
    try:
        data = request_json(url, params={"subject": text, "limit": limit, "page": page})
    except Exception:
        return []
    out = []
    for d in data.get("docs", []):
        t, a, u = parse_search_doc(d)
        if t and u:
            out.append((t, a, u))
    return out

def fetch_by_search_q(text: str, limit=50, page=1):
    url = f"{OL_BASE}/search.json"
    try:
        data = request_json(url, params={"q": text, "limit": limit, "page": page})
    except Exception:
        return []
    out = []
    for d in data.get("docs", []):
        t, a, u = parse_search_doc(d)
        if t and u:
            out.append((t, a, u))
    return out

In [5]:

g_df = pd.read_csv(GUTENBERG_CSV)

cat_pairs = (
    g_df[["main_category", "subcategory"]]
    .dropna()
    .drop_duplicates()
    .sample(frac=1.0, random_state=42) 
    .to_dict(orient="records")
)

ol_rows = []
seen = set()
target_ol = 650         
per_pair_max = 18       

for pair in cat_pairs:
    main_cat = pair["main_category"]
    sub_cat  = pair["subcategory"]
    slug     = subject_slug(sub_cat)

    taken = 0

    
    batch = fetch_by_subject_slug(slug, limit=min(per_pair_max, 50), offset=0)
    for t, a, u in batch:
        if u in seen: continue
        ol_rows.append({"main_category": main_cat, "subcategory": sub_cat, "title": t, "author": a, "url": u})
        seen.add(u); taken += 1
        if taken >= per_pair_max or len(ol_rows) >= target_ol:
            break

    
    if taken < per_pair_max and len(ol_rows) < target_ol:
        need = per_pair_max - taken
        batch = fetch_by_search_subject(sub_cat, limit=min(need, 50), page=1)
        for t, a, u in batch:
            if u in seen: continue
            ol_rows.append({"main_category": main_cat, "subcategory": sub_cat, "title": t, "author": a, "url": u})
            seen.add(u); taken += 1
            if taken >= per_pair_max or len(ol_rows) >= target_ol:
                break

   
    if taken < per_pair_max and len(ol_rows) < target_ol:
        need = per_pair_max - taken
        batch = fetch_by_search_q(sub_cat, limit=min(need, 50), page=1)
        for t, a, u in batch:
            if u in seen: continue
            ol_rows.append({"main_category": main_cat, "subcategory": sub_cat, "title": t, "author": a, "url": u})
            seen.add(u); taken += 1
            if taken >= per_pair_max or len(ol_rows) >= target_ol:
                break

    if len(ol_rows) >= target_ol:
        break

ol_df = pd.DataFrame(ol_rows).drop_duplicates(subset=["url"]).reset_index(drop=True)


for c in ["title", "author", "main_category", "subcategory"]:
    if c in ol_df:
        ol_df[c] = (
            ol_df[c].astype(str)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
            .replace({"None": None})
        )

print("Open Library unique:", len(ol_df))
display(ol_df.head(10))

assert len(ol_df) >= 500, "Open Library collection < 500. Increase target_ol/per_pair_max."

ol_df.to_csv(OPENLIB_CSV, index=False)
print("Saved:", OPENLIB_CSV)

Open Library unique: 559


Unnamed: 0,main_category,subcategory,title,author,url
0,Lifestyle & Hobbies,Sports/Hobbies,The American Girl's Handy Book,"Lina Beard, Adelia Beard",https://openlibrary.org/works/OL1794792W
1,Lifestyle & Hobbies,Sports/Hobbies,Super Mario,Jeff Ryan,https://openlibrary.org/works/OL16116640W
2,Lifestyle & Hobbies,Sports/Hobbies,The Book of Wizard Craft,"Janice Eaton Kilby, Deborah Morgenthal, Terry Taylor",https://openlibrary.org/works/OL5962699W
3,Lifestyle & Hobbies,Sports/Hobbies,"Fox Tossing, Octopus Wrestling and Other Forgotten Sports",Edward Brooke-Hitching,https://openlibrary.org/works/OL24143375W
4,Lifestyle & Hobbies,Sports/Hobbies,Let's play Pokémon : trading card game : the ultimate manual for parents & kids,"Wizards of the Coast, Brown",https://openlibrary.org/works/OL18456863W
5,Lifestyle & Hobbies,Sports/Hobbies,The arts of sport and recreation,Derek Stanford,https://openlibrary.org/works/OL4902450W
6,Lifestyle & Hobbies,Sports/Hobbies,Qualitative Research in Gambling,Rebecca Cassidy,https://openlibrary.org/works/OL17575819W
7,Lifestyle & Hobbies,Sports/Hobbies,Sportscard counterfeit detector,"Robert F. Lemke, Bob Lemke, Sally Grace",https://openlibrary.org/works/OL1719902W
8,Lifestyle & Hobbies,Sports/Hobbies,Unbored,"Elizabeth Foy Larsen, Joshua Glenn",https://openlibrary.org/works/OL16617811W
9,Lifestyle & Hobbies,Sports/Hobbies,The big book of plants,"John Stringer, John Stringer",https://openlibrary.org/works/OL12073391W


Saved: C:\Users\Lenovo\Documents\Cursos\DataAnalysis-IronHack\Week10\Day5\Book_recommender_project\openlibrary_books.csv


In [6]:
g = pd.read_csv(GUTENBERG_CSV)
o = pd.read_csv(OPENLIB_CSV)

g["source"] = "gutenberg"
o["source"] = "openlibrary"

cols = ["main_category","subcategory","title","author","url","source"]
g = g[cols]
o = o[cols]

merged = pd.concat([g, o], ignore_index=True)


merged = merged.drop_duplicates(subset=["url"]).reset_index(drop=True)


def norm(s):
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s

merged["title_norm"]  = merged["title"].map(norm)
merged["author_norm"] = merged["author"].map(norm)


merged = merged.sort_values(["title_norm","author_norm","source"])
merged = merged.drop_duplicates(subset=["title_norm","author_norm"], keep="first").reset_index(drop=True)
merged = merged.drop(columns=["title_norm","author_norm"])

print("Merged count:", len(merged))
display(merged.sample(10, random_state=7))

merged.to_csv(MERGED_CSV, index=False)
print("Saved:", MERGED_CSV)

Merged count: 1171


Unnamed: 0,main_category,subcategory,title,author,url,source
893,Social Sciences & Society,Politics,The intelligent woman's guide to socialism and capitalism,Bernard Shaw,https://www.gutenberg.org/ebooks/75859,gutenberg
841,History,History - Royalty,The French Revolution: A History,Thomas Carlyle,https://www.gutenberg.org/ebooks/1301,gutenberg
128,Health & Medicine,Drugs/Alcohol/Pharmacology,"Beer, its history and its economic value as a national beverage",F. W. Salem,https://www.gutenberg.org/ebooks/76262,gutenberg
513,Literature,American Literature,"Moby Dick; Or, The Whale",Herman Melville,https://www.gutenberg.org/ebooks/2701,gutenberg
595,History,History - Royalty,Palace Papers,Tina Brown,https://openlibrary.org/works/OL26355405W,openlibrary
336,Arts & Culture,Architecture,Frank Lloyd Wright,Frank Lloyd Wright,https://openlibrary.org/works/OL961860W,openlibrary
896,Education & Reference,Journals,"The International Monthly, Volume 3, No. 1, April, 1851",Various,https://www.gutenberg.org/ebooks/25325,gutenberg
355,Social Sciences & Society,Gender & Sexuality Studies,Greek Homosexuality,Kenneth J. Dover,https://openlibrary.org/works/OL2623339W,openlibrary
168,History,History - Royalty,宋史 (Chinese),Tuotuo,https://www.gutenberg.org/ebooks/24183,gutenberg
652,Literature,British Literature,"Right Ho, Jeeves",P. G. Wodehouse,https://www.gutenberg.org/ebooks/10554,gutenberg


Saved: C:\Users\Lenovo\Documents\Cursos\DataAnalysis-IronHack\Week10\Day5\Book_recommender_project\books_merged.csv


In [7]:
print("By source:")
display(merged["source"].value_counts())

print("Top categories:")
display(merged.groupby(["main_category","subcategory"]).size().sort_values(ascending=False).head(15))

By source:


source
gutenberg      646
openlibrary    525
Name: count, dtype: int64

Top categories:


main_category              subcategory                     
Arts & Culture             Fashion                             38
Social Sciences & Society  Old Age & the Elderly               38
Education & Reference      Journals                            38
                           Reports & Conference Proceedings    38
Social Sciences & Society  Law & Criminology                   38
                           Gender & Sexuality Studies          38
Health & Medicine          Nutrition                           38
History                    Archaeology & Anthropology          38
Lifestyle & Hobbies        Sports/Hobbies                      38
Arts & Culture             Architecture                        37
History                    History - Royalty                   37
Science & Technology       Engineering & Technology            37
Literature                 Novels                              37
                           Crime, Thrillers & Mystery          37
Social Sciences 

In [10]:
import streamlit as st

In [None]:
!streamlit run app.py