# Step 1: Scrape Books Using Open Library API
- get by 'Subject' name ->> can be anything, e.g. "fantasy" etc.



In [17]:
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
import numpy as np
import concurrent.futures
import csv
from tqdm import tqdm

In [2]:
def get_books_by_subject(subject, limit=100, details=True, ebooks=False, published_in=None, offset=0):
    '''
    Args:
    details: if True, includes related subjects, prolific authors, and publishers.
    ebooks: if True,  filters for books with e-books.
    published_in: filters by publication year.
                  For example:
                  http://openlibrary.org/subjects/love.json?published_in=1500-1600
    limit: num of works to include in the response, controls pagination.
    offset: starting offset in the total works, controls pagination.
    '''
    # Creates the API endpoint URL using the subject provided.
    base_url = (f'https://openlibrary.org/subjects/{subject}.json?limit=1')


    # Sends an HTTP GET request to Open Library's API with the query parameters
    # stored in params.
    # The response is stored in response, which contains JSON data.
    response = requests.get(base_url)#, params=params)

    if response.status_code != 200:
        print(f"Error fetching books for {subject}")
        return []

    data = response.json()
    books = data.get("works", [])

    if not books:
        print(f"No books found for {subject}")
        return []

    book_list = []
    for book in books:
        title = book.get("title", "Unknown Title")
        author = book["authors"][0]["name"] if book.get("authors") else "Unknown Author"
        published_year = book.get("first_publish_year", "Unknown Year")

        # Other details we may need for
        subjects = ", ".join(book.get("subject", ["No subjects available"]))
        description = book.get("description", "No description available")
        ebook_available = book.get("ebook_count_i", 0) > 0
        publishers = ", ".join(book.get("publishers", ["Unknown Publisher"]))

        book_list.append({
            "title": title,
            "author": author,
            "published_year": published_year,
            "subjects": subjects,
            "description": description,
            "ebook_available": ebook_available,
            "publishers": publishers
        })

    return book_list  # Return the list of books


# Step 2: Combine into one genre
For instance, "sci-fi" subject and "science-fiction" subject returns different results. So, our next objective is to combine all of them into one genre "Science Fiction". The same goes for other genres like "Romance" or "Fantasy".


### Function to combine sub-genres into a big genre:

In [3]:
def combine_genre(subject):
    """
    Args:
    subject: book subject

    This function collects book lists under sub-genres and combines them into
    one main genre.

    Returns:
    List of all books under a specific genre.
    """

    if subject is None:
        raise ValueError("Please pass a subject name.")

    # Dictionary of genres and their corresponding lists with adjusted formatting
    genre_dict = {
        "romance": [
            "fiction_romance_general", "fiction_romance_historical_general",
            "romance", "man_woman_relationships", "fiction_romance_suspense",
            "fiction_romance_contemporary",
            "fiction_romance_erotica", "fiction_romance_erotic",
            "marriage_fiction", "fiction_erotica_general", "romance",
            "fiction_christian_romance_general", "fiction_romance_historical"
        ],
        "fantasy": [
            "fiction", "fantasy_fiction", "magic", "fiction_fantasy_general",
            "adventure_and_adventurers_fiction",
            "adventure_and_adventurers", "good_and_evil", "fairies", "dragons",
            "cartoons_and_comics", "witchcraft", "history", "wizards", "fairies_fiction"
        ],
        "historical_fiction": [
            "fiction", "historical_fiction", "history", "fiction_historical_general",
            "fiction_romance_historical_general", "fiction_historical", "fiction_general",
            "fiction_romance_historical", "world_war_1939_1945", "great_britain_fiction"
        ],
        "horror": [
            "fiction", "horror", "horror_stories", "horror_tales", "american_horror_tales",
            "horror_fiction", "detective_and_mystery_stories", "crime", "catalepsy", "murder",
            "burial_vaults"
        ],
        "humor": [
            "anecdotes", "humor_general", "american_wit_and_humor",
            "wit_and_humor", "humour", "humor", "funny"
        ],
        "literature": [
            "philosophy", "in_literature", "theory", "criticism", "criticism_and_interpretation",
            "english_literature", "modern_literature", "american_literature",
            "literature", "litterature"
        ],
        "mystery_thriller": [
            "detective_and_mystery_stories", "mystery_fiction", "murder", "mystery",
            "thriller", "detective", "fiction_thrillers_general",
            "suspense", "fiction_thrillers_suspense", "fiction_suspense",
            "mystery", "thriller", "murder",
            "fiction_thrillers_espionage", "police",
            "suspense_fiction", "fiction_general", "detective_and_mystery_stories",
            "crimes_against", "fiction_psychological", "investigation"
        ],
        "science_fiction": [
            "science_fiction", "fiction_science_fiction_general", "american_science_fiction",
            "extraterrestrial_beings", "life_on_other_planets", "extraterrestrial_beings_fiction",
            "time_travel", "sci_fi", "sci-fi", "science-fiction"
        ]
    }

    if subject not in genre_dict:
        raise ValueError("Invalid genre. Please choose from the predefined genres: \
        romance, fantasy, historical_fiction, horror, humor, literature, \
        mystery_thriller, science_fiction.")

    books_under_genre = []
    seen_books = set()  # To store unique books
    i = 1
    print(f"\nBooks under the genre '{subject}':\n")

    for sub_genre in genre_dict[subject]:
        books = get_books_by_subject(sub_genre)  # Get books for the sub-genre

        if books:
            for book in books:
                # Extract the book title and author for uniqueness check)
                title_author = book['author']
                if title_author is None:
                  print("no author")

                # Ensure no dupicates
                if title_author not in seen_books:
                    print(f"{i}. {book['title']} by {book['author']}")
                    books_under_genre.append(book)
                    seen_books.add(title_author)
                    i += 1

        else:
            print(f"No books found for sub-genre '{sub_genre}'")

    return books_under_genre


### Romance books:

In [4]:
romance_books = combine_genre("romance")


Books under the genre 'romance':

1. Pride and Prejudice by Jane Austen
2. Wuthering Heights by Emily Brontë
3. Is he lying to you? by Dan Crum
4. Rebecca by Daphne du Maurier
5. Loving by Danielle Steel
6. Fifty Shades Freed by E. L. James
7. Memoirs of Fanny Hill by John Cleland
8. Decamerone by Giovanni Boccaccio
9. Far From the Madding Crowd by Thomas Hardy


### Fantasy books:

In [5]:
fantasy_books = combine_genre("fantasy")


Books under the genre 'fantasy':

1. Pride and Prejudice by Jane Austen
2. Alice's Adventures in Wonderland by Lewis Carroll
3. The Marvelous Land of Oz by L. Frank Baum
4. Five Children and It by Edith Nesbit
5. A Christmas Carol by Charles Dickens
6. Harry Potter and the Chamber of Secrets by J. K. Rowling


### Historical Fiction books:

In [6]:
combine_genre("historical_fiction")


Books under the genre 'historical_fiction':

1. Pride and Prejudice by Jane Austen
2. A Christmas Carol by Charles Dickens
3. Alice's Adventures in Wonderland by Lewis Carroll
4. The 12th SS by Meyer, Hubert


[{'title': 'Pride and Prejudice',
  'author': 'Jane Austen',
  'published_year': 1813,
  'subjects': 'Fiction, Romance, Historical, Regency, British and irish fiction (fictional works by one author), Brothers and sisters, Courtship, Drama, English fiction, English literature, Families, Family, Family life, Family relations, Fiction Classics, History, Interpersonal relations, Juvenile fiction, Literary Fiction, love stories, manners, Manners and customs, marriage, Sisters, Social classes, Social life and customs, Upper class, wealth, Women in England, young ladies, Young women, entail, Young women, fiction, Sisters, fiction, Fiction, family life, general, England, fiction, Man-woman relationships, fiction, Roman anglais, Classes sociales, Romans, nouvelles, Sœurs, Amours, Jeunes femmes, Familles, Mœurs et coutumes", Clases sociales, Novela, Jóvenes (Mujeres), Cortejo amoroso, Hermanas, Cuentos de amor, Fiction, coming of age, Fiction, romance, general, Great britain, social life and cus

### Horror Books

In [7]:
combine_genre("horror")


Books under the genre 'horror':

1. Pride and Prejudice by Jane Austen
2. The Picture of Dorian Gray by Oscar Wilde
3. Frankenstein or The Modern Prometheus by Mary Shelley
4. Carrie by Stephen King
5. A Study in Scarlet by Arthur Conan Doyle
6. The Works of Edgar Allan Poe in Five Volumes by Edgar Allan Poe


[{'title': 'Pride and Prejudice',
  'author': 'Jane Austen',
  'published_year': 1813,
  'subjects': 'Fiction, Romance, Historical, Regency, British and irish fiction (fictional works by one author), Brothers and sisters, Courtship, Drama, English fiction, English literature, Families, Family, Family life, Family relations, Fiction Classics, History, Interpersonal relations, Juvenile fiction, Literary Fiction, love stories, manners, Manners and customs, marriage, Sisters, Social classes, Social life and customs, Upper class, wealth, Women in England, young ladies, Young women, entail, Young women, fiction, Sisters, fiction, Fiction, family life, general, England, fiction, Man-woman relationships, fiction, Roman anglais, Classes sociales, Romans, nouvelles, Sœurs, Amours, Jeunes femmes, Familles, Mœurs et coutumes", Clases sociales, Novela, Jóvenes (Mujeres), Cortejo amoroso, Hermanas, Cuentos de amor, Fiction, coming of age, Fiction, romance, general, Great britain, social life and cus

### Humor Books

In [8]:
combine_genre("humor")


Books under the genre 'humor':

1. The Second Jungle Book by Rudyard Kipling
2. Candide by Voltaire
3. Three Men in a Boat (to say nothing of the dog) by Jerome Klapka Jerome
4. Adventures of Huckleberry Finn by Mark Twain
5. Alice's Adventures in Wonderland by Lewis Carroll
6. The BFG by Roald Dahl


[{'title': 'The Second Jungle Book',
  'author': 'Rudyard Kipling',
  'published_year': 1887,
  'subjects': "Romans, Enfants sauvages, Jungles, Faune de la jungle, Fiction, Jungle animals, Mowgli (Fictitious character), Animals, Legends and stories of Animals, Juvenile fiction, Folklore, Anecdotes, Legends, Short stories, Childrens stories, Juvenile literature, English Adventure stories, Boys, Feral children, British and irish fiction (fictional works by one author), English literature, English fiction, Jungle, English Short stories, Children's stories, English, Classic Literature, Fiction, action & adventure, Fables, Children: Grades 4-6, Children's fiction, Jungle animals, fiction, Jungles, fiction, Mowgli (fictitious character), fiction, India, fiction, Animals, fiction, Fiction, general, Fiction, short stories (single author), Biography, Animaux, Romans, nouvelles, etc. pour la jeunesse, Romans, nouvelles",
  'description': 'No description available',
  'ebook_available': False,
  

### Literature Books

In [9]:
combine_genre("literature")


Books under the genre 'literature':

1. The Art of War by Sun Tzu
2. Bible by Bible
3. La Poetica by Aristotle
4. The Merchant of Venice by William Shakespeare
5. Alice's Adventures in Wonderland by Lewis Carroll
6. Pride and Prejudice by Jane Austen
7. Don Quixote by Miguel de Cervantes Saavedra
8. Adventures of Huckleberry Finn by Mark Twain
9. Literacy for the 21st Century by Gail E. Tompkins


[{'title': 'The Art of War',
  'author': 'Sun Tzu',
  'published_year': 1900,
  'subjects': "Open Library Staff Picks, Early works to 1800, Military art and science, great_books_of_the_western_world, Business, Self-Improvement, Philosophy, open_syllabus_project, Fiction, Nonfiction, Politics, Military, Classic Literature, History, Military art and science, early works to 1800, Strategy, Management, Military art and science, history, Leadership, War, Success in business, Industrial management, Marketing, Ciencia militar, Sales promotion, Obras anteriores a 1800, Éxito en los negocios, Aptitudes de mando, Sales management, Executive ability, Tangut language, Art et science militaire, Parent and teenager, Criticism and interpretation, Competition, Parenting, Administración, Long Now Manual for Civilization, Exito en los negocios, Warfare & Defence, Military Science, Eastern, Eastern - General, BG-HISTORY - BG-WAR/MILITARY TRANSPORT, History / Military / Strategy, Military - General, Busin

### Mystery/Thriller Books

In [10]:
combine_genre("mystery_thriller")


Books under the genre 'mystery_thriller':

1. A Study in Scarlet by Arthur Conan Doyle
2. Treasure Island by Robert Louis Stevenson
3. Murder on the Orient Express by Agatha Christie
4. A Christmas Carol by Charles Dickens
5. The Thirty-Nine Steps by John Buchan
6. The Moonstone by Wilkie Collins
7. Alice's Adventures in Wonderland by Lewis Carroll
8. The Da Vinci Code by Dan Brown
9. Wuthering Heights by Emily Brontë


[{'title': 'A Study in Scarlet',
  'author': 'Arthur Conan Doyle',
  'published_year': 1887,
  'subjects': "Action & Adventure Fiction, aortic aneurysm, Battle of Maiwand, Blessing and cursing, British and irish fiction (fictional works by one author), Children's fiction, Classics, crime scenes, Detective and mystery stories, Detective Fiction, Detectives, Detectives privados, Detektive, Détectives, English Detective and mystery stories, English literature, Ficción, Fiction, hemoglobin, Investigadores privados, Krimi, Literature and fiction, mystery and suspense, Murder, Mystery and detective stories, Mystery fiction, Novela, Novela de misterio, pioneers, Private investigators, Scottish Terriers, Second Anglo-Afghan War, The Church of Jesus Christ and Latter-day Saints, Trichinopoly cigars, Sherlock Holmes, IIIT",
  'description': 'No description available',
  'ebook_available': False,
  'publishers': 'Unknown Publisher'},
 {'title': 'Treasure Island',
  'author': 'Robert Louis Stevens

### Science Fiction Books

In [11]:
combine_genre("science_fiction")


Books under the genre 'science_fiction':

1. Alice's Adventures in Wonderland by Lewis Carroll
2. Frankenstein or The Modern Prometheus by Mary Shelley
3. Fahrenheit 451 by Ray Bradbury
4. The War of the Worlds by H. G. Wells
5. The Giver by Lois Lowry
6. A Wrinkle in Time by Madeleine L'Engle


[{'title': "Alice's Adventures in Wonderland",
  'author': 'Lewis Carroll',
  'published_year': 1865,
  'subjects': "Alice (fictitious character : carroll), fiction, British and irish fiction (fictional works by one author), Fiction, fantasy, general, JUVENILE FICTION, classics, Fantasy & Magic, Imagination & Play, adventure and adventurers, adventure and adventurers, fiction, adventure stories, adventure travel, animals, anthropomorphism, artists' illustrated books, books and reading, child and youth fiction, children, children's fiction, children's literature, children's literature, english, children's stories, children's stories, english, classic literature, coloring books, croquet, cuentos infantiles ingleses, curiosidad, curiosidad en los niños, curiosity, curiosity in children, english, english adventure stories, english fantastic fiction, english fantasy fiction, english fantasy literature, english language, english literature, english nonsense verses, fairy tales, fantasy, fant

# Create table for putting books based on genre

In [3]:
def book_details(books_under_genre):
  book_data = {'ISBN': [],
               'Title': [],
               'Author': [],
               'Published_year': [],
               'Subject': []
               }


# Step 3 - Scrape User Ratings & Reviews

In [4]:
import requests
from bs4 import BeautifulSoup
import re
link = "https://openlibrary.org/subjects"
data = requests.get(link).text
response = requests.get(link)
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
def link2soup(link):
    data = requests.get(link).text
    return BeautifulSoup(data)


In [6]:
def parse_to_books(link):
    '''
    Assume we start in "https://openlibrary.org/subjects" (base link) page,
    crawls to each chosen subject link and add it to the base link.
    '''
    # assume start in "https://openlibrary.org/subjects" page
    chosen_subjects = {
        "romance", "fantasy", "historical_fiction", "horror", "humor",
        "literature", "mystery_and_detective_stories", "science_fiction"
    }

    soup = link2soup(link)

    # Extract all subjects links
    all_links = [a['href'] for a in soup.select("div#subjectsPage ul li a") if 'href' in a.attrs]
    # print(all_links[:5])

    # Filter only the chosen subjects
    filtered_links = [
        link for link in all_links
        # ensure only parsing on wanted subjects
        if any(sub in link for sub in chosen_subjects) and "juvenile_literature" not in link]
    # print(filtered_links)

    # assume we wish to click the 'fantasy' (specific subject) page
    # crawling through links
    specific_subject_urls = []
    for i in range(len(filtered_links)):
        next_button = filtered_links[i].split('/')[2]
        specific_subject_url = link + "/" + next_button
        specific_subject_urls.append(specific_subject_url)

    return(specific_subject_urls)

In [7]:
subject_links = parse_to_books(link)
subject_links

['https://openlibrary.org/subjects/fantasy',
 'https://openlibrary.org/subjects/historical_fiction',
 'https://openlibrary.org/subjects/horror',
 'https://openlibrary.org/subjects/humor',
 'https://openlibrary.org/subjects/literature',
 'https://openlibrary.org/subjects/mystery_and_detective_stories',
 'https://openlibrary.org/subjects/romance',
 'https://openlibrary.org/subjects/science_fiction']

In [8]:
# assumes we want to go to the total "works" under each subject
def total_works_page(specific_subject_urls):
    '''assumes we want to go to the total "works" under each subject

    Returns: all urls to 'total works' under each subject
    '''
    base_link = "https://openlibrary.org"
    total_works_list = []

    for url in specific_subject_urls:
        soup = link2soup(url)

        # Extract total works link
        total_works_link = [a['href'].replace(" ", "%20") for a in soup.select("a[title='See all works']") if 'href' in a.attrs]

        # Print extracted links (if any)
        print(f"Total works links for {url}: {total_works_link}")

        # Append full URLs to list:
        for link in total_works_link:
            total_works_list.append(base_link + link)

    return total_works_list

In [9]:
subject_total_work_url = total_works_page(subject_links)
subject_total_work_url

Total works links for https://openlibrary.org/subjects/fantasy: ['/search?subject=Fantasy']
Total works links for https://openlibrary.org/subjects/historical_fiction: ['/search?subject=Historical%20fiction']
Total works links for https://openlibrary.org/subjects/horror: ['/search?subject=Horror']
Total works links for https://openlibrary.org/subjects/humor: ['/search?subject=Humor']
Total works links for https://openlibrary.org/subjects/literature: ['/search?subject=Literature']
Total works links for https://openlibrary.org/subjects/mystery_and_detective_stories: ['/search?subject=Mystery%20and%20detective%20stories']
Total works links for https://openlibrary.org/subjects/romance: ['/search?subject=Romance']
Total works links for https://openlibrary.org/subjects/science_fiction: ['/search?subject=Science%20fiction']


['https://openlibrary.org/search?subject=Fantasy',
 'https://openlibrary.org/search?subject=Historical%20fiction',
 'https://openlibrary.org/search?subject=Horror',
 'https://openlibrary.org/search?subject=Humor',
 'https://openlibrary.org/search?subject=Literature',
 'https://openlibrary.org/search?subject=Mystery%20and%20detective%20stories',
 'https://openlibrary.org/search?subject=Romance',
 'https://openlibrary.org/search?subject=Science%20fiction']

In [10]:
def get_book_hrefs(url):
    soup = link2soup(url)
    endof_links = [a['href'] for a in soup.select("div#searchResults li.searchResultItem.sri--w-main a.results")
                    if 'href' in a.attrs]
    return endof_links

In [11]:
# Extracts the link to the next pages of "total works" under a specific subject
def every_page(subject_total_work_url):
    '''
    Assume we are in the first page of the 'total works' under each subject.
    Parse pages from the first page to the last under each subject.

    Return: URL of pages from 1 - n of 'total works' under each subject
    '''
    base_url = 'https://openlibrary.org'
    page_links = []

    for subject_url in subject_total_work_url:
        soup = link2soup(subject_url)

        # Find the last page number
        last_page = soup.select_one('a.ChoosePage[data-ol-link-track="Pager|LastPage"]')
        if last_page:
            last_page_number = int(last_page.get_text())
            # last_page_number = 3 for now for debugging
        else:
            last_page_number = 1  # If there's only one page

        # Generate all page links from 1 to last_page_number
        for page in range(1, last_page_number + 1):
            page_links.append(f'{subject_url}&page={page}')

    return page_links



In [12]:
all_pages = every_page(subject_total_work_url)
all_pages[:3]

['https://openlibrary.org/search?subject=Fantasy&page=1',
 'https://openlibrary.org/search?subject=Fantasy&page=2',
 'https://openlibrary.org/search?subject=Fantasy&page=3']

In [13]:
def book_links(urls):
    # EDITED: book links really accepts a broad list of "search" style URLs
    # use ThreadPoolExecutor to run code concurrently: this means that multiple
    #   requests may go out and be pending at the same; max_workers is set to
    #   100, but there might be a better value to pick!
    # add tqdm as a progress bar
    # there is still a concern regarding rate limiting: subsequent requests may
    #   get slower and slower or just return with an error (HTTP 422)
    '''
    Assumes we are in the "total works" section of each subject and wish to parse over every book.
    This also includes parsing the next pages continuously until the end.

    Return: URL of every book under "total works" section of each subject
    '''

    base_url = "https://openlibrary.org"
    book_links = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        future_to_url = {executor.submit(get_book_hrefs, url): url for url in urls}
        print("SUBMITTED")
        with tqdm(total=len(urls)) as pbar:
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    book_links.extend(map(lambda href: base_url + href, data))
                except Exception as exc:
                    print('%r generated an exception: %s' % (url, exc))
                finally:
                    pbar.update(1)

    return book_links


In [18]:
# takes the first pages of each subject's total works urls and get all of the book links on each page
# then, write to a file named some_book_links.csv
some_book_links = book_links(subject_total_work_url)
with open('some_book_links.csv', 'w', newline='') as myfile:
     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
     wr.writerow(some_book_links)

SUBMITTED


100%|██████████| 8/8 [00:06<00:00,  1.19it/s]


In [22]:
# here is how you read something like some_book_links.csv
with open('some_book_links.csv', 'r') as myfile:
    rd = csv.reader(myfile)
    row = next(rd)

print(row)

['https://openlibrary.org/works/OL407498W?edition=key%3A/books/OL27869353M', 'https://openlibrary.org/works/OL3388961W?edition=key%3A/books/OL24961332M', 'https://openlibrary.org/works/OL85891W?edition=key%3A/books/OL57985422M', 'https://openlibrary.org/works/OL81634W?edition=key%3A/books/OL45621643M', 'https://openlibrary.org/works/OL81626W?edition=key%3A/books/OL35297182M', 'https://openlibrary.org/works/OL2895536W?edition=key%3A/books/OL37819235M', 'https://openlibrary.org/works/OL85892W?edition=key%3A/books/OL37044817M', 'https://openlibrary.org/works/OL1833989W?edition=key%3A/books/OL37044480M', 'https://openlibrary.org/works/OL81632W?edition=key%3A/books/OL3421116M', 'https://openlibrary.org/works/OL24156W?edition=key%3A/books/OL17959484M', 'https://openlibrary.org/works/OL2288676W?edition=key%3A/books/OL6671163M', 'https://openlibrary.org/works/OL8127201W?edition=key%3A/books/OL37044552M', 'https://openlibrary.org/works/OL183675W?edition=key%3A/books/OL37044696M', 'https://openl

In [19]:
# all_book_links uses all pages of each subject's total works and gets all of the book urls
# this takes a long time to run!
# make sure to save all_book_links.csv
all_book_links = book_links(all_pages)
with open('all_book_links.csv', 'w', newline='') as myfile:
     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
     wr.writerow(all_book_links)

SUBMITTED


  1%|          | 821/89622 [07:13<13:01:19,  1.89it/s]


KeyboardInterrupt: 

In [23]:
from tqdm import tqdm
import time

N=100
for i in tqdm(range(N)):
    x = i**2
    time.sleep(0.1)

for i, j in tqdm(enumerate(range(N)), total=N):
    x = i**2
    time.sleep(0.1)

100%|██████████| 100/100 [00:10<00:00,  9.89it/s]
100%|██████████| 100/100 [00:10<00:00,  9.89it/s]


In [None]:
len(enumerate(range(100)))

TypeError: object of type 'enumerate' has no len()

In [26]:
import concurrent.futures

from tqdm import tqdm

def every_book(all_pages, subject_total_work_url):
    """
    Assume we are in the first page of every 'total works' under a subject
    and wish to parse through every book in every page under the 'total works'.

    Returns: A list of URLs for every book from pages 1 to n under 'total works' of each subject.
    """
    all_book_links = []

    # with tqdm(total=len(all_pages)) as pbar:
    #     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    #         future_to_page = {executor.submit(book_links, [page]): page for page in all_pages}
    #         for future in concurrent.futures.as_completed(future_to_page):
    #             page = future_to_page[future]
    #             try:
    #                 data = future.result()
    #                 all_book_links.extend(data)
    #             except Exception as exc:
    #                 print('%r generated an exception: %s' % (page, exc))
    #             finally:
    #                 pbar.update(1)

    return all_book_links



In [31]:
all_pages[2]

'https://openlibrary.org/search?subject=Fantasy&page=3'

In [34]:
x = book_links(all_pages[:2])

In [40]:
y = book_links([all_pages[0]]) + book_links([all_pages[1]])

In [42]:
x == y

True

In [27]:
all_books_url = every_book(all_pages, subject_total_work_url)

  0%|          | 142/89622 [01:44<18:21:43,  1.35it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3.11/concurrent/futures/_base.py", line 243, in as_completed
    waiter.event.wait(wait_timeout)
  File "/usr/lib/python3.11/threading.py", line 629, in wait
    signaled = self._cond.wait(timeout)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/threading.py", line 327, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-26-65afa2eae261>", line 17, in every_book
    for future in concurrent.futures.as_completed(future_to_page):
  File "/usr/lib/python3.11/concurrent/futures/_base.py", line 259, in as_completed
    f._waiters.remove(waiter)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.

TypeError: object of type 'NoneType' has no len()

In [None]:
def extract_book_reviews(subject):
    '''
    Extracts book reviews by categories (columns), paired with a percentage value. Returns a dataframe of books and
    '''
    subject_link = f'https://openlibrary.org/search?subject={subject}'
    response = requests.get(subject_link)
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract book titles
    book_titles = every_book(subject) # make sure that the every_book function works

    # extract community reviews
    community_reviews = [x.get_text() for x in soup.select("span.reviews__value")]
    community_data = [re.search(r'\s*(\w[\w\s]*)',x).group(1).strip() for x in community_reviews]

    # extract percentages
    review_percentage = [x.get_text() for x in soup.select("span.percentage")]
    percentage_data = [re.search(r'\s*(\w[\w\s]*)',x).group(1).strip() for x in review_percentage]

    # ensure that community reviews and percentages (lists) are the same length
    if len(community_data) != len(percentage_data):
        print("The reviews and percentages do not match.")

    # find number of reviews
    number_of_reviews = [x.get_text() for x in soup.select("h2.observation-title")]
    number_of_reviews = [re.search(r'\((\d+)\)',x).group(1).strip() for x in number_of_reviews]

    # create an empty dict
    book_dict = {}

    # convert the community reviews and percentages into a dictionary
    for i in range(len(book_titles)): #loop through books
        review_dict = {'Title' : book_titles[i]}

        for j in range(len(community_data)):
            categories = community_data[j]
            percentage_value = review_percentage[j]

            review_dict[categories] = percentage_value

        book_dict.append(review_dict)

    # convert the dictionary into dataframe
    df = pd.Dataframe(book_dict)

    # return the dataframe
    return df

data_frame = extract_book_reviews('romance')
data_frame.head()

NameError: name 'every_book' is not defined

In [None]:
community_reviews = [x.get_text() for x in soup.select("span.reviews__value")]
community_data = [re.search(r'\s*(\w[\w\s]*)',x).group(1).strip() for x in community_reviews]
print(community_reviews)

[]


In [None]:
community_reviews = [x.get_text() for x in soup.select("span.reviews__value")]
community_data = [re.search(r'\s*(\w[\w\s]*)',x).group(1).strip() for x in community_reviews]

review_percentage = [x.get_text() for x in soup.select("span.percentage")]
percentage_data = [re.search(r'\s*(\w[\w\s]*)',x).group(1).strip() for x in review_percentage]

import pandas as pd
df = pd.DataFrame(data = {
    "Review" : community_data,
    "Ratings" : percentage_data
   # "Number of Reviews" : number_of_reviews

})
df

Unnamed: 0,Review,Ratings


In [None]:
number_of_reviews = [x.get_text() for x in soup.select("h2.observation-title")]
number_of_reviews = [re.search(r'\((\d+)\)',x).group(1).strip() for x in number_of_reviews]

In [None]:
book_button1 = [x.get_text() for x in soup.select('h3', class_= 'booktitle')]
book_button2 = [x.get_text() for x in soup.select('div', class_= 'book-cover')]


May need:

1. List of books that we like
2. Books in general (found above)

We can use both data to recommend books we might like (haven't read yet) - do this by finding all users who like the same books as us and then seeing what other books they like. We'll use those results to create that recommendation

## Collaborative Filtering

We only want to see books that have been reviewed more than 15 times.

# What to do next:
1. Build ML model
  - training data: csv file containing books in a specific genre?
  - testing data: our prediction now?

2. Approaches to consider:
  - Collaborative Filtering (based on user ratings, user reviews e.g. Goodreads)
  - Content-Based Filtering (based on genre, content description, etc.)
  - Combination of both Filtering Methods

3. Define Training Data
  - What should the csv file include?
    1. Book Information: Book ID, Title, Author, Genres, Description
    2. User Ratings: User ID, Book ID, Rating, User Reviews

4. Machine Learning Models to consider:
  - Content-Based Filtering: Book descriptions and genres
      - TF-IDF (Term Frequency-Inverse Document Frequency): evaluates the importance of a word in a document : https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/
      - Sci-Kit Learn: classifiers, feature-extraction
  - Collaborative Filtering: User ratings and reviews
      - Single Value Decomposition (SVD): can decompose a matrix into 3 matrices, good for ratings: https://www.geeksforgeeks.org/singular-value-decomposition-svd/
  - From surprise: https://surpriselib.com/


5. Hybrid model
  - Step 1: Get the top books for the user through collaborative filtering
  - Step 2: Find the most similar books through content based filtering
  - Step 3: Return the list of recommended books



In [None]:
# create dataframe (csv file) of books


In [None]:
# import SVD, import test train split
from surprise import SVD
from surprise.model_selection import test_train_split