<a href="https://colab.research.google.com/github/maureenwidjaja/PIC16B-Group-Project/blob/main/PIC16B_group_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Scrape Books Using Open Library API
- Fetch works by subject name; the subject can be anything, e.g. "fantasy".



In [1]:
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
import numpy as np

In [19]:
import requests

def get_books_by_subject(subject, limit=10, details=True, ebooks=False, published_in=None, offset=0):
    """Fetch works for a subject from the Open Library Subjects API and print them.

    Args:
        subject: subject name inserted into the endpoint path, e.g. "love".
        limit: num of works to include in the response, controls pagination.
        details: if True, includes related subjects, prolific authors, and publishers.
        ebooks: if True, filters for books with e-books.
        published_in: filters by publication year, given as a range string.
                      For example:
                      http://openlibrary.org/subjects/love.json?published_in=1500-1600
        offset: starting offset in the total works, controls pagination.

    Returns:
        None. The list of works is printed. A message is printed instead when
        the request fails, and in addition to the (empty) list when no works
        are returned.
    """
    # API endpoint URL for the requested subject.
    base_url = f'https://openlibrary.org/subjects/{subject}.json'

    # Query parameters; optional filters are only sent when requested.
    params = {
        "limit": limit,
        "offset": offset,
    }
    if details:
        params["details"] = "true"
    if ebooks:
        params["ebooks"] = "true"
    if published_in:
        params["published_in"] = published_in  # Example: "2000-2020"

    # Send the HTTP GET request to Open Library with the query parameters.
    response = requests.get(base_url, params=params)

    # Bail out early on a failed request. Without this guard `books` would
    # never be assigned and the check below would raise UnboundLocalError.
    if response.status_code != 200:
        print("Failed to retrieve data")
        return

    data = response.json()
    # Retrieve the list of books (works); fall back to an empty list if missing.
    books = data.get("works", [])
    print(books)

    if not books:
        print("No books found for this subject.")
        return

    # TODO: print a per-book summary (title, first author, first publish year),
    # skipping works outside the `published_in` range, and when `details` is
    # set also print related subjects, prolific authors and publishing history.

# Example: request up to 10 works for the subject 'love' (no year filter).
# Variant restricted to works published between 1900 and 2000:
# get_books_by_subject("love", limit=10, details=True, published_in="1900-2000")
get_books_by_subject("love", limit=10)


[{'key': '/works/OL21177W', 'title': 'Wuthering Heights', 'edition_count': 2850, 'cover_id': 12818862, 'cover_edition_key': 'OL38586477M', 'subject': ['British and irish fiction (fictional works by one author)', "Children's fiction", 'Classic fiction', 'Classic Literature', 'Country homes', 'Country life', 'Cousins', 'Death', 'Drama', 'English language', 'English language readers', 'English literature', 'Examinations', 'Families', 'family life', 'Fiction', 'Foundlings', 'Historical Fiction', 'Inheritance and succession', 'Interpersonal relations', 'Juvenile fiction', 'Landscape in literature', 'love', 'Manners and customs', 'orphans', 'Psychological fiction', 'Reading Level-Grade 7', 'Reading Level-Grade 8', 'Reading Level-Grade 9', 'Reading Level-Grade 10', 'Reading Level-Grade 11', 'Reading Level-Grade 12', 'Rejection (Psychology)', 'revenge', 'romance', 'Romance fiction', 'romantic fiction', 'Rural families', 'slavery', 'Social life and customs', 'tragedy', 'Triangles (Interpersonal

In [15]:
# Make the GET request for the 'love' subject (no query parameters are sent)
page = requests.get('https://openlibrary.org/subjects/love.json')

# Parse the response as JSON
response = page.json()  # No need to use json.loads(page.text), requests has .json()

# Print some data
# print(response.keys())  # Shows available keys in the response
print(response['works'][2]['subject'])  # Prints the subject list of the third work (index 2) under 'love'


['Accident victims', 'American fiction (fictional works by one author)', 'American literature', "Children's fiction", 'Classic Literature', 'domestic fiction', 'English fiction', 'Family life', 'Farm life', 'Fiction', 'Guardian and ward', 'Interpersonal relations', 'Love', 'Manners and customs', 'Marriage', 'Married people', 'poor', 'Poverty', 'Readers', 'Romance', 'Rural poor', 'Social life and customs', 'Study and teaching (Secondary)', 'Triangles (Interpersonal relations)', 'Young women', 'Married people, fiction', 'Massachusetts, fiction', 'Man-woman relationships, fiction', 'Wharton, edith, 1862-1937', 'Young women, fiction', 'Fiction, romance, general', 'Unrequited love', 'New england, fiction', 'English language, textbooks for foreign speakers', 'Single women, fiction', 'Fiction, historical, general', 'Fiction, general', 'Triangle (Relations humaines)', 'Romans, nouvelles', "Victimes d'accidents", 'Couples mariés', 'Pauvres en milieu rural', 'Vie à la ferme', 'Large type books',

In [8]:
# Display the full parsed JSON payload held in `response`.
response

{'key': '/subjects/love',
 'name': 'love',
 'subject_type': 'subject',
 'work_count': 17395,
 'works': [{'key': '/works/OL21177W',
   'title': 'Wuthering Heights',
   'edition_count': 2850,
   'cover_id': 12818862,
   'cover_edition_key': 'OL38586477M',
   'subject': ['British and irish fiction (fictional works by one author)',
    "Children's fiction",
    'Classic fiction',
    'Classic Literature',
    'Country homes',
    'Country life',
    'Cousins',
    'Death',
    'Drama',
    'English language',
    'English language readers',
    'English literature',
    'Examinations',
    'Families',
    'family life',
    'Fiction',
    'Foundlings',
    'Historical Fiction',
    'Inheritance and succession',
    'Interpersonal relations',
    'Juvenile fiction',
    'Landscape in literature',
    'love',
    'Manners and customs',
    'orphans',
    'Psychological fiction',
    'Reading Level-Grade 7',
    'Reading Level-Grade 8',
    'Reading Level-Grade 9',
    'Reading Level-Grade 10

# What to do next:
1. Build ML model
  - training data: csv file containing books in a specific genre?
  - testing data: our prediction now?

2. Approaches to consider:
  - Collaborative Filtering (based on user ratings, user reviews e.g. Goodreads)
  - Content-Based Filtering (based on genre, content description, etc.)
  - Combination of both Filtering Methods

3. Define Training Data
  - What should the csv file include?
    1. Book Information: Book ID, Title, Author, Genres, Description
    2. User Ratings: User ID, Book ID, Rating, User Reviews

4. Machine Learning Models to consider:
  - Content-Based Filtering: Book descriptions and genres
      - TF-IDF (Term Frequency-Inverse Document Frequency): evaluates the importance of a word in a document : https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/
      - Sci-Kit Learn: classifiers, feature-extraction
  - Collaborative Filtering: User ratings and reviews
      - Singular Value Decomposition (SVD): decomposes a matrix into three matrices, well suited to rating matrices: https://www.geeksforgeeks.org/singular-value-decomposition-svd/
  - The Surprise library provides an SVD implementation: https://surpriselib.com/


5. Hybrid model
  - Step 1: Get the top books for the user through collaborative filtering
  - Step 2: Find the most similar books through content based filtering
  - Step 3: Return the list of recommended books



In [None]:
# create dataframe (csv file) of books


In [None]:
# Import the SVD algorithm and the train/test splitting helper from Surprise.
# The helper is named `train_test_split`; `test_train_split` does not exist
# in surprise.model_selection and raises ImportError.
from surprise import SVD
from surprise.model_selection import train_test_split