<a href="https://colab.research.google.com/github/maureenwidjaja/PIC16B-Group-Project/blob/main/PIC16B_group_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Scrape Books Using Open Library API
- Fetch books by subject name; the subject can be anything, e.g. "fantasy".



In [1]:
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
import numpy as np

In [23]:
import requests

def get_books_by_subject(subject, limit=100, details=True, ebooks=False, published_in=None, offset=0):
    '''
    Fetch the works listed under a subject from the Open Library Subjects API.

    Args:
    subject: subject name used in the URL path, e.g. "fantasy".
    limit: num of works to include in the response, controls pagination.
    details: on the API side, True adds related subjects, prolific authors,
             and publishers to the response. Accepted for backward
             compatibility only: this function returns just the "works"
             list, so that extra data would be discarded and the flag is
             not sent.
    ebooks: if True,  filters for books with e-books.
    published_in: filters by publication year.
                  For example:
                  http://openlibrary.org/subjects/love.json?published_in=1500-1600
    offset: starting offset in the total works, controls pagination.

    Returns:
    The list stored under "works" in the JSON response, or an empty list
    when the request fails, the body is not JSON, or no works are listed.
    '''
    from urllib.parse import quote

    # Quote the subject so spaces or slashes cannot break the URL path.
    base_url = f"https://openlibrary.org/subjects/{quote(str(subject), safe=':')}.json"

    # Query parameters; pagination is always sent, filters only when requested.
    params = {
        "limit": limit,
        "offset": offset
    }
    if ebooks:
        params["ebooks"] = "true"
    if published_in is not None:
        params["published_in"] = published_in

    # A timeout keeps a stalled connection from hanging the notebook, and
    # network failures are reported the same way as a bad status code.
    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as err:
        print(f"Error fetching books for {subject}: {err}")
        return []

    if response.status_code != 200:
        print(f"Error fetching books for {subject}")
        return []

    try:
        data = response.json()
    except ValueError:
        # The server answered 200 but the body was not valid JSON.
        print(f"Error fetching books for {subject}")
        return []

    books = data.get("works", [])

    if not books:
        print(f"No books found for {subject}")
        return []

    return books


# Step 2: Combine into one genre
For instance, the "sci-fi" and "science-fiction" subjects return different results. So, our next objective is to combine all such related subjects into one genre, "Science Fiction". The same goes for other genres like "Romance" or "Fantasy".


### Generating books for "Romance"

In [24]:
def combine_genre(subject):
    """
    Collect the books of every sub-genre subject that makes up a main genre.

    Args:
    subject: genre name. Matched case-insensitively, with surrounding
             whitespace ignored and inner spaces treated as underscores
             (e.g. "Science Fiction" -> "science_fiction").

    This function collects book lists under sub-genres and combines them into
    one main genre. Each sub-genre is fetched once, and a book that appears
    under several sub-genres is kept only the first time it is seen. Every
    kept book is printed with a running number as it is collected.

    Returns:
    List of all books under a specific genre.

    Raises:
    ValueError: if subject is None, is not a string, or is not one of the
                predefined genres.
    """

    if subject is None:
        raise ValueError("Please pass a subject name.")

    # Dictionary of genres and their corresponding lists with adjusted formatting
    genre_dict = {
        "romance": [
            "fiction_romance_general", "fiction_romance_historical_general",
            "romance", "man_woman_relationships", "fiction_romance_suspense",
            "fiction_romance_contemporary",
            "fiction_romance_erotica", "fiction_romance_erotic",
            "marriage_fiction", "fiction_erotica_general", "romance",
            "fiction_christian_romance_general", "fiction_romance_historical"
        ],
        "fantasy": [
            "fiction", "fantasy_fiction", "magic", "fiction_fantasy_general",
            "adventure_and_adventurers_fiction",
            "adventure_and_adventurers", "good_and_evil", "fairies", "dragons",
            "cartoons_and_comics", "witchcraft", "history", "wizards", "fairies_fiction"
        ],
        "historical_fiction": [
            "fiction", "historical_fiction", "history", "fiction_historical_general",
            "fiction_romance_historical_general", "fiction_historical", "fiction_general",
            "fiction_romance_historical", "world_war_1939_1945", "great_britain_fiction"
        ],
        "horror": [
            "fiction", "horror", "horror_stories", "horror_tales", "american_horror_tales",
            "horror_fiction", "detective_and_mystery_stories", "crime", "catalepsy", "murder",
            "burial_vaults"
        ],
        "humor": [
            "anecdotes", "humor_general", "american_wit_and_humor",
            "wit_and_humor", "caricatures_and_cartoons", "humour", "humor"
        ],
        "literature": [
            "philosophy", "in_literature", "theory", "criticism", "criticism_and_interpretation",
            "english_literature", "modern_literature", "american_literature",
            "literature", "litterature"
        ],
        "mystery_thriller": [
            "detective_and_mystery_stories", "mystery_fiction", "murder", "mystery",
            "thriller", "detective", "fiction_thrillers_general",
            "suspense", "fiction_thrillers_suspense", "fiction_suspense",
            "mystery", "fiction_mystery_and_detective_general", "thriller", "murder",
            "fiction_thrillers_espionage", "police", "fiction_action_and_adventure",
            "suspense_fiction", "fiction_general", "detective_and_mystery_stories",
            "crimes_against", "fiction_psychological", "investigation"
        ],
        "science_fiction": [
            "science_fiction", "fiction_science_fiction_general", "american_science_fiction",
            "extraterrestrial_beings", "life_on_other_planets", "extraterrestrial_beings_fiction",
            "time_travel_science_fiction"
        ]
    }

    # Normalise the name so "Romance" or "Science Fiction" resolve to a key.
    genre = subject.strip().lower().replace(" ", "_") if isinstance(subject, str) else None

    if genre not in genre_dict:
        # List the real keys so the message only advertises names that work.
        raise ValueError(
            "Invalid genre. Please choose from the predefined genres: "
            + ", ".join(genre_dict) + "."
        )

    books_under_genre = []
    seen_books = set()
    print(f"\nBooks under the genre '{genre}':\n")

    # dict.fromkeys drops repeated sub-genre names (several lists above
    # repeat entries) while keeping their order, so each is fetched once.
    for sub_genre in dict.fromkeys(genre_dict[genre]):
        books = get_books_by_subject(sub_genre)  # Get books for the sub-genre

        if not books:
            print(f"No books found for sub-genre '{sub_genre}'")
            continue

        for book in books:
            # Identify a book by its "key" field when it is a dict, or by
            # its text when it is a string; anything else is never treated
            # as a duplicate.
            if isinstance(book, dict):
                identity = book.get("key")
            elif isinstance(book, str):
                identity = book
            else:
                identity = None

            if identity is not None:
                if identity in seen_books:
                    continue
                seen_books.add(identity)

            books_under_genre.append(book)  # Add book to the main list
            print(f"{len(books_under_genre)}. {book}")

    return books_under_genre

### Romance books:

In [22]:
# Collect the books of every sub-genre listed under "romance".
# combine_genre prints a numbered line per book while collecting; the print
# below then dumps the returned list itself.
romance_books = combine_genre("romance")
print(romance_books)


Books under the genre 'romance':

['Pride and Prejudice by Jane Austen (1813)', 'Wuthering Heights by Emily Brontë (1846)', 'Emma by Jane Austen (1815)', 'Little Women by Louisa May Alcott (1848)', 'Don Quixote by Miguel de Cervantes Saavedra (1600)', 'Persuasion by Jane Austen (1789)', 'The Great Gatsby by F. Scott Fitzgerald (1920)', 'Mansfield Park by Jane Austen (1814)', 'Northanger Abbey by Jane Austen (1818)', 'Ethan Frome by Edith Wharton (1910)', 'David Copperfield by Charles Dickens (1800)', 'Jane Eyre by Charlotte Brontë (1847)', 'Anne of the Island by Lucy Maud Montgomery (1826)', 'The Cricket on the Hearth by Charles Dickens (1846)', 'The House of Mirth by Edith Wharton (1905)', 'The Scarlet Letter by Nathaniel Hawthorne (1850)', 'Lady Susan by Jane Austen (1925)', 'Women in Love by David Herbert Lawrence (1877)', 'Villette, a novel by Charlotte Brontë (1853)', 'This Side of Paradise by F. Scott Fitzgerald (1920)', 'The Beautiful and Damned by F. Scott Fitzgerald (1922)

### Fantasy books:

In [None]:
# Collect the books of every sub-genre listed under "fantasy".
# combine_genre prints a numbered line per book while collecting; the print
# below then dumps the returned list itself.
fantasy_books = combine_genre("fantasy")
print(fantasy_books)


Books under the genre 'fantasy':

1. Pride and Prejudice by Jane Austen (1813)
2. Alice's Adventures in Wonderland by Lewis Carroll (1865)
3. Wuthering Heights by Emily Brontë (1846)
4. A Christmas Carol by Charles Dickens (1843)
5. Adventures of Huckleberry Finn by Mark Twain (1876)
6. The Picture of Dorian Gray by Oscar Wilde (1890)
7. Emma by Jane Austen (1815)
8. Oliver Twist by Charles Dickens (1822)
9. Frankenstein or The Modern Prometheus by Mary Shelley (1818)
10. A Tale of Two Cities by Charles Dickens (1800)
11. The Wonderful Wizard of Oz by L. Frank Baum (1899)
12. Sense and Sensibility by Jane Austen (1811)
13. Treasure Island by Robert Louis Stevenson (1880)
14. Little Women by Louisa May Alcott (1848)
15. Gulliver's Travels by Jonathan Swift (1726)
16. Don Quixote by Miguel de Cervantes Saavedra (1600)
17. Great Expectations by Charles Dickens (1861)
18. A Study in Scarlet by Arthur Conan Doyle (1887)
19. The Art of War by Sun Tzu (1900)
20. The Prince by Niccolò Machia

### Historical Fiction books:

In [None]:
# Collect the books of every sub-genre listed under "historical_fiction".
# combine_genre prints a numbered line per book while collecting; the print
# below then dumps the returned list itself.
his_fic_books = combine_genre("historical_fiction")
print(his_fic_books)


Books under the genre 'historical_fiction':

1. Pride and Prejudice by Jane Austen (1813)
2. Alice's Adventures in Wonderland by Lewis Carroll (1865)
3. Wuthering Heights by Emily Brontë (1846)
4. A Christmas Carol by Charles Dickens (1843)
5. Adventures of Huckleberry Finn by Mark Twain (1876)
6. The Picture of Dorian Gray by Oscar Wilde (1890)
7. Emma by Jane Austen (1815)
8. Oliver Twist by Charles Dickens (1822)
9. Frankenstein or The Modern Prometheus by Mary Shelley (1818)
10. A Tale of Two Cities by Charles Dickens (1800)
11. The Wonderful Wizard of Oz by L. Frank Baum (1899)
12. Sense and Sensibility by Jane Austen (1811)
13. Treasure Island by Robert Louis Stevenson (1880)
14. Little Women by Louisa May Alcott (1848)
15. Gulliver's Travels by Jonathan Swift (1726)
16. Don Quixote by Miguel de Cervantes Saavedra (1600)
17. Great Expectations by Charles Dickens (1861)
18. A Study in Scarlet by Arthur Conan Doyle (1887)
19. The Art of War by Sun Tzu (1900)
20. The Prince by Nic

### Horror books:

In [None]:
# Collect the books of every sub-genre listed under "horror".
# combine_genre prints a numbered line per book while collecting; the print
# below then dumps the returned list itself.
horror_books = combine_genre("horror")
print(horror_books)

# What to do next:
1. Build ML model
  - training data: csv file containing books in a specific genre?
  - testing data: our prediction now?

2. Approaches to consider:
  - Collaborative Filtering (based on user ratings, user reviews e.g. Goodreads)
  - Content-Based Filtering (based on genre, content description, etc.)
  - Combination of both Filtering Methods

3. Define Training Data
  - What should the csv file include?
    1. Book Information: Book ID, Title, Author, Genres, Description
    2. User Ratings: User ID, Book ID, Rating, User Reviews

4. Machine Learning Models to consider:
  - Content-Based Filtering: Book descriptions and genres
      - TF-IDF (Term Frequency-Inverse Document Frequency): evaluates the importance of a word in a document : https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/
      - scikit-learn: classifiers, feature extraction
  - Collaborative Filtering: User ratings and reviews
      - Singular Value Decomposition (SVD): can decompose a matrix into 3 matrices, good for ratings: https://www.geeksforgeeks.org/singular-value-decomposition-svd/
  - From surprise: https://surpriselib.com/


5. Hybrid model
  - Step 1: Get the top books for the user through collaborative filtering
  - Step 2: Find the most similar books through content based filtering
  - Step 3: Return the list of recommended books



In [None]:
# create dataframe (csv file) of books


In [None]:
# Import the SVD algorithm and the train/test split helper from surprise.
# The helper is named train_test_split; surprise.model_selection has no
# test_train_split, so the previous spelling raised ImportError.
from surprise import SVD
from surprise.model_selection import train_test_split