In [38]:
import requests 
from bs4 import BeautifulSoup
import requests_cache
import json
# Install a requests_cache cache named 'demo_cache' so that repeated runs of the
# scraping cell can reuse stored HTTP responses instead of re-fetching the page.
requests_cache.install_cache('demo_cache')


In [53]:
# Crawl a Goodreads book page and extract the fields used for rating prediction:
# book name, author, publication year, page count and genre tags.

url = 'https://www.goodreads.com/book/show/19220614-chindi'
# A timeout stops the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here rather than as confusing parse failures below.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Fallback values, kept whenever the page does not provide a field.
genres = []
author_name = 'Unknown Author'
number_of_pages = 0
book_name = ''
publication_date = '2000'

# The year is taken from the last whitespace-separated token of the
# publication line. The default is only replaced when the element exists and
# that token is numeric, so a missing or reworded line no longer crashes the
# cell or produces a non-year value.
publication_element = soup.find("p", {"data-testid": "publicationInfo"})
if publication_element is not None:
    publication_tokens = publication_element.get_text().split()
    if publication_tokens and publication_tokens[-1].isdigit():
        publication_date = publication_tokens[-1]

# Genre tags are the path component after '/genres/' in each tag button link.
for item in soup.select('a.Button--tag'):
    genre_url = item.get('href', '')  # tolerate anchors without an href
    if genre_url.startswith('https://www.goodreads.com/genres/'):
        genre_tag = genre_url.split('/genres/')[1]
        genres.append(genre_tag)

# Get ld+json data (structured metadata embedded in the page).
ld_json_element = soup.find('script', type='application/ld+json')
if ld_json_element is not None and ld_json_element.string:
    ld_json_data = json.loads(ld_json_element.string)
    # `.get(...) or default` keeps the fallback for both missing and null fields.
    number_of_pages = ld_json_data.get('numberOfPages') or number_of_pages
    book_name = ld_json_data.get('name') or book_name
    # JSON-LD allows `author` to be a single object or a list of objects.
    authors = ld_json_data.get('author') or []
    if isinstance(authors, dict):
        authors = [authors]
    if authors and isinstance(authors[0], dict) and authors[0].get('name'):
        author_name = authors[0]['name']

print(f"""
Book Name: {book_name}
Author: {author_name}
Publication Date: {publication_date}
Number of Pages: {number_of_pages}
Genres: {', '.join(genres)}""")


Book Name: Chindi (The Academy, #3)
Author: Jack McDevitt
Publication Date: 2002
Number of Pages: 594
Genres: science-fiction, fiction, space-opera, science-fiction-fantasy, adventure, audiobook, aliens


In [50]:
import joblib
import pandas as pd

# Restore the persisted bundle and unpack the estimator and its two encoders.
# NOTE: joblib.load unpickles the file, so only load bundles from a trusted source.
model_data = joblib.load('goodreads_model_2.joblib')
random_forest_model, author_encoder, mlb = (
    model_data[bundle_key]
    for bundle_key in ('model', 'author_encoder', 'genre_encoder')
)

def predict_book_rating(title, author, year_published, num_pages, genres,
                        *, like_threshold=2.8, okay_threshold=2.0,
                        model=None, author_enc=None, genre_enc=None):
    """Predict a rating for one book and turn it into a short verdict.

    Args:
        title: Book title; returned unchanged in the result.
        author: Author name. If the author encoder rejects it with
            ``ValueError``, the ``'Unknown Author'`` label is encoded instead.
        year_published: Publication year, as a number or a numeric string
            (the scraping cell produces a string such as ``'2002'``).
        num_pages: Page count, as a number or a numeric string.
        genres: Iterable of genre tags. Tags the genre encoder does not know
            are dropped; duplicates are removed and input order is kept.
        like_threshold: Minimum predicted rating for "You might like it".
        okay_threshold: Minimum predicted rating for "It might be okay".
        model: Optional estimator exposing ``predict``; defaults to the
            module-level ``random_forest_model``.
        author_enc: Optional author encoder exposing ``transform``; defaults
            to the module-level ``author_encoder``.
        genre_enc: Optional genre encoder exposing ``transform`` and
            ``classes_``; defaults to the module-level ``mlb``.

    Returns:
        dict with keys ``'Title'``, ``'Predicted Rating'``,
        ``'Rating Sentence'`` and ``'Used Genres'``.

    Raises:
        ValueError: If ``year_published`` or ``num_pages`` is a string that
            cannot be parsed as a number, or if the ``'Unknown Author'``
            fallback label is itself rejected by the author encoder.
    """
    # Resolve the globals at call time so the defaults track the loaded bundle.
    model = random_forest_model if model is None else model
    author_enc = author_encoder if author_enc is None else author_enc
    genre_enc = mlb if genre_enc is None else genre_enc

    # Handle unknown authors by falling back to the shared placeholder label.
    try:
        author_code = author_enc.transform([author])[0]
    except ValueError:
        author_code = author_enc.transform(['Unknown Author'])[0]

    # Coerce to numeric explicitly: a string year would otherwise be stored
    # as an object-dtype column and rely on implicit conversion downstream.
    book_features = pd.DataFrame(
        {
            'Author': [author_code],
            'Year Published': [pd.to_numeric(year_published)],
            'Number of Pages': [pd.to_numeric(num_pages)],
        },
        index=[0],
    )

    # Keep only genres the encoder knows. dict.fromkeys de-duplicates while
    # preserving input order, so 'Used Genres' is deterministic.
    known_classes = set(genre_enc.classes_)
    known_genres = list(dict.fromkeys(g for g in genres if g in known_classes))
    if not known_genres:
        # Kept for backward compatibility with existing callers. Presumably
        # this encodes as an all-zero genre row unless '' is a trained class.
        known_genres = ['']

    genre_features = pd.DataFrame(
        genre_enc.transform([known_genres]),
        columns=genre_enc.classes_,
        index=[0],
    )

    book_features = pd.concat([book_features, genre_features], axis=1)
    predicted_rating = model.predict(book_features)[0]

    if predicted_rating >= like_threshold:
        rating_sentence = "You might like it"
    elif predicted_rating >= okay_threshold:
        rating_sentence = "It might be okay"
    else:
        rating_sentence = "You probably won't like it"

    return {
        'Title': title,
        'Predicted Rating': predicted_rating,
        'Rating Sentence': rating_sentence,
        'Used Genres': known_genres
    }

In [None]:
predict_book_rating(title=book_name,
                    author=author_name,
                    year_published=publication_date,
                    num_pages=number_of_pages,
                    genres=genres
                    )

{'Title': 'Chindi (The Academy, #3)',
 'Predicted Rating': np.float64(2.8),
 'Rating Sentence': 'You might like it',
 'Used Genres': ['science-fiction', 'fiction']}