## This notebook tests methods for gathering data using the "Original dataset" CSV file in this Data folder.

In [1]:
import pandas as pd
import requests
import json

# Read Kaggle dataset, and extract the ISBN 13 for all the books

In [2]:
# Load the Kaggle books dataset.
# `isbn13` is forced to `str`: the ISBNs are concatenated into request URLs further down,
# which fails with TypeError if pandas ever infers the column as integers.
kaggle_data = pd.read_csv("books.csv", dtype={'isbn13': str})

# Series of ISBN-13 identifiers, one entry per row of the dataset
isbn_13 = kaggle_data['isbn13']

# number of ISBNs available to look up
isbn_13.shape

(11127,)

In [3]:
# Preview the first five ISBN-13 values to check how they were parsed
isbn_13.head(5)

0    9780439785969
1    9780439358071
2    9780439554893
3    9780439655484
4    9780439682589
Name: isbn13, dtype: object

# Get the books from the Google Books API using the ISBN-13s

In [7]:
# Query the Google Books API once per ISBN-13 and collect the fields of interest.
#   book_data_list - one dict per ISBN that returned at least one volume
#   failed_isbns   - ISBNs whose request failed or came back as an error payload,
#                    kept separately so they can be retried instead of being lost
#   response_df    - DataFrame built from book_data_list (also built if the loop is interrupted)
book_data_list = []
failed_isbns = []
url_base = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'

try:
    # iterate over the ISBN 13 numbers, and get the information we need using the Google Books API
    for i, isbn in enumerate(isbn_13):

        # f-string rather than `+`, so an ISBN that is not a str does not raise TypeError
        url = f'{url_base}{isbn}'

        try:
            # timeout: without one a stalled connection blocks the whole run indefinitely
            book_data = requests.get(url, timeout=10).json()
        except (requests.RequestException, ValueError) as exc:
            # network failure or a body that is not valid JSON: record it and move on
            print(f'{i} {isbn}: request failed: {exc}')
            failed_isbns.append(isbn)
            continue

        print(f'{i} {isbn}: {book_data.keys()}', end=': ')

        items = book_data.get('items') or []

        if 'error' in book_data:
            # an error payload is not the same thing as "no such book"
            # NOTE(review): presumably rate limiting / quota - confirm from the payload
            failed_isbns.append(isbn)

        elif book_data.get('totalItems', 0) > 0 and items:

            # only the first returned volume is used
            volume_info = items[0].get('volumeInfo', {})

            print(book_data['totalItems'], end=': ')
            print(volume_info.get('title', ''), end='')

            # extract the relevant fields; fields missing from the response become ''
            book = {
                'title': volume_info.get('title', ''),
                'subtitle': volume_info.get('subtitle', ''),
                'authors': volume_info.get('authors', ''),
                'publisher': volume_info.get('publisher', ''),
                'publishdate': volume_info.get('publishedDate', ''),
                'description': volume_info.get('description', ''),
                'isbn_13': isbn,
                'page_count': volume_info.get('pageCount', ''),
                # NOTE(review): reads the same 'categories' field as the next key,
                # so both columns are always identical - confirm this is intended
                'main_categories': volume_info.get('categories', ''),
                'categories': volume_info.get('categories', ''),
                'average_rating': volume_info.get('averageRating', ''),
                'ratings_count': volume_info.get('ratingsCount', ''),
                'maturity_rating': volume_info.get('maturityRating', '')
            }

            # append the new dictionary to the book_data_list
            book_data_list.append(book)

        # otherwise: no results were returned for this ISBN-13, nothing to store

        # terminate the progress line for this ISBN
        print()
finally:
    # convert the list of dictionaries to a DataFrame; done in `finally` so that
    # interrupting the long-running loop still leaves the partial results usable
    response_df = pd.DataFrame(book_data_list)

0 9780439785969: dict_keys(['kind', 'totalItems']): 
1 9780439358071: dict_keys(['kind', 'totalItems']): 
2 9780439554893: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter and the Chamber of Secrets
3 9780439655484: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter and the Prisoner of Azkaban
4 9780439682589: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter
5 9780976540601: dict_keys(['kind', 'totalItems', 'items']): 1: Unauthorized Harry Potter and the Deathly Hallows News
6 9780439827607: dict_keys(['kind', 'totalItems', 'items']): 2: The Harry Potter Collection
7 9780517226957: dict_keys(['kind', 'totalItems', 'items']): 1: The Ultimate Hitchhiker's Guide
8 9780345453747: dict_keys(['error']): 
9 9781400052929: dict_keys(['kind', 'totalItems', 'items']): 2: The Hitchhiker's Guide to the Galaxy 25th Anniversary Edition
10 9780739322208: dict_keys(['kind', 'totalItems']): 
11 9780517149256: dict_keys(['kind', 'totalItems', 'items']): 1: The Ultimate Hit

KeyboardInterrupt: 

In [19]:
# Display the collected results.
# NOTE(review): the saved output shows rows up to index 2759, while the saved run of the
# fetch loop was interrupted at index 11 - presumably response_df came from a different
# run or cell that is not in this notebook; confirm before relying on Restart & Run All.
response_df

Unnamed: 0,title,subtitle,authors,publisher,publishdate,description,isbn_13,page_count,main_categories,categories,average_rating,ratings_count,maturity_rating
0,Harry Potter and the Chamber of Secrets,,"[J. K. Rowling, Mary GrandPre]",Arthur a Levine,2003,When the Chamber of Secrets is opened again at...,9780439554893,341,[Juvenile Fiction],[Juvenile Fiction],4.5,2273,NOT_MATURE
1,Harry Potter and the Prisoner of Azkaban,,[J. K. Rowling],Scholastic Paperbacks,2004,During his third year at Hogwarts School for W...,9780439655484,547,[Juvenile Fiction],[Juvenile Fiction],4.5,2122,NOT_MATURE
2,Harry Potter,"5 Years of Magic, Adventure, and Mystery at Ho...",[J. K. Rowling],,2004,,9780439682589,0,,,4.5,13,NOT_MATURE
3,Unauthorized Harry Potter and the Deathly Hall...,Harry Potter Book Seven and Half-Blood Prince ...,[W. Frederick Zimmerman],Nimble Books,2005-04,Through the magic of print-on-demand technolog...,9780976540601,152,[Fiction],[Fiction],3.5,11,NOT_MATURE
4,The Harry Potter Collection,The First Six Spellbinding Adventures at Hogwarts,[J. K. Rowling],Arthur a Levine,2005-10-01,The first six years of Harry Potter magic are ...,9780439827607,,[Juvenile Fiction],[Juvenile Fiction],4.5,16,NOT_MATURE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756,El Alquimista: Edicion Illustrada,,[Paulo Coelho],Harper Collins,2007-04-10,"Exuberante, evocativo y profundamente humano, ...",9780061351341,200,[Fiction],[Fiction],5,2,NOT_MATURE
2757,Call of the Mall,How We Shop,[Paco Underhill],Profile Business,2004,"Like Underhill's bestseller, Why We Buy, this ...",9781861974426,227,[Consumer behavior],[Consumer behavior],,,NOT_MATURE
2758,The Die Broke Financial Problem Solver,Six Steps to Overcoming All Your Money Problems,"[Stephen M. Pollan, Mark Levine]",Harper Paperbacks,2000-12-26,If you're loosing sleep over your financial wo...,9780066619910,272,[Business & Economics],[Business & Economics],,,NOT_MATURE
2759,Fantastic Mr Fox,,[David Wood],Samuel French Limited,2003,Adaptation into a play of the Roald Dahl class...,9780573051333,43,[Drama],[Drama],3,1,NOT_MATURE


In [22]:
# Save the collected results next to the notebook.
# NOTE(review): with the default index=True the row index is written as an unnamed first
# column; pass index=False if readers of this file do not expect it - confirm with consumers.
response_df.to_csv('isbn13_results.csv')

Using Google Books API to fix missing data in Kaggle Dataset

In [None]:
# Look up a single ISBN by hand to inspect the raw API response.

# get the faulty ISBN (the first entry of isbn_13)
isbn = isbn_13.loc[0]
print(isbn)

# url for the book
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"

# get the book; the timeout keeps a stalled connection from hanging the cell forever
response = requests.get(url, timeout=10)

# convert the HTTP Response object to a dict
# NOTE(review): this rebinds `json`, hiding the `json` module imported at the top of the
# notebook. The name is kept because a later cell reads this variable as `json`.
json = response.json()

# get the keys for the dict
json.keys()

In [None]:
# NOTE(review): self-assignment with no effect; `temp` is not defined in any visible cell,
# so this raises NameError on a fresh kernel - confirm intent or remove.
temp = temp

In [None]:
# Print the in_google / not_in_google values on separate lines.
# NOTE(review): neither name is defined in any visible cell - TODO confirm where they come from.
print(f'in_google: {in_google}\nnot_in_google: {not_in_google}')

In [None]:
# Show the volumeInfo dict of the first returned volume.
# `json` here is the response dict from the single-ISBN lookup, not the json module.
# Raises KeyError when the response has no 'items' key (i.e. no volume matched the ISBN).
json['items'][0]['volumeInfo']