## This notebook tests methods for gathering book data, starting from the "Original dataset" CSV file in this Data folder.

In [90]:
import pandas as pd
import requests
import json

# Read the Kaggle dataset and extract the ISBN-13 of every book

In [200]:
# Load the Kaggle books dataset. ISBNs are identifiers, not quantities, so the
# column is read explicitly as text: every value is then a string that can be
# concatenated into a request URL, whatever dtype pandas would otherwise infer.
kaggle_data = pd.read_csv("books.csv", dtype={"isbn13": str})
isbn_13 = kaggle_data["isbn13"]  # the ISBN-13 column as a Series of strings

isbn_13.shape

(11127,)

In [189]:
# Preview the first five ISBN-13 values (five is the default for head()).
isbn_13.head()

0    9780439785969
1    9780439358071
2    9780439554893
3    9780439655484
4    9780439682589
Name: isbn13, dtype: object

# Get the books from the Google Books API using the ISBN-13s

In [215]:
# Query the Google Books API once per ISBN-13 and collect one record for every
# ISBN the API can resolve. Results end up in `book_data_list` (list of dicts)
# and, once the loop finishes, in the `response_df` DataFrame.
book_data_list = []
url_base = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'

# Seconds to wait for each response. Without a timeout a single stalled
# connection blocks the whole loop indefinitely.
request_timeout = 10

# Output column -> key inside the API's `volumeInfo` object.
# NOTE(review): 'main_categories' reads the same key as 'categories', so the
# two columns are duplicates. The API documents a separate `mainCategory`
# field; confirm which one is intended before relying on this column.
volume_fields = {
    'title': 'title',
    'subtitle': 'subtitle',
    'authors': 'authors',
    'publisher': 'publisher',
    'publishdate': 'publishedDate',
    'description': 'description',
    'page_count': 'pageCount',
    'main_categories': 'categories',
    'categories': 'categories',
    'average_rating': 'averageRating',
    'ratings_count': 'ratingsCount',
    'maturity_rating': 'maturityRating',
}

for i, isbn in enumerate(isbn_13):
    print(i, ' ', isbn, end=': ')

    # A missing ISBN cannot be looked up; skip it instead of querying "nan".
    if pd.isna(isbn):
        print('missing ISBN, skipped')
        continue

    # str() keeps the concatenation valid even if the column was parsed as numbers.
    url = url_base + str(isbn)

    # One failed request (network error, timeout, non-JSON body) should not
    # abort a run of thousands of lookups: report it and move on.
    try:
        book_data = requests.get(url, timeout=request_timeout).json()
    except (requests.RequestException, ValueError) as error:
        print(f'request failed ({error})')
        continue

    print(book_data.keys(), end=': ')

    # Responses without a match carry only 'kind' and 'totalItems'; error
    # responses carry neither 'totalItems' nor 'items'.
    if 'totalItems' in book_data and book_data['totalItems'] > 0 and book_data.get('items'):
        # Only the first match is used, even when several volumes are returned.
        volume_info = book_data['items'][0].get('volumeInfo', {})

        print(book_data['totalItems'], end=': ')
        print(volume_info.get('title', ''), end='')

        # Missing fields default to '' so every record has the same columns.
        book = {column: volume_info.get(key, '') for column, key in volume_fields.items()}
        # Lookups that fail are skipped, so row positions do not line up with
        # the source dataset; keep the queried ISBN to allow joining back.
        book['isbn_13'] = isbn
        book_data_list.append(book)

    print()

# Build the DataFrame once from the collected records.
response_df = pd.DataFrame(book_data_list)

0   9780439785969: dict_keys(['kind', 'totalItems']): 
1   9780439358071: dict_keys(['kind', 'totalItems']): 
2   9780439554893: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter and the Chamber of Secrets
3   9780439655484: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter and the Prisoner of Azkaban
4   9780439682589: dict_keys(['kind', 'totalItems', 'items']): 1: Harry Potter
5   9780976540601: dict_keys(['kind', 'totalItems', 'items']): 1: Unauthorized Harry Potter and the Deathly Hallows News
6   9780439827607: dict_keys(['kind', 'totalItems', 'items']): 2: The Harry Potter Collection
7   9780517226957: dict_keys(['kind', 'totalItems', 'items']): 1: The Ultimate Hitchhiker's Guide
8   9780345453747: dict_keys(['kind', 'totalItems', 'items']): 1: The Ultimate Hitchhiker's Guide to the Galaxy
9   9781400052929: dict_keys(['kind', 'totalItems', 'items']): 2: The Hitchhiker's Guide to the Galaxy 25th Anniversary Edition
10   9780739322208: dict_keys(['kind', 'total

KeyboardInterrupt: 

Using Google Books API to fix missing data in Kaggle Dataset

In [197]:
# Probe the Google Books API with a single ISBN-13 (the first row of the
# dataset) to inspect the raw response for an ISBN that is not resolved.
isbn = isbn_13.loc[0]
print(isbn)

# Search URL for this ISBN
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"

# Fetch the search result; the timeout keeps the cell from hanging forever on
# a stalled connection.
response = requests.get(url, timeout=10)

# Parse the HTTP response body into a dict.
# NOTE(review): the name `json` shadows the `json` module imported at the top
# of the notebook. It is kept because a later cell reads `json['items']`;
# rename both together (e.g. to `volume_response`) when cleaning up.
json = response.json()

# Top-level keys of the response; a response without 'items' means no match.
json.keys()

9780439785969


dict_keys(['kind', 'totalItems'])

In [186]:
# Report the two lookup counters.
# NOTE(review): `in_google` and `not_in_google` are not assigned in any cell
# shown here; presumably they count ISBNs found / not found by the API -
# confirm where they are computed before relying on a fresh-kernel run.
lookup_summary = f'in_google: {in_google}\nnot_in_google: {not_in_google}'
print(lookup_summary)

in_google: 575
not_in_google: 318


In [180]:
# Display the metadata of the first volume in the parsed API response.
# `json` is the response dict assigned in an earlier cell (not the module).
# NOTE(review): this raises KeyError when the response has no 'items' key.
first_match = json['items'][0]
first_match['volumeInfo']

{'title': "The Ultimate Hitchhiker's Guide",
 'authors': ['Douglas Adams'],
 'publisher': 'Wings',
 'publishedDate': '1996',
 'description': '6 Science fiction-romaner.',
 'industryIdentifiers': [{'type': 'OTHER',
   'identifier': 'UOM:39015060636746'}],
 'readingModes': {'text': False, 'image': False},
 'pageCount': 840,
 'printType': 'BOOK',
 'categories': ['Dent, Arthur (Fictitious character)'],
 'averageRating': 4.5,
 'ratingsCount': 16,
 'maturityRating': 'NOT_MATURE',
 'allowAnonLogging': False,
 'contentVersion': '0.2.2.0.preview.0',
 'panelizationSummary': {'containsEpubBubbles': False,
  'containsImageBubbles': False},
 'imageLinks': {'smallThumbnail': 'http://books.google.com/books/content?id=oL1lAAAAMAAJ&printsec=frontcover&img=1&zoom=5&source=gbs_api',
  'thumbnail': 'http://books.google.com/books/content?id=oL1lAAAAMAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api'},
 'language': 'en',
 'previewLink': 'http://books.google.com/books?id=oL1lAAAAMAAJ&dq=isbn:9780517149256&