## This is a file to test methods on how to gather data using the "Original dataset" csv file that is in this Data folder.

In [90]:
import pandas as pd
import requests
import json

# Read Kaggle dataset, and extract the ISBN 13 for all the books

In [91]:
kaggle_data = pd.read_csv("books.csv")  # read the Kaggle books dataset
isbn_13 = kaggle_data['isbn13']         # extract the ISBN-13 column

isbn_13.head(5)

0    9780439785969
1    9780439358071
2    9780439554893
3    9780439655484
4    9780439682589
Name: isbn13, dtype: object

# Get the books from the Google Books API using the ISNB-13s 

In [133]:
# create an empty list to store the book data
book_data_list = []
url_base = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'

# iterate over the ISBN 13 numbers, and get the information we need using the Google Books API
for i, isbn in enumerate(isbn_13):
    print(i, isbn)

    # generate the URL for the Google Books API search 
    url = url_base + isbn

    # get the book data; convert it to a dictionary using .json()
    book_data = requests.get(url).json()
    
    if 'totalItems' in book_data and book_data['totalItems'] > 0:
        # extract the relevant fields and create a new row in the DataFrame
        book = {
            'title': book_data['items'][0]['volumeInfo'].get('title', ''),
            'subtitle': book_data['items'][0]['volumeInfo'].get('subtitle', ''),
            'authors': book_data['items'][0]['volumeInfo'].get('authors', ''),
            'publisher': book_data['items'][0]['volumeInfo'].get('publisher', ''),
            'publishdate': book_data['items'][0]['volumeInfo'].get('publishedDate', ''),
            'description': book_data['items'][0]['volumeInfo'].get('description', ''),
            'isbn_10': book_data['items'][0]['volumeInfo'].get('industryIdentifiers', [{'type': '', 'identifier': ''}])[0].get('identifier', ''),
            'isbn_13': book_data['items'][0]['volumeInfo'].get('industryIdentifiers', [{'type': '', 'identifier': ''}])[1].get('identifier', ''),
            'page_count': book_data['items'][0]['volumeInfo'].get('pageCount', ''),
            'main_categories': book_data['items'][0]['volumeInfo'].get('categories', ''),
            'categories': book_data['items'][0]['volumeInfo'].get('categories', ''),
            'average_rating': book_data['items'][0]['volumeInfo'].get('averageRating', ''),
            'ratings_count': book_data['items'][0]['volumeInfo'].get('ratingsCount', ''),
            'maturity_rating': book_data['items'][0]['volumeInfo'].get('maturityRating', '')
        }

        # append the new dictionary to the book_data_list
        book_data_list.append(book)
    else:
            # handle the case where no results were returned for the ISBN-13 number
        print(f"No results found for ISBN-13 {isbn}")

# convert the list of dictionaries to a DataFrame
response_df = pd.DataFrame(book_data_list)

0 9780439785969
No results found for ISBN-13 9780439785969
1 9780439358071
No results found for ISBN-13 9780439358071
2 9780439554893
3 9780439655484
4 9780439682589
5 9780976540601
6 9780439827607
7 9780517226957


IndexError: list index out of range

Using Google Books API to fix missing data in Kaggle Dataset

In [130]:
# get the faulty ISBN
isbn = isbn_13.iloc[21] 

# url for the book
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"

# get the book
response = requests.get(url)

# convert the HTTP Response object to a dict
json = response.json()

# get the keys for the dict
json.keys()

dict_keys(['kind', 'totalItems', 'items'])

In [129]:
json['items'][0]

{'kind': 'books#volume',
 'id': 'XBaNEAAAQBAJ',
 'etag': 'E6G2h794Cik',
 'selfLink': 'https://www.googleapis.com/books/v1/volumes/XBaNEAAAQBAJ',
 'volumeInfo': {'title': 'The Hobbit / The Lord of the Rings',
  'subtitle': 'The Hobbit / The Fellowship of the Ring / The Two Towers / The Return of the King',
  'authors': ['John Ronald Reuel Tolkien'],
  'publisher': 'National Geographic Books',
  'publishedDate': '2012-09-25',
  'description': 'Presents a box set including the complete "Lord of the Rings" trilogy, as well as its prequel, "The Hobbit."',
  'industryIdentifiers': [{'type': 'ISBN_13', 'identifier': '9780345538376'},
   {'type': 'ISBN_10', 'identifier': '0345538374'}],
  'readingModes': {'text': True, 'image': False},
  'pageCount': 0,
  'printType': 'BOOK',
  'categories': ['Fiction'],
  'averageRating': 4,
  'ratingsCount': 7,
  'maturityRating': 'NOT_MATURE',
  'allowAnonLogging': False,
  'contentVersion': 'preview-1.0.0',
  'panelizationSummary': {'containsEpubBubbles': 