## This is a file to test methods on how to gather data using the "Original dataset" csv file that is in this Data folder.

In [21]:
import pandas as pd
import requests
import numpy as np


## testing endpoint

In [2]:
book_id = "FBXRzgEACAAJ"
url = f"https://www.googleapis.com/books/v1/volumes/{book_id}"
response = requests.get(url)

if response.status_code == 200:
    book_data = response.json()
    # do something with the book data
else:
    print("Error:", response.status_code)

In [3]:
book_data["volumeInfo"]

{'title': 'Harry Potter and the Prisoner of Azkaban',
 'authors': ['J. K. Rowling'],
 'publisher': 'Scholastic, Incorporated',
 'publishedDate': '2004',
 'description': 'Harry Potter has to sneak back to Hogwarts, after accidentally inflating his horrible Aunt Petunia. But once there everyone is whispering about a prizoner who has escaped from the famous wizard prizon, Azkaban. His name is Sirius Black, and as a follower of Lord Voldemort he is determined to track Harry Potter down -- even if it means laying siege to the very walls of Hogwarts!',
 'industryIdentifiers': [{'type': 'ISBN_10', 'identifier': '043965548X'},
  {'type': 'ISBN_13', 'identifier': '9780439655484'}],
 'readingModes': {'text': False, 'image': False},
 'pageCount': 547,
 'printedPageCount': 547,
 'dimensions': {'height': '17.00 cm',
  'width': '9.50 cm',
  'thickness': '3.20 cm'},
 'printType': 'BOOK',
 'categories': ['Juvenile Fiction / Fantasy & Magic'],
 'averageRating': 4.5,
 'ratingsCount': 2122,
 'maturityRat

## writing a function to pull book data form the google books api

In [17]:
def get_book_data(df, col_name):
    # create an empty list to store the book data
    book_data_list = []
    url_base = 'https://www.googleapis.com/books/v1/volumes?q=id:'

    # iterate over the book ids, and get the information we need using the Google Books API
    for i, book_id in enumerate(df[col_name]):

        # generate the URL for the Google Books API search 
        url = url_base + book_id

        # get the book data; convert it to a dictionary using .json()
        book_data = requests.get(url).json()

        print(f'{i} {book_id}: {book_data.keys()}', end=': ')

        if 'totalItems' in book_data and book_data['totalItems'] > 0:

            print(book_data['totalItems'], end=': ')
            print(book_data['items'][0]['volumeInfo']['title'], end='')
            # extract the relevant fields and create a new row in the DataFrame
            book = {
                'book_id': book_id,
                'average_rating': book_data['items'][0]['volumeInfo'].get('averageRating', np.nan),
                'ratings_count': book_data['items'][0]['volumeInfo'].get('ratingsCount', np.nan),
            }

            # append the new dictionary to the book_data_list
            book_data_list.append(book)
        else:

            # handle the case where no results were returned for the book id
            book = {
                'book_id': book_id,
                'average_rating': np.nan,
                'ratings_count': np.nan,
            }
            book_data_list.append(book)

        print()

    # convert the list of dictionaries to a DataFrame
    response_df = pd.DataFrame(book_data_list)
    return response_df

## testing subset

In [18]:
subset = pd.read_csv("all_book_ids.csv")
subset = subset.head(5)
subset["book_id"]

0    FBXRzgEACAAJ
1    yyxXzQEACAAJ
2    YjAnfhsAQ8wC
3    xb4wSmJLnhAC
4    Qq9nQgAACAAJ
Name: book_id, dtype: object

In [19]:
test_data = get_book_data(subset, "book_id")
test_data

0 FBXRzgEACAAJ: dict_keys(['kind', 'totalItems']): 
1 yyxXzQEACAAJ: dict_keys(['kind', 'totalItems']): 
2 YjAnfhsAQ8wC: dict_keys(['kind', 'totalItems', 'items']): 17: Rosalind Franklin and DNA
3 xb4wSmJLnhAC: dict_keys(['kind', 'totalItems']): 
4 Qq9nQgAACAAJ: dict_keys(['kind', 'totalItems', 'items']): 1: Agile Web Development with Rails


Unnamed: 0,book_id,average_rating,ratings_count
0,FBXRzgEACAAJ,,
1,yyxXzQEACAAJ,,
2,YjAnfhsAQ8wC,4.0,2.0
3,xb4wSmJLnhAC,,
4,Qq9nQgAACAAJ,4.0,46.0


Using Google Books API to fix missing data in Kaggle Dataset

## Pulling all books

In [20]:
all_books = pd.read_csv("all_book_ids.csv")
all_data = get_book_data(all_books, "book_id")
all_data

0 FBXRzgEACAAJ: dict_keys(['kind', 'totalItems']): 
1 yyxXzQEACAAJ: dict_keys(['kind', 'totalItems']): 
2 YjAnfhsAQ8wC: dict_keys(['kind', 'totalItems', 'items']): 17: Rosalind Franklin and DNA
3 xb4wSmJLnhAC: dict_keys(['kind', 'totalItems']): 
4 Qq9nQgAACAAJ: dict_keys(['kind', 'totalItems', 'items']): 1: Agile Web Development with Rails
5 WO_cfDHDIYYC: dict_keys(['kind', 'totalItems', 'items']): 2: A Guide for Using Hatchet in the Classroom
6 Jtj9PAAACAAJ: dict_keys(['kind', 'totalItems']): 
7 JIVSPgAACAAJ: dict_keys(['kind', 'totalItems']): 
8 TpEoAQAACAAJ: dict_keys(['kind', 'totalItems']): 
9 4jGQEAAAQBAJ: dict_keys(['kind', 'totalItems', 'items']): 1: The Changeling
10 1uJSPgAACAAJ: dict_keys(['kind', 'totalItems']): 
11 9104ZLuR4Y0C: dict_keys(['kind', 'totalItems', 'items']): 3: Active Literacy Across the Curriculum
12 8jxkuQEACAAJ: dict_keys(['kind', 'totalItems']): 
13 H-AMAQAACAAJ: dict_keys(['kind', 'totalItems', 'items']): 2: Always Enough
14 8SBFuQEACAAJ: dict_keys(['kin

Unnamed: 0,book_id,average_rating,ratings_count
0,FBXRzgEACAAJ,,
1,yyxXzQEACAAJ,,
2,YjAnfhsAQ8wC,4.0,2.0
3,xb4wSmJLnhAC,,
4,Qq9nQgAACAAJ,4.0,46.0
...,...,...,...
3282,tcWMPAAACAAJ,,
3283,O2JfAAAAMAAJ,,
3284,y4kgSgAACAAJ,,
3285,TaQZzgEACAAJ,,


In [None]:
# Exporting isbn_book_df so i dont accidentally erase the data
all_data.to_csv('all_books_ratings.csv', index=False)