## This notebook tests methods for gathering book data, starting from the "Original dataset" CSV file in this Data folder.

In [1]:
import pandas as pd
import requests
import json
import spacy


## testing endpoint

In [12]:
# Fetch a single volume by its Google Books id to check that the endpoint responds.
book_id = "FBXRzgEACAAJ"
url = f"https://www.googleapis.com/books/v1/volumes/{book_id}"
# A timeout keeps the cell from hanging indefinitely if the API does not answer.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    book_data = response.json()
    # do something with the book data
else:
    # Reset book_data so later cells cannot silently reuse a value left over
    # from an earlier successful run of this cell.
    book_data = None
    print("Error:", response.status_code)

In [13]:
# Display the "volumeInfo" section of the JSON fetched for the test book.
book_data["volumeInfo"]

{'title': 'Harry Potter and the Prisoner of Azkaban',
 'authors': ['J. K. Rowling'],
 'publisher': 'Scholastic, Incorporated',
 'publishedDate': '2004',
 'description': 'Harry Potter has to sneak back to Hogwarts, after accidentally inflating his horrible Aunt Petunia. But once there everyone is whispering about a prizoner who has escaped from the famous wizard prizon, Azkaban. His name is Sirius Black, and as a follower of Lord Voldemort he is determined to track Harry Potter down -- even if it means laying siege to the very walls of Hogwarts!',
 'industryIdentifiers': [{'type': 'ISBN_10', 'identifier': '043965548X'},
  {'type': 'ISBN_13', 'identifier': '9780439655484'}],
 'readingModes': {'text': False, 'image': False},
 'pageCount': 547,
 'printedPageCount': 547,
 'dimensions': {'height': '17.00 cm',
  'width': '9.50 cm',
  'thickness': '3.20 cm'},
 'printType': 'BOOK',
 'categories': ['Juvenile Fiction / Fantasy & Magic'],
 'averageRating': 4.5,
 'ratingsCount': 2122,
 'maturityRat

## Writing a function to pull book data from the Google Books API

In [10]:
def get_book_data(df, col_name):
    """Look up every book id in ``df[col_name]`` with the Google Books API.

    One search request (``volumes?q=id:<book_id>``) is sent per id. When the
    response reports at least one item, the rating fields of the first item
    are recorded. Progress is printed for every id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ids to look up.
    col_name : str
        Name of the column in ``df`` that contains the book ids.

    Returns
    -------
    pandas.DataFrame
        One row per id that returned a result, with columns ``book_id``,
        ``average_rating`` and ``ratings_count``. A rating field that is absent
        from the response is stored as ``''``. The three columns are present
        even when no id returned a result.

    Notes
    -----
    Ids that are missing (NaN/None), requests that fail, and responses that
    are not a JSON object are reported on stdout and skipped, so one bad id
    does not abort the whole batch.
    """
    columns = ['book_id', 'average_rating', 'ratings_count']

    # create an empty list to store the book data
    book_data_list = []
    url_base = 'https://www.googleapis.com/books/v1/volumes?q=id:'

    # iterate over the book ids, and get the information we need using the Google Books API
    for i, book_id in enumerate(df[col_name]):

        # a missing id cannot be turned into a query string
        if pd.isna(book_id):
            print(f'{i} {book_id}: skipped (missing id)')
            continue

        # generate the URL for the Google Books API search
        url = url_base + str(book_id)

        # get the book data; convert it to a dictionary using .json()
        # (timeout so a stalled connection cannot hang the loop; a body that
        # is not valid JSON raises a ValueError subclass)
        try:
            book_data = requests.get(url, timeout=10).json()
        except (requests.RequestException, ValueError) as err:
            print(f'{i} {book_id}: request failed: {err}')
            continue

        if not isinstance(book_data, dict):
            print(f'{i} {book_id}: unexpected response type {type(book_data).__name__}')
            continue

        print(f'{i} {book_id}: {book_data.keys()}', end=': ')

        # 'items' may be absent even when the response is otherwise well formed
        items = book_data.get('items') or []

        if book_data.get('totalItems', 0) > 0 and items:
            volume_info = items[0].get('volumeInfo', {})

            print(book_data['totalItems'], end=': ')
            print(volume_info.get('title', ''), end='')

            # extract the relevant fields and add them to the result list
            book_data_list.append({
                'book_id': book_id,
                'average_rating': volume_info.get('averageRating', ''),
                'ratings_count': volume_info.get('ratingsCount', ''),
            })

        print()

    # convert the list of dictionaries to a DataFrame; passing the columns
    # keeps the schema stable when nothing was found
    response_df = pd.DataFrame(book_data_list, columns=columns)
    return response_df

## testing subset

In [11]:
# Read the book id list and keep only its first five rows for a quick trial run
subset = pd.read_csv("all_book_ids.csv").head(5)
subset

Unnamed: 0,book_id
0,FBXRzgEACAAJ
1,yyxXzQEACAAJ
2,YjAnfhsAQ8wC
3,xb4wSmJLnhAC
4,Qq9nQgAACAAJ


In [42]:
# NOTE(review): no cell visible above assigns response_df (get_book_data only
# builds a local of that name) — confirm which cell defines it before re-running.
response_df

Unnamed: 0,title,subtitle,authors,publisher,publishdate,description,isbn_13,page_count,main_categories,categories,average_rating,ratings_count,maturity_rating
0,Harry Potter and the Chamber of Secrets,,"[J. K. Rowling, Mary GrandPre]",Arthur a Levine,2003,When the Chamber of Secrets is opened again at...,9780439554893,341,[Juvenile Fiction],[Juvenile Fiction],4.5,2273.0,NOT_MATURE
1,Harry Potter and the Prisoner of Azkaban,,[J. K. Rowling],Scholastic Paperbacks,2004,During his third year at Hogwarts School for W...,9780439655484,547,[Juvenile Fiction],[Juvenile Fiction],4.5,2122.0,NOT_MATURE
2,Harry Potter,"5 Years of Magic, Adventure, and Mystery at Ho...",[J. K. Rowling],,2004,,9780439682589,0,,,4.5,13.0,NOT_MATURE
3,Unauthorized Harry Potter and the Deathly Hall...,Harry Potter Book Seven and Half-Blood Prince ...,[W. Frederick Zimmerman],Nimble Books,2005-04,Through the magic of print-on-demand technolog...,9780976540601,152,[Fiction],[Fiction],3.5,11.0,NOT_MATURE
4,Harry Potter and the Prisoner of Azkaban,,[J. K. Rowling],,1999,"""During his third year at Hogwarts School for ...",9780439827607,435,[Children's stories],[Children's stories],4.5,16.0,NOT_MATURE
5,The Ultimate Hitchhiker's Guide,Five Complete Novels and One Story,[Douglas Adams],Gramercy,2005,6 Science fiction-romaner.,9780517226957,844,"[Dent, Arthur (Fictitious character)]","[Dent, Arthur (Fictitious character)]",4.5,35.0,NOT_MATURE
6,The Ultimate Hitchhiker's Guide to the Galaxy,,[Douglas Adams],Del Rey,2002-04-30,"In one complete volume, here are the five clas...",9780345453747,836,[Fiction],[Fiction],4.5,179.0,NOT_MATURE
7,The Hitchhiker's Guide to the Galaxy 25th Anni...,A Novel,[Douglas Adams],Crown,2004-08-03,NEW YORK TIMES BESTSELLER • “Extremely funny ....,9781400052929,0,[Fiction],[Fiction],,,NOT_MATURE


In [10]:
# Commented out to avoid overwriting the large CSV file
#response_df.to_csv('isbn13_results.csv')

Using Google Books API to fix missing data in Kaggle Dataset

In [11]:
# get the faulty ISBN
# NOTE(review): isbn_13 is not defined in any visible cell — confirm where it comes from
isbn = isbn_13.loc[0]
print(isbn)

# url for the book
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"

# get the book; the timeout stops the cell from hanging if the API does not answer
response = requests.get(url, timeout=10)

# convert the HTTP Response object to a dict
# (named isbn_lookup rather than `json` so the imported json module is not shadowed)
isbn_lookup = response.json()

# get the keys for the dict
isbn_lookup.keys()

9780439785969


dict_keys(['kind', 'totalItems'])

# testing SpaCy

### 1 and 1 

In [39]:
# Load the large English pipeline: tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")

# Turn the first two book descriptions into spaCy Docs
book_d_1, book_d_2 = (nlp(response_df["description"][row]) for row in (0, 1))

# Similarity between the two descriptions
book_d_1.similarity(book_d_2)

0.8970893140160056

### 1 and many

In [47]:
# Load LARGE English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")

# Parse the first six book descriptions, one Doc per description
book_d_1, book_d_2, book_d_3, book_d_4, book_d_5, book_d_6 = (
    nlp(response_df["description"][row]) for row in range(6)
)

# Docs to compare against book_d_1
texts = [book_d_2, book_d_3, book_d_4, book_d_5, book_d_6]

# texts already holds parsed Doc objects, so they are used as-is instead of
# being passed through nlp() a second time
docs = list(texts)

# Compare book_d_1 with each of the docs in the list
for doc in docs:
    if len(doc) == 0:
        # An empty description has no tokens; the earlier run of this cell
        # scored it 0.0 and produced a warning, so report 0.0 directly.
        similarity_score = 0.0
    else:
        similarity_score = book_d_1.similarity(doc)
    head_text = doc.text[:5]  # Get the first 5 characters of the text
    print(f"Similarity score with '{head_text}': {similarity_score}")

Similarity score with 'Durin': 0.8970893140160056
Similarity score with '': 0.0
Similarity score with 'Throu': 0.8815107060053393
Similarity score with '"Duri': 0.8823296614181213
Similarity score with '6 Sci': 0.4377405652863445


  similarity_score = book_d_1.similarity(doc)
