## This is a file to test methods on how to gather data using the "Original dataset" csv file that is in this Data folder.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Original dataset.csv")
df2 = df.head()
df2

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/2003,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780000000000.0,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/2004,Scholastic,


In [4]:
for isbn in df2['isbn']:
    print(isbn) #print each ISBN from each row

439785960
439358078
439554896
043965548X
439682584


## To this point I can read in a CSV and select a specific column and its contents

### Set up requests

In [2]:
import requests

In [6]:
# Define a function to call the Google Books API and extract the page count and description
def get_book_info(isbn):
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    response = requests.get(url)
    data = response.json()
    if 'totalItems' in data and data['totalItems'] > 0:
        book_info = data['items'][0]['volumeInfo']
        if 'description' in book_info:
            description = book_info['description']
        else:
            description = None
        if 'mainCategory' in book_info:
            main_category = book_info['mainCategory']
        else:
            main_category = None
        if 'categories' in book_info:
            categories = ', '.join(book_info['categories'])
        else:
            categories = None
        return description, main_category, categories
    else:
        return None, None, None

# Create a new dataframe to store the book info
descriptions = []
main_categories = []
categories_list = []
for isbn in df['isbn']:
    description, main_category, categories = get_book_info(isbn)
    descriptions.append(description)
    main_categories.append(main_category)
    categories_list.append(categories)

results_df = pd.DataFrame({
    'Main Category': main_categories,
    'Categories': categories_list,
    'Description': descriptions
}, index=df['isbn'])

# Print the first few rows of the results dataframe
print(results_df.head())

           Main Category        Categories  \
isbn                                         
439785960           None              None   
439358078           None              None   
439554896           None              None   
043965548X          None  Juvenile Fiction   
439682584           None              None   

                                                  Description  
isbn                                                           
439785960                                                None  
439358078                                                None  
439554896                                                None  
043965548X  During his third year at Hogwarts School for W...  
439682584                                                None  


In [7]:
#print df
results_df

Unnamed: 0_level_0,Main Category,Categories,Description
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
439785960,,,
439358078,,,
439554896,,,
043965548X,,Juvenile Fiction,During his third year at Hogwarts School for W...
439682584,,,
...,...,...,...
1560254416,,Fiction,"No stranger to living and writing on the edge,..."
140110879,,,
140131965,,,
60878827,,,


In [8]:
results_df.to_csv('three_cols.csv')

In [10]:
three_cols_df = pd.read_csv("three_cols.csv")
three_cols_df

Unnamed: 0,isbn,Main Category,Categories,Description
0,439785960,,,
1,439358078,,,
2,439554896,,,
3,043965548X,,Juvenile Fiction,During his third year at Hogwarts School for W...
4,439682584,,,
...,...,...,...,...
11122,1560254416,,Fiction,"No stranger to living and writing on the edge,..."
11123,140110879,,,
11124,140131965,,,
11125,60878827,,,


In [24]:
temp_df = pd.merge(df,
                 three_cols_df,
                 on='isbn')
temp_df = temp_df.drop(columns='Unnamed: 12')
temp_df

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Main Category,Categories,Description
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9.78E+12,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,,,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9.78E+12,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,,,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9.78E+12,eng,352,6333,244,11/1/2003,Scholastic,,,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9.78E+12,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,,Juvenile Fiction,During his third year at Hogwarts School for W...
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9.78E+12,eng,2690,41428,164,9/13/2004,Scholastic,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9.78E+12,eng,512,156,20,12/21/2004,Da Capo Press,,Fiction,"No stranger to living and writing on the edge,..."
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,140110879,9.78E+12,eng,635,783,56,12/1/1988,Penguin Books,,,
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,140131965,9.78E+12,eng,415,820,95,8/1/1993,Penguin Books,,,
11125,45639,Poor People,William T. Vollmann,3.72,60878827,9.78E+12,eng,434,769,139,2/27/2007,Ecco,,,


In [None]:
temp_df.to_csv('Big_data.csv')

## testing other api

In [22]:
isbn = "" # Example ISBN



## This is just trying the efficiency of a suggested code.

## I didnt finish running it.

In [23]:
from concurrent.futures import ThreadPoolExecutor

def fetch_book_info(isbn):
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    response = requests.get(url)
    data = response.json()
    if 'totalItems' in data and data['totalItems'] > 0:
        book_info = data['items'][0]['volumeInfo']
        description = book_info.get('description', None)
        main_category = book_info.get('mainCategory', None)
        categories = ', '.join(book_info.get('categories', []))
        return description, main_category, categories
    else:
        return None, None, None

def fetch_book_info_batch(isbn_list):
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_book_info, isbn) for isbn in isbn_list]
        for future in futures:
            results.append(future.result())
    return results

isbn_list = df['isbn'].tolist()
batch_size = 100
results = []
for i in range(0, len(isbn_list), batch_size):
    isbn_batch = isbn_list[i:i+batch_size]
    batch_results = fetch_book_info_batch(isbn_batch)
    results.extend(batch_results)

descriptions, main_categories, categories_list = zip(*results)
results_df2 = pd.DataFrame({
    'Main Category': main_categories,
    'Categories': categories_list,
    'Description': descriptions
}, index=df['isbn'])

print(results_df2.head())