# CSV + API

In this reboot, we are going to use:

- The [Goodreads books](https://www.kaggle.com/jealousleopard/goodreadsbooks) dataset from Kaggle.
- The [Open Library Books API](https://openlibrary.org/dev/docs/api/books)

The goal of this livecode is to load the data from a CSV file, then loop over its rows and enrich each one with information from the API, such as:

- List of subjects (Science, Humor, Travel, etc.)
- The cover URL of the book
- Other information you'd find useful in the JSON API

First, download the CSV into the local folder:

In [None]:
!curl -L https://gist.githubusercontent.com/ssaunier/351b17f5a7a009808b60aeacd1f4a036/raw/books.csv > books.csv

In [None]:
!ls -lh

Then import the usual suspects!

In [None]:
import requests
import pandas as pd
import numpy as np

## Load books from CSV

In [None]:
# Load the dataset (malformed CSV lines are skipped) and discard the columns
# that are not listed as needed below; 'isbn13' is kept for the API lookups.
books_df = (
    pd.read_csv('books.csv', on_bad_lines='skip')
    .drop(
        columns=[
            'bookID',
            'isbn',
            'average_rating',
            'language_code',
            'ratings_count',
            'text_reviews_count',
        ]
    )
)
books_df

In [None]:
books_df.dtypes

Let's add a new `cover_url` column, initialised to `None`, that will hold the cover URL fetched from the API:

In [None]:
# None marks the rows whose cover has not been looked up yet.
books_df = books_df.assign(cover_url=None)
books_df.head()

## API - Open Library

In [None]:
def fetch_book(isbn, timeout=10):
    """Fetch the Open Library record for a single ISBN.

    Args:
        isbn: ISBN of the book to look up; it is interpolated into the
            ``ISBN:<isbn>`` bibkey sent to the API.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``ISBN:<isbn>`` in the JSON response, or an
        empty string when the response has no entry for that ISBN.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Define URL to be queried
    url = 'https://openlibrary.org/api/books'

    # The same key is sent as a parameter and used to read the response
    bibkey = f'ISBN:{isbn}'

    # Define parameters to send with the HTTP request
    params = {
        'bibkeys': bibkey,
        'format': 'json',
        'jscmd': 'data'
    }

    # Without a timeout, requests waits indefinitely on a stalled connection
    response = requests.get(url, params=params, timeout=timeout)

    # Fail loudly on HTTP errors instead of recording a bogus "no book" result
    response.raise_for_status()

    # The falsy '' keeps `if book:` checks in callers working when the ISBN
    # is missing from the response
    return response.json().get(bibkey, '')

In [None]:
%%time

# For each of the first 15 rows: fetch_book => store the cover URL in the column

for row_label, record in books_df.head(15).iterrows():
    # Rows that already went through a lookup are left alone
    if record['cover_url'] is not None:
        continue

    isbn13 = record['isbn13']
    print(f"Fetching cover for {record['title']}")

    found = fetch_book(isbn13)

    # '' is stored both when no book is found for that ISBN and when the
    # book has no large cover
    books_df.loc[row_label, 'cover_url'] = (
        found.get('cover', {}).get('large', '') if found else ''
    )

In [None]:
books_df.head(15)

## Calling the API with multiple ISBNs at a time

In [None]:
# Sample ISBNs used to try out the multi-ISBN request format
isbns = [9780439785969, 9780439358071, 9780439554930]
# Each ISBN becomes one "ISBN:<number>" key
[f"ISBN:{isbn}" for isbn in isbns]

In [None]:
",".join([f"ISBN:{isbn}" for isbn in isbns])

In [None]:
def fetch_books(isbns, timeout=10):
    """Fetch Open Library records for several ISBNs with one HTTP request.

    Args:
        isbns: Iterable of ISBNs; each one becomes an ``ISBN:<isbn>`` bibkey
            and the bibkeys are sent comma-separated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response, or an empty dict when ``isbns`` yields
        nothing (no request is sent in that case).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Define the URL and build bibkeys from ISBN
    url = "https://openlibrary.org/api/books"
    bibkeys = ",".join(f"ISBN:{isbn}" for isbn in isbns)

    # Nothing to look up: skip the network round trip. Testing the joined
    # string (not `isbns`) also works for generators and array-like inputs.
    if not bibkeys:
        return {}

    # Define parameters for HTTP request
    params = {
        'bibkeys': bibkeys,
        'format': 'json',
        'jscmd': 'data'
    }

    # Without a timeout, requests waits indefinitely on a stalled connection
    response = requests.get(url, params=params, timeout=timeout)

    # Fail loudly on HTTP errors instead of returning an error payload
    response.raise_for_status()

    return response.json()

In [None]:
# Index the rows by ISBN-13 so that cover URLs can be written back by label.
books_df = books_df.set_index("isbn13")

In [None]:
books_df.head()

In [None]:
!pip install tqdm

In [None]:
%%time

from tqdm import tqdm

# Only the first 100 books are enriched, 20 ISBNs per request (5 requests when
# at least 100 books are available). Slicing the index directly avoids
# np.array_split on a DataFrame, which goes through DataFrame.swapaxes
# (deprecated in recent pandas releases), and never produces empty batches.
isbns_to_fetch = books_df.index[:100]
batch_size = 20

for start in tqdm(range(0, len(isbns_to_fetch), batch_size)):
    batch = isbns_to_fetch[start:start + batch_size]
    books = fetch_books(list(batch))

    for isbn_code, book in books.items():
        # Keys look like "ISBN:9780439785969". str.strip("ISBN:") removes a
        # *set of characters* from both ends rather than the prefix, so take
        # what follows the first colon instead.
        isbn = int(isbn_code.split(":", 1)[1])
        books_df.loc[isbn, "cover_url"] = book.get("cover", {}).get("large", "")

In [None]:
books_df.head(20)