In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# API Calls

In [None]:
# Load the datasets
interactions = pd.read_csv('https://raw.githubusercontent.com/olivialaven/MGT502_project/refs/heads/main/interactions_train.csv')
items = pd.read_csv("https://raw.githubusercontent.com/olivialaven/MGT502_project/refs/heads/main/merged_items_updated.csv")

## API data collection

### Google API

In [None]:
import pandas as pd
import requests
from tqdm import tqdm

def get_book_info_from_isbns(isbn_list):
    tried_isbns = []
    fallback_info = None

    for isbn in isbn_list:
        isbn = isbn.strip()
        tried_isbns.append(isbn)
        response = requests.get(f'https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}')
        if response.status_code == 200:
            data = response.json()
            if data.get('totalItems', 0) > 0:
                volume_info = data['items'][0]['volumeInfo']
                info = {
                    'used_isbn': isbn,
                    'all_tried_isbns': ','.join(tried_isbns),
                    'description': volume_info.get('description'),
                    'subtitle': volume_info.get('subtitle'),
                    'authors': ', '.join(volume_info.get('authors', []))
                }
                if info['description']:
                    return info
                elif not fallback_info:
                    fallback_info = info  # Store the first valid book without a description

    return fallback_info if fallback_info else {
        'used_isbn': None,
        'all_tried_isbns': ','.join(tried_isbns),
        'description': None,
        'subtitle': None,
        'authors': None
    }

def process_books_dataframe(df, isbn_column='isbn_column'):
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing books"):
        isbn_raw = row[isbn_column]
        isbn_list = [isbn.strip() for isbn in isbn_raw.split(';') if isbn.strip()]
        info = get_book_info_from_isbns(isbn_list)
        results.append(info)
    return pd.DataFrame(results)


In [None]:
cleaned_items = items.copy()
cleaned_items['ISBN Valid'] = cleaned_items['ISBN Valid'].fillna('').astype(str)

df_metadata = process_books_dataframe(cleaned_items, isbn_column='ISBN Valid')

### API ISBNdb

In [None]:
import pandas as pd
import requests
import time
import re
from tqdm import tqdm

API_KEY = '61313_6b38dc96969ed35d555df99cd5bf36b8'
API_URL = 'https://api2.isbndb.com/books'
HEADERS = {
    'Authorization': API_KEY,
    'accept': 'application/json',
    'Content-Type': 'application/json',
}
BATCH_SIZE = 100
API_SLEEP = 1  

df = items  

def extract_isbn(text):
    if isinstance(text, str):
        for val in re.split(r'[;,]', text):
            cleaned = val.strip().replace('-', '')
            if cleaned.isdigit() and len(cleaned) in (10, 13):
                return cleaned
    return None

df['first_isbn'] = df['ISBN Valid'].apply(extract_isbn)

unique_isbns = df['first_isbn'].dropna().drop_duplicates().tolist()
isbn_batches = [unique_isbns[i:i + BATCH_SIZE] for i in range(0, len(unique_isbns), BATCH_SIZE)]

book_data = []

for batch in tqdm(isbn_batches, desc="Fetching ISBNs", unit="batch"):
    try:
        payload_str = 'isbns=' + ','.join(batch)
        response = requests.post(API_URL, headers=HEADERS, data=payload_str)

        if response.status_code == 200:
            books = response.json().get('data', [])
            book_data.extend(books)
    except Exception:
        pass
    time.sleep(API_SLEEP)

books_df = pd.DataFrame(book_data)
books_df.to_csv('isbn_enriched_data.csv', index=False)

if not books_df.empty and 'isbn13' in books_df.columns:
    merged_df = df.merge(books_df, how='left', left_on='first_isbn', right_on='isbn13')
    merged_df.to_csv('merged_books.csv', index=False)

Fetching ISBNs: 100%|██████████| 145/145 [02:55<00:00,  1.21s/batch]


In [16]:
merged_df = df.merge(books_df, how='left', left_on='first_isbn', right_on='isbn13')
merged_df.to_csv('api_enriched_books.csv', index=False)

KeyError: 'isbn13'