In [9]:
import pandas as pd 


In [None]:
import pandas as pd
import requests
import time
import os

def fetch_google_metadata(isbn):
    """Look up one ISBN in the Google Books volumes API.

    Args:
        isbn: ISBN value as read from the catalogue (str, int or float).
            Anything after the first ``.`` is dropped, so a float-parsed
            value such as ``9780306406157.0`` becomes ``"9780306406157"``.

    Returns:
        A dict with the string-valued keys ``"Title"``, ``"Author"``,
        ``"publisher"`` and ``"publication_date"`` taken from the first
        search hit, or ``None`` when the ISBN is missing, the request fails,
        the response is not usable JSON, or there is no hit.

    The lookup is best-effort: network and payload problems are reported
    with a printed warning and turned into ``None`` so a long batch run is
    not aborted by a single bad row.
    """
    isbn_clean = str(isbn).split('.')[0].strip()
    # Textual forms of "no value": empty, None, float NaN, pandas NA / NaT.
    if isbn_clean.lower() in ("", "nan", "none", "<na>", "nat"):
        return None

    try:
        # `params` lets requests URL-encode the query; the 10s timeout keeps
        # one unresponsive call from stalling the whole run.
        response = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": f"isbn:{isbn_clean}"},
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f"Warning: request failed for ISBN {isbn_clean}: {exc}")
        return None

    if response.status_code != 200:
        # Surface throttling / server errors instead of treating them as
        # "book not found".
        print(f"Warning: HTTP {response.status_code} for ISBN {isbn_clean}")
        return None

    try:
        data = response.json()
    except ValueError:
        print(f"Warning: non-JSON response for ISBN {isbn_clean}")
        return None

    items = data.get("items") if isinstance(data, dict) else None
    if not items or not isinstance(items[0], dict):
        return None

    info = items[0].get("volumeInfo")
    if not isinstance(info, dict):
        return None

    authors = info.get("authors") or ["Unknown"]
    return {
        "Title": str(info.get("title", "N/A")),
        "Author": ", ".join(str(author) for author in authors),
        "publisher": str(info.get("publisher", "N/A")),
        "publication_date": str(info.get("publishedDate", "N/A")),
    }

def process_library_fast(input_csv, output_csv):
    """Enrich a catalogue CSV with Google Books metadata and save the result.

    The ISBN is read from the first of the columns ``isbn13``, ``ISBN`` or
    ``isbn`` that exists. For every row with a non-missing ISBN,
    ``fetch_google_metadata`` is called and, on a hit, the ``Title``,
    ``Author``, ``publisher`` and ``publication_date`` columns are
    overwritten (they are created empty if absent). Progress is written to
    ``output_csv`` every 50 rows and once more at the end, after which a
    summary report is printed.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write (overwritten).

    Returns:
        None. If the input file does not exist or has no ISBN column, an
        error is printed and nothing is written.
    """
    # 1. Load the input
    if not os.path.exists(input_csv):
        print(f"Error: {input_csv} not found.")
        return

    df = pd.read_csv(input_csv)
    total_rows = len(df)

    # 2. Pick the ISBN column to search by
    search_col = next(
        (col for col in ('isbn13', 'ISBN', 'isbn') if col in df.columns), None
    )
    if search_col is None:
        print("Critical Error: No ISBN column found in the dataset.")
        return

    # Keep only the original titles for the audit; the input may have none.
    title_col = next((col for col in ('Title', 'title') if col in df.columns), None)
    original_titles = df[title_col].copy() if title_col is not None else None

    # Object dtype accepts string assignments into columns that were parsed
    # as float64, and (unlike astype(str)) leaves missing cells missing
    # rather than turning them into the literal text "nan" in the output.
    for col in ('Title', 'Author', 'publisher', 'publication_date'):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].astype(object)

    # Rows without an ISBN are skipped
    missing_isbn = df[search_col].isna()
    skipped = int(missing_isbn.sum())

    print(f"Total rows: {total_rows}. Starting Enrichment using {search_col}...")
    print(f"Murky Rows (NaN ISBN) to skip: {skipped}")

    # 3. Row-by-row lookup
    enriched = 0
    rows = zip(df.index, df[search_col], missing_isbn)
    for position, (index, isbn, is_missing) in enumerate(rows, start=1):
        if not is_missing:
            metadata = fetch_google_metadata(isbn)
            if metadata:
                for col in ('Title', 'Author', 'publisher', 'publication_date'):
                    df.at[index, col] = metadata[col]
                enriched += 1

        # 4. Checkpoint every 50 rows, counted by position so a skipped row
        # cannot cause a checkpoint to be missed.
        if position % 50 == 0:
            df.to_csv(output_csv, index=False)
            print(f"--- Progress: {(position/total_rows)*100:.2f}% ---", end="\r")

    # 5. Final save
    df.to_csv(output_csv, index=False)
    print(f"\nGrid Enriched. System saved to {output_csv}")

    # --- Final audit: titles that differ from the original ---
    # With no title column in the input there is nothing to compare against,
    # so the discrepancy count stays 0.
    mismatches = 0
    if original_titles is not None:
        for is_missing, before, after in zip(missing_isbn, original_titles, df['Title']):
            if is_missing:
                continue
            orig_t = str(before).lower().strip()
            new_t = str(after).lower().strip()
            if orig_t != new_t and new_t not in ("nan", "", "n/a"):
                mismatches += 1

    print("\n" + "="*40)
    print("--- INTEGRATED GENESIS FINAL REPORT ---")
    print(f"Total Grid Size:      {total_rows}")
    # Count rows that actually received metadata, not rows merely attempted.
    print(f"Successfully Enriched: {enriched}")
    print(f"Dropped (NaN ISBN):    {skipped}")
    print(f"Title Discrepancies:   {mismatches}")
    print("="*40)

# --- EXECUTION ---
# Run the enrichment on the cleaned catalogue and write the enriched copy.
process_library_fast(
    input_csv="books_catalogue_clean.csv",
    output_csv="library_enriched.csv",
)

Enriching 10000 rows via isbn...
