In [21]:
import requests
import csv
import time

def _flatten_specifications(specs_data):
    """Flatten specification groups into one ``"name: value | name: value"`` string.

    Groups or attributes that are missing, null, or not dicts are skipped, and
    an attribute is only kept when both its name and its value are truthy.
    """
    specs_flat = []
    # `or []` tolerates a key that is present but null, which a plain
    # `.get(key, [])` default does not cover.
    for group in specs_data or []:
        if not isinstance(group, dict):
            continue
        for attr in group.get("attributes") or []:
            if not isinstance(attr, dict):
                continue
            name = attr.get("name")
            value = attr.get("value")
            if name and value:
                specs_flat.append(f"{name}: {value}")
    return " | ".join(specs_flat)


def _build_category_path(breadcrumbs_data):
    """Join the inner breadcrumb names with " > ", or return None without breadcrumbs."""
    if not (isinstance(breadcrumbs_data, list) and breadcrumbs_data):
        return None
    # The first and last breadcrumbs are dropped on purpose.
    # NOTE(review): presumably they are the root category and the product
    # itself -- confirm against a real API response.
    return " > ".join(
        str(crumb.get("name") or "")
        for crumb in breadcrumbs_data[1:-1]
        if isinstance(crumb, dict)
    )


def _first_author_name(authors):
    """Return the name of the first author entry, or None when unavailable."""
    if isinstance(authors, list) and authors and isinstance(authors[0], dict):
        return authors[0].get("name")
    return None


def crawl_book_info(product_id, timeout=10):
    """Fetch the details of one book from the Tiki product API.

    Args:
        product_id: Product ID interpolated into the API URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A dict with the keys ``id``, ``name``, ``price``, ``original_price``,
        ``rating_average``, ``review_count``, ``author``, ``category_path``,
        ``quantity_sold``, ``inventory_status`` and ``specifications``, or
        None when the request fails, the status code is not 200, or the body
        is not a JSON object. Failures are reported with ``print`` and never
        raised, so a caller looping over many IDs keeps going.
    """
    url = f"https://tiki.vn/api/v2/products/{product_id}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts must not abort the caller's loop.
        print(f"Lỗi khi lấy sản phẩm ID {product_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Lỗi khi lấy sản phẩm ID {product_id}")
        return None

    try:
        data = response.json()
    except ValueError as exc:
        # A 200 response can still carry a non-JSON body.
        print(f"Lỗi khi lấy sản phẩm ID {product_id}: {exc}")
        return None

    if not isinstance(data, dict):
        print(f"Lỗi khi lấy sản phẩm ID {product_id}")
        return None

    # "quantity_sold" may be present but null, so check the type before
    # reading the nested "value".
    quantity_sold = data.get("quantity_sold")

    # Build the record describing the book.
    book = {
        "id": product_id,
        "name": data.get("name"),
        "price": data.get("price"),
        "original_price": data.get("original_price"),
        "rating_average": data.get("rating_average"),
        "review_count": data.get("review_count"),
        "author": _first_author_name(data.get("authors")),
        "category_path": _build_category_path(data.get("breadcrumbs", [])),
        "quantity_sold": quantity_sold.get("value") if isinstance(quantity_sold, dict) else None,
        "inventory_status": data.get("inventory_status"),
        "specifications": _flatten_specifications(data.get("specifications"))
    }

    return book




In [22]:
def read_product_ids(filename="../data_export/product_ids.txt"):
    """Read product IDs from a text file, one ID per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ID strings in file order. Surrounding whitespace is
        stripped and blank lines are skipped.
    """
    # "utf-8-sig" drops a leading byte-order mark if the file has one and
    # reads BOM-less UTF-8 unchanged. With plain "utf-8" the BOM would stay
    # attached to the first ID, because str.strip() does not remove it.
    with open(filename, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        ids = [product_id for product_id in stripped_lines if product_id]
    return ids


In [23]:
def save_books_to_csv(books, filename="../data_export/books.csv"):
    """Write the collected book records to a CSV file.

    Args:
        books: List of dicts; the keys of the first dict become the CSV
            header and column order.
        filename: Destination path, overwritten if it already exists.

    Returns:
        None. When ``books`` is empty nothing is written and a notice is
        printed; otherwise a confirmation is printed after writing.
    """
    if books:
        column_names = books[0].keys()
        # UTF-8 with a BOM, and newline="" so the csv module controls line endings.
        with open(filename, "w", newline="", encoding="utf-8-sig") as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
            csv_writer.writeheader()
            csv_writer.writerows(books)

        print(f"✅ Đã lưu {len(books)} sách vào file '{filename}'")
    else:
        print("Không có dữ liệu để lưu!")


In [None]:
# ======== RUN THE FULL CRAWL =========

# Reads every ID in the file; no limit is applied here.
product_ids = read_product_ids()
books = []

try:
    for i, pid in enumerate(product_ids):
        print(f"📘 [{i+1}/{len(product_ids)}] Crawl ID: {pid}")
        book = crawl_book_info(pid)
        if book:
            books.append(book)
        time.sleep(0.5)  # throttle requests to avoid being blocked for sending too fast
finally:
    # Save whatever was collected even if the loop is interrupted or raises,
    # so a long crawl does not lose its partial results.
    save_books_to_csv(books)


📘 [1/2000] Crawl ID: 277728224
📘 [2/2000] Crawl ID: 277412034
📘 [3/2000] Crawl ID: 277409853
📘 [4/2000] Crawl ID: 277381902
📘 [5/2000] Crawl ID: 276960030
📘 [6/2000] Crawl ID: 276948798
📘 [7/2000] Crawl ID: 276922095
📘 [8/2000] Crawl ID: 276823777
📘 [9/2000] Crawl ID: 276346703
📘 [10/2000] Crawl ID: 276159943
📘 [11/2000] Crawl ID: 275702538
📘 [12/2000] Crawl ID: 275406600
📘 [13/2000] Crawl ID: 275243138
📘 [14/2000] Crawl ID: 274363543
📘 [15/2000] Crawl ID: 273819808
📘 [16/2000] Crawl ID: 272449818
📘 [17/2000] Crawl ID: 272000024
📘 [18/2000] Crawl ID: 271380890
📘 [19/2000] Crawl ID: 263070154
📘 [20/2000] Crawl ID: 262590428
📘 [21/2000] Crawl ID: 261982682
📘 [22/2000] Crawl ID: 229714486
📘 [23/2000] Crawl ID: 212597849
📘 [24/2000] Crawl ID: 209389165
📘 [25/2000] Crawl ID: 207256575
📘 [26/2000] Crawl ID: 202879294
📘 [27/2000] Crawl ID: 199582064
📘 [28/2000] Crawl ID: 195291930
📘 [29/2000] Crawl ID: 194960730
📘 [30/2000] Crawl ID: 193614514
📘 [31/2000] Crawl ID: 170708233
📘 [32/2000] Crawl