In [6]:
import os
import json
from bs4 import BeautifulSoup
import google.generativeai as genai
from markdownify import markdownify as md

In [8]:

# 1️⃣ Configure Gemini API
api_token = os.getenv("GEMINI_API_TOKEN")
if not api_token:
    raise ValueError("GEMINI_API_TOKEN environment variable not set.")
genai.configure(api_key=api_token)


In [5]:
# 2️⃣ Load scraped markdown
try:
    with open("scraped_output.md", "r", encoding="utf-8") as f:
        markdown_data = f.read()
except FileNotFoundError:
    print("Error: 'scraped_output.md' file not found. Please run crawler.py first.")
    exit()

# 3️⃣ Split into product blocks (separated by blank lines)
products = [p.strip() for p in markdown_data.strip().split("\n\n") if p.strip()]

for product in products:
  print(product)

<div class="Ms6aG"><div class="qmXQo"><div class="ICdUp"><div class="_95X4G"><a href="//www.daraz.com.np/products/infinix-gt-30-pro-5g-8256gb-678-inch-ltps-amoled-144hz-16002000-nits-peak-brightness-mediatek-dimensity-8350-ultimat-5500mah-battery-45w-wired-30w-wireless-charging-i420086621.html"><div class="picture-wrapper jBwCF"><img alt="Infinix GT 30 Pro 5G (8/256GB) | 6.78-inch LTPS AMOLED, 144Hz, 1,600–2,000 nits peak brightness| MediaTek Dimensity 8350 Ultimat | 5500mAh Battery | 45W wired, 30W wireless charging" height="200" src="https://img.drz.lazcdn.com/static/np/p/4526baf66c793880543835aa7c02b7ab.png_200x200q80.avif" style="object-fit: fill;" type="product" width="200"/></div></a></div></div><div class="buTCk"><div class="ajfs+"></div><div class="RfADt"><a href="//www.daraz.com.np/products/infinix-gt-30-pro-5g-8256gb-678-inch-ltps-amoled-144hz-16002000-nits-peak-brightness-mediatek-dimensity-8350-ultimat-5500mah-battery-45w-wired-30w-wireless-charging-i420086621.html" title="

In [12]:
import json

# Initialize model
model = genai.GenerativeModel("gemini-2.5-flash-lite")
# The output_data list is no longer needed, as we write immediately

# Use 'a' (append) mode for the file, and open it *before* the loop
# We'll use a .jsonl extension to indicate the format
OUTPUT_FILENAME = "products.jsonl" 

print(f"Starting extraction. Writing to {OUTPUT_FILENAME}...")

# Clear the file or ensure it's a new file (optional, depends on need)
# This step is optional: if you want to clear previous runs, uncomment the line below:
# with open(OUTPUT_FILENAME, "w", encoding="utf-8") as f: pass 


for product_md in products:
    markdown_output = md(product_md)
    # Prompt Gemini only for the missing fields
    prompt = f"""
    Extract the following fields from the markdown content and return JSON only:
    - url
    - photo
    - title
    - price
    - units_sold
    - rating
    - location

    If a field is not found, set it to null. Do not add extra text or surrounding markdown/code blocks.

    Markdown content:
    ```html
    {markdown_output}
    ```
    """

    try:
        response = model.generate_content(
            prompt,
            generation_config={"response_mime_type": "application/json"}
        )

        # 1. Load the JSON response
        gemini_data = json.loads(response.text)

        # 2. Append the JSON object as a new line to the file
        with open(OUTPUT_FILENAME, "a", encoding="utf-8") as f:
            # json.dumps converts the Python dict to a JSON string
            # We then write the string followed by a newline character
            f.write(json.dumps(gemini_data, ensure_ascii=False) + "\n")

    except Exception as e:
        # Better error logging to see which product failed
        print(f"⚠️ Failed to process product. Content start: '{markdown_output[:50]}...'. Error: {e}")

print(f"✅ Extraction complete. Saved to {OUTPUT_FILENAME}")

Starting extraction. Writing to products.jsonl...
⚠️ Failed to process product. Content start: '[![Blackview N2000 FLIP 4G MOBILE PHONE | RED | BL...'. Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15
Please retry in 18.243953794s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 18
}
]
⚠️ Failed to process product. Conte