In [None]:
!apt-get update
!apt-get install -y git
!pip install openai playwright pandas tqdm
!pip install --upgrade openai
!playwright install

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (18.160.213.72)] [Connected to r2u.stat.i                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                                                    Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
                                                                                                    Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.

In [None]:
!git clone https://github.com/uiverse-io/galaxy.git /content/GALAXY

Cloning into '/content/GALAXY'...
remote: Enumerating objects: 15872, done.
remote: Counting objects: 100% (219/219), done.
remote: Compressing objects: 100% (109/109), done.
remote: Total 15872 (delta 178), reused 110 (delta 110), pack-reused 15653 (from 3)
Receiving objects: 100% (15872/15872), 4.86 MiB | 8.91 MiB/s, done.
Resolving deltas: 100% (10378/10378), done.


In [None]:
import asyncio
import base64
import csv
import json
import os
import re
import shutil
from pathlib import Path

import nest_asyncio
from openai import OpenAI
from playwright.async_api import async_playwright

# 🔹 Patch asyncio so event loops can be nested (needed inside Jupyter or Colab)
nest_asyncio.apply()

# Read the API key from the environment instead of keeping it in the notebook.
# When OPENAI_API_KEY is unset this falls back to the previous empty literal,
# so the client is still constructed and requests fail at call time as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# 🔹 Load Prompt Template
def load_prompt(category, file_name, code, template_path="prompt.md"):
    """Fill the prompt template with the details of one component.

    Args:
        category: Text substituted for every ``{{category}}`` placeholder.
        file_name: Text substituted for every ``{{file_name}}`` placeholder.
        code: Text substituted for every ``{{code}}`` placeholder.
        template_path: Path of the template file. Defaults to ``prompt.md``
            resolved against the current working directory.

    Returns:
        The template text with the three placeholders replaced.

    Raises:
        OSError: If the template file cannot be opened.
    """
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()

    values = {"category": category, "file_name": file_name, "code": code}

    # Substitute in a single pass so placeholder-like text inside an inserted
    # value (e.g. a category literally named "{{code}}") is left untouched.
    # A callable replacement keeps backslashes in the values literal.
    return re.sub(
        r"\{\{(category|file_name|code)\}\}",
        lambda match: values[match.group(1)],
        template,
    )

# 🔹 Generate AI Filename & Description
def get_ai_filename_and_description(category, file_name, code):
    """Ask the chat model for a file name and a description of one component.

    The reply is scanned line by line for ``Filename:`` and ``Description:``
    entries (case-insensitive; markdown decoration such as ``**Filename:**``
    or backticks around the value is tolerated). If an entry appears more
    than once, the last occurrence wins.

    Args:
        category: Component category; also used to build the fallback name.
        file_name: Original file name of the component.
        code: Source text of the component.

    Returns:
        A ``(filename, description)`` tuple. ``filename`` is a bare file name
        (no directory part) ending in ``.html``, or
        ``f"{category}_unknown.html"`` when the reply has no usable name.
        ``description`` is ``""`` when the reply has no description line.
        If the request or parsing raises, the tuple is
        ``(f"{category}_unknown.html", "Description not available.")``.

    Raises:
        OSError: Propagated from ``load_prompt`` when the template is missing.
    """
    fallback_name = f"{category}_unknown.html"
    prompt = load_prompt(category, file_name, code)

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert UI component naming and documentation assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        ai_output = response.choices[0].message.content.strip()

        filename, description = "", ""
        for raw_line in ai_output.splitlines():
            # Drop leading markdown decoration (bullets, headings, bold, code).
            line = raw_line.strip().lstrip("*#->` ")
            key, sep, value = line.partition(":")
            if not sep:
                continue
            key = key.strip(" *_`").lower()

            if key == "filename":
                value = value.strip().strip("*`\"' ")
                # Keep only the final path component: the caller joins this
                # name onto a directory, so "../" or absolute paths from the
                # model must not be able to redirect the rename.
                filename = os.path.basename(value.replace("\\", "/"))
            elif key == "description":
                description = value.strip().lstrip("* ").strip()

        # A bare ".html" has no stem and is treated as unusable as well.
        if not filename.endswith(".html") or filename == ".html":
            filename = fallback_name

        return filename, description

    except Exception as e:
        # Best effort: one failed request must not stop a whole batch.
        print(f"[AI Error: {file_name}] {e}")
        return fallback_name, "Description not available."

# 🔹 Crop and Capture Component Screenshot
async def capture_cropped_screenshot(file_path):
    """Render a local HTML file in Chromium via Playwright and save a PNG.

    The screenshot is clipped to the bounding box of the ``<body>`` element.
    When that box is unavailable or has no area, a full-page screenshot is
    taken instead so a file is still produced.

    Args:
        file_path: Path to the HTML file to render.

    Returns:
        Path of the written PNG: ``file_path`` with its extension replaced
        by ``.png``.
    """
    # Swap only the extension; str.replace would also rewrite ".html"
    # occurring earlier in the path (e.g. in a directory name).
    screenshot_path = os.path.splitext(file_path)[0] + ".png"
    # as_uri() needs an absolute path and percent-encodes spaces and other
    # characters that are not valid in a raw file:// URL.
    page_url = Path(file_path).resolve().as_uri()

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(page_url)
            # Fixed 1 s pause before measuring — presumably lets styles and
            # animations settle; TODO confirm the required delay.
            await page.wait_for_timeout(1000)

            element = await page.query_selector("body")
            bounding_box = await element.bounding_box() if element else None

            has_area = (
                bounding_box is not None
                and bounding_box["width"] > 0
                and bounding_box["height"] > 0
            )
            if has_area:
                await page.screenshot(path=screenshot_path, clip=bounding_box)
            else:
                # An empty clip rectangle is rejected by Playwright.
                await page.screenshot(path=screenshot_path, full_page=True)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()

    return screenshot_path

# 🔹 Main Processing Function
def _unique_target(directory, desired_name, current_path):
    """Return a path for ``desired_name`` in ``directory`` that overwrites no other file."""
    stem, ext = os.path.splitext(desired_name)
    candidate = os.path.join(directory, desired_name)
    suffix = 2
    # An existing candidate is acceptable only when it is the file being renamed.
    while os.path.exists(candidate) and not os.path.samefile(candidate, current_path):
        candidate = os.path.join(directory, f"{stem}_{suffix}{ext}")
        suffix += 1
    return candidate


async def process_files(
    repo_path="/content/GALAXY",
    output_csv="/content/ui_components_dataset.csv",
    output_jsonl="/content/ui_components_dataset.jsonl",
):
    """Rename, screenshot and catalogue every HTML component in the repository.

    Each sub-directory of ``repo_path`` is treated as a category. For every
    ``.html`` file directly inside it (in sorted order) the function reads the
    source, asks ``get_ai_filename_and_description`` for a name and
    description, renames the file in place, captures a screenshot with
    ``capture_cropped_screenshot`` and records one dataset row. All rows are
    then written to ``output_csv`` and ``output_jsonl``.

    Args:
        repo_path: Root directory containing one folder per category.
        output_csv: Destination of the CSV dataset.
        output_jsonl: Destination of the JSON Lines dataset.

    Returns:
        None. Files are renamed on disk and the two dataset files are written.

    Notes:
        * When the AI name is the ``<category>_unknown.html`` fallback it is
          replaced by ``<category>_NN.html`` using the per-category counter.
        * When the chosen name already belongs to another file in the folder,
          ``_2``, ``_3``, ... is appended so no component is overwritten.
        * A failed screenshot is reported and stored as an empty path rather
          than aborting the run and losing the rows collected so far.
    """
    records = []

    for category in sorted(os.listdir(repo_path)):
        category_path = os.path.join(repo_path, category)
        if not os.path.isdir(category_path):
            continue

        processed = 0
        for file in sorted(os.listdir(category_path)):
            if not file.endswith(".html"):
                continue

            file_path = os.path.join(category_path, file)
            with open(file_path, "r", encoding="utf-8") as f:
                code = f.read()

            ai_filename, ai_description = get_ai_filename_and_description(category, file, code)

            # Numbered fallback when the AI produced no usable name.
            processed += 1
            if ai_filename == f"{category}_unknown.html":
                ai_filename = f"{category}_{processed:02}.html"

            # Never rename onto a different existing file.
            new_file_path = _unique_target(category_path, ai_filename, file_path)
            ai_filename = os.path.basename(new_file_path)

            if new_file_path != file_path:
                shutil.move(file_path, new_file_path)

            try:
                screenshot_path = await capture_cropped_screenshot(new_file_path)
            except Exception as e:
                # Best effort, mirroring the AI-call error handling.
                print(f"[Screenshot Error: {ai_filename}] {e}")
                screenshot_path = ""

            # Key order here is also the CSV column order.
            records.append({
                "component_category": category,
                "code": code,
                "original_file_name": file,
                "ai_generated_file_name": ai_filename,
                "ai_generated_description": ai_description,
                "path": new_file_path,
                "screenshot_image": screenshot_path
            })

    # Write CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            "Component Category",
            "Code",
            "File Name",
            "AI Generated Filename",
            "AI Generated Description",
            "Path",
            "Screenshot/Image"
        ])
        writer.writerows(list(record.values()) for record in records)

    # Write JSONL (one JSON object per line)
    with open(output_jsonl, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"✅ CSV saved: {output_csv}")
    print(f"✅ JSONL saved: {output_jsonl}")

# 🔹 Run the full pipeline from this cell (top-level `await`, as supported in notebook cells)
await process_files()

