In [5]:
! pip install requests beautifulsoup4




In [1]:
import os
import time
import json
import requests
from pathlib import Path
from duckduckgo_search import DDGS
from PIL import Image
from bs4 import BeautifulSoup

# Category label -> Newegg search URL. Each URL is the Newegg search endpoint
# followed by the category's search term; insertion order is preserved.
computer_parts = {
    label: f"https://www.newegg.com/p/pl?d={term}"
    for label, term in (
        ("CPU", "CPU"),
        ("GPU", "Graphics+Card"),
        ("Motherboard", "Motherboard"),
        ("RAM", "Memory"),
        ("SSD", "Solid+State+Drive"),
        ("HDD", "Hard+Drive"),
        ("Cooling System", "Cooling+Fan"),
        ("Case", "PC+Case"),
    )
}

# Root folder for the per-category image folders and the JSON dump;
# created up front if it does not exist yet.
base_path = Path("computer-parts")
base_path.mkdir(exist_ok=True)

def download_images(dest, query, max_results=20):
    """Search DuckDuckGo for images and download the hits into ``dest``.

    Args:
        dest: Target directory (a ``pathlib.Path``); it is created, together
            with any missing parents, when absent.
        query: Search phrase handed to the DuckDuckGo image search.
        max_results: Upper bound on the number of search hits requested.

    Returns:
        The saved file paths as strings, in search-result order. Files are
        named ``<result index>.jpg``; indexes of results that were skipped
        or failed to download are simply missing.

    Errors raised by the search itself propagate to the caller; a failure
    on an individual image is reported and that image is skipped.
    """
    dest.mkdir(exist_ok=True, parents=True)
    with DDGS() as ddgs:
        results = list(ddgs.images(query, max_results=max_results))

    image_paths = []
    for i, result in enumerate(results):
        img_url = result.get("image", "")
        if not img_url:
            continue
        try:
            response = requests.get(img_url, timeout=10)
            # Without this check a 403/404 error page would be written to
            # disk and reported as a successfully downloaded image.
            response.raise_for_status()
            img_data = response.content
            if not img_data:
                raise ValueError("empty response body")
            img_path = dest / f"{i}.jpg"
            img_path.write_bytes(img_data)
        except Exception as e:
            # Deliberate per-image boundary: one bad URL must not abort the
            # whole batch, so report it and move on.
            print(f"Failed to download {img_url}: {e}")
        else:
            print(f"Downloaded: {img_url}")
            image_paths.append(str(img_path))
    return image_paths

def resize_images(folder, max_size=400):
    """Shrink every ``*.jpg`` file in ``folder`` to fit ``max_size`` pixels.

    Each image is scaled in place with ``Image.thumbnail`` so that neither
    side exceeds ``max_size``, then written back to the same path.

    Args:
        folder: Directory (a ``pathlib.Path``) whose ``*.jpg`` files are
            processed; sub-directories are not searched.
        max_size: Maximum width and height, in pixels, of the result.

    Returns:
        None. Failures on individual files are printed and skipped.
    """
    for img_file in folder.glob("*.jpg"):
        try:
            # The context manager releases the file handle even when the
            # image turns out to be corrupt or unsupported.
            with Image.open(img_file) as img:
                img.thumbnail((max_size, max_size))
                # The target path ends in .jpg, so Pillow writes JPEG, but
                # the file's real content may be a PNG/GIF with an alpha or
                # palette mode that JPEG cannot store. Flatten it first.
                if img.mode not in ("RGB", "L", "CMYK"):
                    img = img.convert("RGB")
                img.save(img_file)
            print(f"Resized: {img_file}")
        except Exception as e:
            # Deliberate per-file boundary: keep going with the other files.
            print(f"Failed to resize {img_file}: {e}")

def _parse_item(item):
    """Build one product record from a Newegg ``div.item-cell`` element."""
    brand_tag = item.select_one(".item-brand img")
    # .get() keeps the product (with an "Unknown" brand) when the logo has
    # no title attribute, instead of raising KeyError and losing the item.
    brand = brand_tag.get("title", "Unknown") if brand_tag else "Unknown"

    model_tag = item.select_one(".item-title")
    model = model_tag.text.strip() if model_tag else "Unknown"

    price_tag = item.select_one(".price-current")
    price = price_tag.text.strip() if price_tag else "Unknown"

    # Generic blurb generated from the model and brand; not scraped.
    description = f"{model} from {brand} is a high-quality component built for optimal performance, efficiency, and durability. Ideal for gaming, professional workstations, and high-performance setups."

    # Newegg uses 'ul.item-features' for the bullet list under a product.
    specifications = [
        spec.text.strip() for spec in item.select("ul.item-features li")
    ]

    image_tag = item.select_one(".item-img img")
    image_url = image_tag.get("src", "") if image_tag else ""

    return {
        "brand": brand,
        "model": model,
        "price": price,
        "description": description,
        # Kept as-is for existing consumers: a list when features were
        # found, otherwise a placeholder string.
        "specifications": specifications if specifications else "Specifications not available.",
        "image": image_url
    }


def scrape_computer_parts():
    """Scrape product listings for every category in ``computer_parts``.

    For each category the search page is fetched and at most the first 20
    ``div.item-cell`` elements are parsed into product records.

    Returns:
        A dict mapping category name to a list of product dicts with the
        keys ``brand``, ``model``, ``price``, ``description``,
        ``specifications`` and ``image``. A category whose page could not be
        fetched (network error or HTTP error status) is absent from the
        dict; a product that fails to parse is skipped.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    parts_data = {}
    for part, search_url in computer_parts.items():
        print(f"Scraping data for {part}...")
        try:
            response = requests.get(search_url, headers=headers, timeout=10)
            # Report a blocked/error response as a failure instead of
            # silently parsing the error page into an empty category.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            items = soup.select("div.item-cell")[:20]  # Newegg uses 'div.item-cell' for product listings

            parts_data[part] = []
            for item in items:
                try:
                    parts_data[part].append(_parse_item(item))
                except Exception as e:
                    # Per-item boundary: one malformed listing must not
                    # discard the rest of the category.
                    print(f"Error extracting data for {part}: {e}")
        except Exception as e:
            print(f"Failed to scrape {part}: {e}")

    return parts_data

# Step 1: collect product listings for every category.
computer_parts_data = scrape_computer_parts()

# Step 2: per category, fetch generic images, shrink them, and attach the
# local file paths to the scraped products.
for part in computer_parts:
    part_path = base_path / part
    image_paths = download_images(part_path, query=f"{part} Computer Part", max_results=20)
    resize_images(part_path, max_size=400)

    # Pair products with downloaded images position by position. zip stops
    # at the shorter sequence, so surplus products keep their scraped image
    # URL, and a category missing from the scrape is skipped entirely.
    for item, saved_path in zip(computer_parts_data.get(part, []), image_paths):
        item["image"] = saved_path

    # Pause between categories before issuing the next image search.
    time.sleep(3)

# Step 3: persist the merged data next to the image folders.
json_path = base_path / "computer_parts.json"
with json_path.open("w") as out_file:
    json.dump(computer_parts_data, out_file, indent=4)
print(f"Saved scraped parts data to {json_path}")

print("All images downloaded, resized, and data scraped!")

Scraping data for CPU...
Scraping data for GPU...
Scraping data for Motherboard...
Scraping data for RAM...
Scraping data for SSD...
Scraping data for HDD...
Scraping data for Cooling System...
Scraping data for Case...
Downloaded: https://www.techspot.com/articles-info/2000/images/2020-04-06-image-9.jpg
Downloaded: https://thumbs.dreamstime.com/z/detail-central-processor-unit-below-cpu-computer-part-69118567.jpg
Downloaded: https://i.insider.com/60402d8eb46d720018b04c1f?width=1200&format=jpeg
Downloaded: https://i.ytimg.com/vi/YRKYW98r5eo/maxresdefault.jpg
Downloaded: https://i1.wp.com/equilibriumest.com/wp-content/uploads/2017/07/beginners-guide-to-computer-hardware-computer-inside.jpg?ssl=1
Downloaded: http://finalproject9c.weebly.com/uploads/5/0/7/1/50711767/244027473.png
Downloaded: https://thumbs.dreamstime.com/z/internal-parts-pc-system-unit-background-look-inside-central-processing-computer-119635644.jpg
Downloaded: https://d1rwhvwstyk9gu.cloudfront.net/2019/11/CPU.jpg
Download