# The customer asked for two more things.
1) The images are too small; they need to be 500x500 px.
2) The catalog needs to be in Word format.

The second of these is relatively easy - just ask ChatGPT to produce the output in .docx format. There is a library for that.

The first requires some rethinking. Currently we are scraping Google's image search page. This page does not contain the images themselves but rather thumbnails of them. The first thought is to access the real image by following the link back to the original image. Modern web pages are heavily saturated with JavaScript, making this tricky. Another way to approach this is to use a search that does find the real images. Google, Microsoft and others provide such a tool, but you need to sign up for their API and pay a fee.
I chose to go with SERP API.

A SERP API, or Search Engine Results Page API, is a tool that allows developers to programmatically access and retrieve data from search engine results pages (SERPs). Essentially, it's an automated way to scrape and parse the information displayed on a search engine results page, like Google or Bing, for a specific query. This data can include things like organic search results, ads, knowledge graphs, and other SERP features. 

Sign up for a free SerpAPI key at https://serpapi.com/
Note: The Serp API has a limited free offering so make sure not to make unnecessary calls.

In [None]:
!pip install python-docx requests beautifulsoup4 fpdf pillow pandas reportlab google-api-python-client selenium google-search-results

In [None]:
import os
import csv
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
import urllib.parse
import pandas as pd
from urllib.parse import urlparse
from serpapi import GoogleSearch
from requests.exceptions import SSLError

# Constants
DATA_FILE = "data/materials.csv"  # CSV read by the main script
IMAGE_DIR = "data/images/part5"  # root folder for downloaded images
BRAND_IMAGE_DIR = os.path.join(IMAGE_DIR, "brands")
FABRIC_IMAGE_DIR = os.path.join(IMAGE_DIR, "fabrics")
OUTPUT_DOCX = "data/material_catalog_part5.docx"
# Secrets do not belong in source control: the key is taken from the
# SERPAPI_KEY environment variable. The placeholder is only a fallback so the
# module still loads; SerpAPI requests will fail until a real key is supplied.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "YOUR_SERPAPI_KEY")

# Create directories (no error if they already exist)
os.makedirs(BRAND_IMAGE_DIR, exist_ok=True)
os.makedirs(FABRIC_IMAGE_DIR, exist_ok=True)

def is_valid_image_url(url):
    """Return True when *url* is a usable image URL from an allowed host.

    Args:
        url: Candidate image URL. Non-string or empty values (for example a
            search result that has no image link) are rejected.

    Returns:
        False for missing URLs and for any URL whose network location
        contains ``instagram.com``; True otherwise.
    """
    # urlparse(None) yields a bytes result, and the str-in-bytes test below
    # would then raise TypeError, so reject missing values up front.
    if not isinstance(url, str) or not url:
        return False

    domain = urlparse(url).netloc.lower()
    if "instagram.com" in domain:
        print(f"⛔ Skipping Instagram URL: {url}")
        return False
    return True

def fetch_image_url_google_scrape(query):
    """Scrape the Google Images result page for *query*.

    Args:
        query: Search text; it is URL-quoted before being placed in the
            request URL.

    Returns:
        The ``src`` of the first ``<img>`` tag that starts with ``http`` and
        passes ``is_valid_image_url``, or None when nothing qualifies or the
        request/parsing fails (the failure is printed, not raised).
    """
    encoded_query = urllib.parse.quote(query)
    search_url = f"https://www.google.com/search?tbm=isch&q={encoded_query}"
    request_headers = {"User-Agent": "Mozilla/5.0"}

    try:
        page = requests.get(search_url, headers=request_headers, timeout=10)
        parsed = BeautifulSoup(page.text, "html.parser")

        for tag in parsed.find_all("img"):
            src = tag.get("src")
            # Skip tags without a src and non-http sources.
            if not src or not src.startswith("http"):
                continue
            if is_valid_image_url(src):
                return src
    except Exception as e:
        print(f"❌ Failed to fetch image for {query}: {e}")
    return None

def fetch_image_url_serpapi(query):
    """Look up *query* through SerpAPI's Google Images search.

    Each result's ``"original"`` URL is checked with a HEAD request; the first
    one answering 200 with an image Content-Type is returned.

    Args:
        query: Search text sent to SerpAPI.

    Returns:
        The first verified image URL, or None when no result qualifies or the
        SerpAPI call itself fails (failures are printed, not raised).
    """
    try:
        search = GoogleSearch({
            "q": query,
            "tbm": "isch",
            "api_key": SERPAPI_KEY
        })
        results = search.get_dict()
        for img in results.get("images_results", []):
            url = img.get("original")
            # A result without an "original" link must be skipped here:
            # passing None on would raise and, via the outer handler, abandon
            # every remaining candidate for this query.
            if not url or not is_valid_image_url(url):
                continue
            try:
                # HEAD keeps the check cheap: only headers are fetched.
                head = requests.head(url, timeout=5, allow_redirects=True)
                if head.status_code == 200 and "image" in head.headers.get("Content-Type", ""):
                    return url
            except SSLError as ssl_err:
                print(f"⛔ SSL error when checking: {url} — {ssl_err}")
            except Exception as e:
                # Best effort: a bad candidate should not stop the search.
                print(f"⚠️ Error checking image URL {url}: {e}")
    except Exception as e:
        print(f"❌ SerpAPI failed for {query}: {e}")
    return None

def download_image(url, save_path):
    """Fetch *url* and store it as an RGB image at *save_path*.

    When the response is not an image, its body is written next to the target
    as ``save_path + ".html"`` so the unexpected content can be inspected.

    Args:
        url: Address of the image to download.
        save_path: Destination file path for the converted image.

    Returns:
        *save_path* on success; None when the response is not an image or any
        error occurs (errors are printed, not raised).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, headers=request_headers, timeout=10)
        mime = resp.headers.get("Content-Type", "")

        if "image" in mime:
            picture = Image.open(BytesIO(resp.content))
            picture.convert("RGB").save(save_path)
            return save_path

        # Not an image: keep the payload for debugging and report failure.
        print(f"⚠️ Not an image (Content-Type={mime}): {url}")
        with open(save_path + ".html", "wb") as dump:
            dump.write(resp.content)
    except SSLError as ssl_err:
        print(f"⛔ SSL error for {url}: {ssl_err}")
    except Exception as e:
        print(f"⚠️ Failed to download image from {url}: {e}")
    return None

def create_docx_from_data(df):
    """Build the Word catalog from *df* and save it to ``OUTPUT_DOCX``.

    Every DataFrame row becomes one row of a two-column table: the left cell
    holds the fabric image (or "No Image"), the right cell holds the brand
    image, when one exists on disk, followed by the textual details.

    Args:
        df: DataFrame providing the "Brand", "Product Name", "Product Code",
            "Color" and "Notes" columns.
    """

    def _display(value):
        # pandas represents empty CSV cells as NaN; print those as blank text
        # instead of the literal string "nan".
        return "" if pd.isna(value) else str(value)

    doc = Document()
    doc.add_heading("Fabric Catalog", 0)

    table = doc.add_table(rows=0, cols=2)
    table.autofit = True
    table.style = "Table Grid"

    # Count rows by position so the logic does not depend on the DataFrame's
    # index labels being consecutive integers.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        brand = row["Brand"]
        product_name = row["Product Name"]
        product_code = row["Product Code"]
        color = row["Color"]
        notes = row["Notes"]

        # Paths use the raw cell values so they match the names used when the
        # images were downloaded.
        fabric_img_path = os.path.join(FABRIC_IMAGE_DIR, f"{brand}_{product_name}.jpg".replace(" ", "_"))
        brand_img_path = os.path.join(BRAND_IMAGE_DIR, f"{brand}.jpg".replace(" ", "_"))

        row_cells = table.add_row().cells

        if os.path.exists(fabric_img_path):
            paragraph = row_cells[0].paragraphs[0]
            run = paragraph.add_run()
            run.add_picture(fabric_img_path, width=Inches(1.5))
        else:
            row_cells[0].text = "No Image"

        if os.path.exists(brand_img_path):
            run = row_cells[1].paragraphs[0].add_run()
            run.add_picture(brand_img_path, width=Inches(1.0))

        row_cells[1].add_paragraph(f"Brand: {_display(brand)}")
        row_cells[1].add_paragraph(f"Product: {_display(product_name)}")
        row_cells[1].add_paragraph(f"Code: {_display(product_code)}")
        row_cells[1].add_paragraph(f"Color: {_display(color)}")
        row_cells[1].add_paragraph(f"Notes: {_display(notes)}")

        # NOTE(review): add_paragraph appends to the document body, so these
        # spacers presumably end up after the table rather than between its
        # rows - confirm whether a per-page break was intended.
        if position % 8 == 0:
            doc.add_paragraph("\n")

    doc.save(OUTPUT_DOCX)
    print(f"✅ DOCX saved to: {OUTPUT_DOCX}")

# Main Script
# Downloads any missing brand logos and fabric images for every row of the
# CSV, then builds the Word catalog.
def _is_known(value):
    """Return True when a CSV cell holds a real value.

    Empty cells (NaN) and the literal text "unknown" (any case) count as
    missing. Values are compared via ``str`` because pandas may parse a
    column such as the product code as numbers.
    """
    return not pd.isna(value) and str(value).lower() != "unknown"


df = pd.read_csv(DATA_FILE)

for _, row in df.iterrows():
    brand = row["Brand"]
    product_name = row["Product Name"]
    product_code = row["Product Code"]
    color = row["Color"]

    # File names are built the same way create_docx_from_data() looks them up.
    brand_img_path = os.path.join(BRAND_IMAGE_DIR, f"{brand}.jpg".replace(" ", "_"))
    if not os.path.exists(brand_img_path):
        brand_url = fetch_image_url_google_scrape(f"{brand} logo")
        if brand_url:
            download_image(brand_url, brand_img_path)

    fabric_img_path = os.path.join(FABRIC_IMAGE_DIR, f"{brand}_{product_name}.jpg".replace(" ", "_"))
    # Only query SerpAPI when the image is not cached: the free tier is limited.
    if not os.path.exists(fabric_img_path):
        # str() guards against numeric cells, which would break " ".join().
        query_parts = [str(brand), str(product_name)]
        query_parts.extend(
            str(value) for value in (product_code, color) if _is_known(value)
        )
        fabric_query = " ".join(query_parts)

        fabric_url = fetch_image_url_serpapi(fabric_query)
        if fabric_url:
            print(f"📥 Downloading image for: {fabric_query}")
            print(f"🔗 Image URL: {fabric_url}")
            download_image(fabric_url, fabric_img_path)

create_docx_from_data(df)