## First, we crop the part of the image we need (based on the product description) with Grounding DINO and BART

In [None]:
import requests
import torch
from PIL import Image
import pandas as pd
import os
import torchvision.transforms as transforms
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import matplotlib.pyplot as plt
import re
from transformers import pipeline



# Turn a product title into a string that is safe to use as a file name
def clean_filename(title):
    """Drop characters that are reserved in file names and swap spaces for underscores."""
    without_reserved = re.sub(r'[<>:"/\\|?*]', '', title)
    # Splitting on a single space and re-joining keeps runs of spaces as runs
    # of underscores, exactly like a plain character replacement would.
    return "_".join(without_reserved.split(" "))
# Grounding DINO checkpoint used for text-prompted object detection
model_id = "IDEA-Research/grounding-dino-base"

# Probe for CUDA, then pin everything to the CPU anyway (deliberate for now)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device = "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

# Product table; it must provide the image_url, title and description columns
# that the processing loop reads
csv_file = "hedomak_products.csv"
df = pd.read_csv(csv_file)

# Folder that receives the cropped (or untouched) product images
output_dir = "cropped_images"
os.makedirs(output_dir, exist_ok=True)

# Zero-shot text classifier (BART-MNLI); device=-1 keeps the pipeline on the CPU
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework="pt",
    device=-1,
)

# Category name -> descriptive sentence handed to the classifier as a candidate label
contextualized_labels = {
    "topwear": "Clothing worn on the upper body, such as shirts, jackets, sweaters, hoodies.",
    "bottomwear": "Clothing worn on the lower body, such as jeans, trousers, shorts, pants, leggings, denim.",
    "full outfit": "A full set of clothing, including both topwear and bottomwear.",
}



# Decide which garment region to crop, using the zero-shot classifier (BART-MNLI)
def _is_blank_text(text):
    """Return True when `text` is not a string (None, NaN, ...) or is only whitespace."""
    return not isinstance(text, str) or text.strip() == ""


def _classify_garment(text):
    """Classify `text` against the contextualized labels.

    Returns:
        A ``(category, score)`` pair for the top-ranked label, where
        ``category`` is the matching key of ``contextualized_labels``.
    """
    # The classifier is given the descriptive sentences, so map each sentence
    # back to its short category name.
    sentence_to_category = {
        sentence: category for category, sentence in contextualized_labels.items()
    }
    result = classifier(text, list(sentence_to_category))
    # Index 0 is the best match: the pipeline returns labels ordered by score.
    return sentence_to_category[result["labels"][0]], result["scores"][0]


def get_cropping_label(description, title):
    """Pick the cropping category for a product from its title and description.

    The title is tried first and accepted only on a confident match (> 0.85).
    Otherwise the description is tried with a looser threshold (> 0.6). When
    neither is usable or confident, the whole image is kept.

    Args:
        description: Product description; may be missing (None/NaN) or blank.
        title: Product title; may be missing (None/NaN) or blank.

    Returns:
        One of "topwear", "bottomwear" or "full outfit".
    """
    # A missing or blank title is skipped instead of being sent to the
    # classifier, so such rows can still be classified from the description.
    if not _is_blank_text(title):
        category, score = _classify_garment(title)
        if score > 0.85:
            return category

    # Fall back to the description
    if _is_blank_text(description):
        return "full outfit"

    category, score = _classify_garment(description)
    if score > 0.6:
        return category

    return "full outfit"

# Grounding DINO text prompt per cropping category. "full outfit" has no entry
# because the whole image is kept in that case.
# NOTE(review): the Grounding DINO docs suggest lower-cased, dot-terminated
# phrases ("a shirt. a jacket."); the comma-separated prompts are kept as they
# were - confirm whether switching improves detection.
_DETECTION_PROMPTS = {
    "topwear": "top wear, t-shirt, hoodie, jacket, sweater, shirt",
    "bottomwear": "bottom wear, pants, jeans, trousers, shorts, leggings",
}


def _save_product_image(image, product_title):
    """Save `image` as a .jpg in `output_dir`, named after the product title; return the path."""
    image_path = os.path.join(output_dir, f"{clean_filename(product_title)}.jpg")
    image.save(image_path)
    return image_path


# Function to process and crop image
def process_image(image_url, product_title, description, timeout=30):
    """Download a product image, crop it to the relevant garment and save it.

    The cropping category comes from `get_cropping_label`. For "topwear" and
    "bottomwear" the highest-scoring Grounding DINO box is cropped; for a full
    outfit, or when nothing usable is detected, the whole image is saved.

    NOTE: the file name is derived from the title alone, so two products with
    the same title write to the same file.

    Args:
        image_url: URL of the product image.
        product_title: Product title; used for classification and the file name.
        description: Product description; used as a classification fallback.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        Path of the saved image, or None when the download or processing failed.
    """
    try:
        # The timeout keeps one stalled server from hanging the whole batch;
        # the context manager releases the connection once the image is decoded.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {image_url}")
                return None
            # Have urllib3 undo any gzip/deflate transfer encoding before PIL
            # reads the raw stream.
            response.raw.decode_content = True
            # convert() forces a full load, so the stream can be closed afterwards
            image = Image.open(response.raw).convert("RGB")

        cropping_label = get_cropping_label(description, product_title)
        text_labels = _DETECTION_PROMPTS.get(cropping_label)

        if text_labels is None:
            print(f"Taking whole fit for {product_title}")
            return _save_product_image(image, product_title)

        # Run model
        inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process results; target_sizes wants (height, width), PIL gives (width, height)
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=0.2,
            text_threshold=0.2,
            target_sizes=[image.size[::-1]],
        )
        detections = results[0]

        if len(detections["boxes"]) == 0:
            print(f"No objects detected for {product_title}")
            return _save_product_image(image, product_title)

        # Keep only the most confident detection
        best_index = detections["scores"].argmax()
        left, top, right, bottom = (
            int(round(value)) for value in detections["boxes"][best_index].tolist()
        )

        # Clamp to the image: coordinates outside it would pad the crop with
        # black pixels, and an empty box cannot be saved.
        width, height = image.size
        left, right = max(0, left), min(width, right)
        top, bottom = max(0, top), min(height, bottom)
        if right <= left or bottom <= top:
            print(f"Degenerate detection box for {product_title}; keeping the full image")
            return _save_product_image(image, product_title)

        return _save_product_image(image.crop((left, top, right, bottom)), product_title)

    except Exception as e:
        # Best-effort batch job: report the failure and let the caller skip this row
        print(f"Error processing {product_title}: {e}")
        return None

# Process all products in the CSV
# Make sure the result column exists and holds Python objects even if every
# row fails; the description-generation step reads this column for each row.
if "cropped_image_path" not in df.columns:
    df["cropped_image_path"] = None
else:
    # An all-blank column is loaded as float NaN; cast so string paths can be stored
    df["cropped_image_path"] = df["cropped_image_path"].astype(object)

for index, row in df.iterrows():
    new_image_path = process_image(row["image_url"], row["title"], row["description"])

    # Failed rows (None) keep whatever value they already had
    if new_image_path:
        df.at[index, "cropped_image_path"] = new_image_path  # Save new image path

# Write back to the same file the table was loaded from
df.to_csv(csv_file, index=False)

print("Processing complete. Cropped images saved & CSV updated.")


## Now let's generate detailed descriptions for these cropped images using LLaVA 1.6 (the code below runs the 7B model, `llava:7b-v1.6`)

In [None]:
import ollama
from PIL import Image
from io import BytesIO
from pathlib import Path
import pandas as pd
# ------------------------
# CONFIG
# ------------------------
# Path of the product CSV (a file path, despite the name)
FILE_DIR = Path(r"C:\projects\hedomak\hedomak_products.csv")

# Set to True only to deliberately throw away every existing description and
# regenerate from scratch. Left off by default so an interrupted run can be
# resumed: the generation loop skips rows that already have a description.
RESET_DESCRIPTIONS = False

df = pd.read_csv(FILE_DIR)

if RESET_DESCRIPTIONS:
    df['generated_desc'] = ""
    df.to_csv(FILE_DIR, index=False)

if 'generated_desc' not in df.columns:
    df['generated_desc'] = ""

# Blank cells are read back from the CSV as NaN, which gives an all-blank
# column a float dtype; cast to object so text can be assigned row by row.
df['generated_desc'] = df['generated_desc'].astype(object)

# ------------------------
# GENERATION FUNCTION
# ------------------------
def generate_description(image_path):
    """Generate a one-line attribute description of a local image with LLaVA.

    The image is re-encoded as PNG in memory and sent to the local Ollama
    server together with a fixed prompt.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The stripped model response, or None when the image could not be
        read or the generation call failed (the error is printed).
    """
    try:
        # Re-encode as PNG bytes; the context managers close the file handle
        # and the buffer as soon as the bytes are extracted.
        with Image.open(image_path) as source_image, BytesIO() as buffer:
            source_image.convert("RGB").save(buffer, format='PNG')
            image_bytes = buffer.getvalue()

        # Prompt for LLaVA
        prompt = (
        "You are describing an apparel item for a fashion search index. "
        "Return ONE LINE under 30 words, exactly in this order: "
        "type; fit; length; material; primary color; texture/fabric feel; pattern/print; sleeve type; neckline/collar; rise/waistline; closure type; notable details; seasonality; style/occasion tags.\n"
        "If you are not sure about any attribute, write 'unknown' instead of leaving it blank. "
        "Do not add extra commentary.\n"
        "Examples:\n"
        "jeans; relaxed; full-length; rigid denim; black; matte; solid; no sleeves; n/a; high-rise; button fly; clean hem; all-season; casual,minimal\n"
        "hoodie; oversized; hip-length; cotton blend; heather grey; soft fleece; solid; long sleeves; hood; n/a; pullover; kangaroo pocket; winter; streetwear,casual\n"
        "Now describe the item in the image:"
        )

        # Generate caption
        chunks = ollama.generate(
            model='llava:7b-v1.6',
            prompt=prompt,
            images=[image_bytes],
            stream=True,
            # keep_alive is a request-level parameter, not a model option, so
            # it is passed here rather than inside `options`.
            keep_alive="10m",      # keep model warm between calls
            options={
                "num_gpu": 0,           # CPU-only if you need it
                "temperature": 0.2,     # concise & consistent
                "top_p": 0.9,           # a touch of variety without drift
                "repeat_penalty": 1.05, # reduces duplicate phrases
                "num_predict": 80,      # single-line cap
                "stop": ["\n"],         # cut at first newline (enforces one line)
            },
        )
        # The response is streamed; stitch the pieces back together
        return "".join(chunk['response'] for chunk in chunks).strip()

    except Exception as e:
        # Best effort: report and let the caller carry on with the next image
        print(f"Error processing {image_path}: {e}")
        return None

# ------------------------
# LOOP OVER PRODUCT ROWS
# ------------------------
# Write the CSV after this many newly generated descriptions, so an
# interrupted run loses little work.
SAVE_EVERY = 5

x = 0  # number of descriptions generated in this run
for idx, row in df.iterrows():
    existing = row['generated_desc']
    if not (pd.isna(existing) or str(existing).strip() == ""):
        continue  # already has a description from an earlier run

    desc = generate_description(df.at[idx, 'cropped_image_path'])
    print(f"{desc} \n{'-'*50}")
    df.at[idx, 'generated_desc'] = desc
    x += 1
    if x % SAVE_EVERY == 0:
        df.to_csv(FILE_DIR, index=False)

# Flush whatever was generated since the last periodic save; without this the
# final few descriptions would never reach the file.
if x % SAVE_EVERY != 0:
    df.to_csv(FILE_DIR, index=False)

