In [1]:
import pandas as pd
import boto3
import os
from urllib.parse import urlparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

query_for_data = """
WITH ranked_replacements AS (
    SELECT
        dvss1.DEAL_VOUCHER_SUB_CATEGORY_ID,
        dvsc.name AS sub_category_name,
        dvc.name AS category_name,

        dvss1.SITE_ID AS missing_image_site_id,
        sm.site_name AS missing_site_name,
        dvss1.IMAGE_URL AS missing_image_url,

        dvss2.SITE_ID AS available_image_site_id,
        sa.site_name AS available_site_name,
        dvss2.IMAGE_URL AS available_image_url,

        ROW_NUMBER() OVER (
            PARTITION BY dvss1.DEAL_VOUCHER_SUB_CATEGORY_ID, dvss1.SITE_ID
            ORDER BY ABS(dvss1.SITE_ID - dvss2.SITE_ID)
        ) AS rn
    FROM
        DEAL_VOUCHER_SUB_CATEGORY_SITE dvss1
    LEFT JOIN
        DEAL_VOUCHER_SUB_CATEGORY_SITE dvss2
        ON dvss1.DEAL_VOUCHER_SUB_CATEGORY_ID = dvss2.DEAL_VOUCHER_SUB_CATEGORY_ID
        AND dvss1.SITE_ID <> dvss2.SITE_ID
        AND dvss2.IMAGE_URL IS NOT NULL
    JOIN deal_voucher_sub_category dvsc ON dvsc.id = dvss1.DEAL_VOUCHER_SUB_CATEGORY_ID
    JOIN deal_voucher_category dvc ON dvc.id = dvsc.parent_id
    LEFT JOIN site sm ON sm.id = dvss1.SITE_ID
    LEFT JOIN site sa ON sa.id = dvss2.SITE_ID

    WHERE
        dvss1.IMAGE_URL IS NULL
    and dvss1.DEAL_VOUCHER_SUB_CATEGORY_ID = 231
),
SELECT
    DEAL_VOUCHER_SUB_CATEGORY_ID,
    sub_category_name,
    category_name,
    missing_image_site_id,
    missing_site_name,
    missing_image_url,
    available_image_site_id,
    available_site_name,
    available_image_url
FROM
    ranked_replacements
WHERE
    rn = 1
order by sub_category_name;
"""

# Read the CSV
df = pd.read_csv('Result_14.csv')

df['AVAILABLE_IMAGE_SITE_ID'] = df['AVAILABLE_IMAGE_SITE_ID'].astype('Int64')
# Filter rows with missing images but available images
missing_with_available = df[
    (df['MISSING_IMAGE_SITE_ID'].notna()) & 
    (df['AVAILABLE_IMAGE_SITE_ID'].notna()) & 
    (df['AVAILABLE_IMAGE_URL'].notna()) &
    (df['AVAILABLE_IMAGE_URL'] != '')
]

print(f"Found {len(missing_with_available)} rows with missing images that have available images")
# Thread-safe counter and log
success_count = 0
pairings_log = []
lock = threading.Lock()

def copy_single_image(row):
    global success_count, pairings_log
    
    sub_category_id = row['DEAL_VOUCHER_SUB_CATEGORY_ID']
    missing_site_id = row['MISSING_IMAGE_SITE_ID']
    available_url = row['AVAILABLE_IMAGE_URL']
    
    # Generate the new URL for the missing site
    new_url = f"https://static.wowcher.co.uk/images/seo/subcategory/{sub_category_id}-{missing_site_id}.jpg"
    
    # Extract S3 key from the new URL
    parsed_url = urlparse(new_url)
    s3_key = parsed_url.path.lstrip('/')
    
    try:
        # Initialize S3 client for this thread
        s3_client = boto3.client('s3')
        
        # Copy image directly using S3 copy
        copy_source = {
            'Bucket': 'static.wowcher.co.uk',
            'Key': urlparse(available_url).path.lstrip('/')
        }
        
        s3_client.copy(
            copy_source, 
            'static.wowcher.co.uk', 
            s3_key,
            ExtraArgs={
                'CacheControl': 'no-cache',
                'ContentType': 'image/jpeg'
            }
        )
        
        with lock:
            pairings_log.append({
                'sub_category_id': sub_category_id,
                'site_id': missing_site_id,
                'source_url': available_url,
                'destination_url': new_url,
                'status': 'success'
            })
            success_count += 1
        
        return True
    except Exception as e:
        with lock:
            pairings_log.append({
                'sub_category_id': sub_category_id,
                'site_id': missing_site_id,
                'source_url': available_url,
                'destination_url': new_url,
                'status': f'failed: {str(e)}'
            })
            print(f"Failed to copy image for sub_category_id={sub_category_id}, site_id={missing_site_id}: {e}")
        
        return False

# Process with 100 workers
with ThreadPoolExecutor(max_workers=100) as executor:
    # Submit all tasks
    future_to_row = {executor.submit(copy_single_image, row): row for _, row in missing_with_available.iterrows()}
    
    # Wait for completion
    for future in as_completed(future_to_row):
        try:
            future.result()
        except Exception as e:
            print(f"Worker thread failed: {e}")

# Save pairings log to CSV
pairings_df = pd.DataFrame(pairings_log)
log_filename = f'image_copy_pairings_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
pairings_df.to_csv(log_filename, index=False)

print(f"Summary: {success_count}/{len(missing_with_available)} images successfully copied")
print(f"Pairings log saved to {log_filename}")

Found 0 rows with missing images that have available images
Summary: 0/0 images successfully copied
Pairings log saved to image_copy_pairings_20250804_115656.csv


In [10]:
import pandas as pd
import os
from datetime import datetime
df = pd.read_csv('image_copy_pairings_20250729_113645.csv')

def generate_sql_updates(pairings_df, output_file=None):
    """
    Generate SQL UPDATE statements for the image pairings.
    
    Args:
        pairings_df: DataFrame with columns ['sub_category_id', 'site_id', 'destination_url', 'status']
        output_file: Optional file path to save SQL statements
    """
    
    # Filter only successful uploads
    successful_uploads = pairings_df[pairings_df['status'] == 'success'].copy()
    
    if successful_uploads.empty:
        print("No successful uploads found to generate SQL for.")
        return
    
    # Group by sub_category_id to create IN clauses
    sql_statements = []
    
    for sub_category_id, group in successful_uploads.groupby('sub_category_id'):
        site_ids = group['site_id'].tolist()
        
        # Create the SQL statement with dynamic URL construction
        site_ids_str = ','.join(map(str, site_ids))
        sql = f"""UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/{sub_category_id}-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = {sub_category_id} 
AND SITE_ID IN ({site_ids_str});"""
        
        sql_statements.append(sql)
    
    # Print all SQL statements
    print(f"Generated {len(sql_statements)} SQL UPDATE statements:")
    print("=" * 80)
    
    for i, sql in enumerate(sql_statements, 1):
        print(f"-- Statement {i}")
        print(sql)
        print()
    
    # Save to file if specified
    if output_file:
        with open(output_file, 'w') as f:
            f.write(f"-- SQL UPDATE statements generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"-- Total statements: {len(sql_statements)}\n")
            f.write("=" * 80 + "\n\n")
            
            for i, sql in enumerate(sql_statements, 1):
                f.write(f"-- Statement {i}\n")
                f.write(sql + "\n\n")
        
        print(f"SQL statements saved to: {output_file}")
    
    return sql_statements

# Generate SQL for the copied images process
# Use the pairings_df from your current run
if 'df' in locals():
    generate_sql_updates(df, 'sql_updates_copied_images.sql')
else:
    print("No pairings_df found. Run the image copy process first.")



Generated 174 SQL UPDATE statements:
-- Statement 1
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/6-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 6 
AND SITE_ID IN (275849);

-- Statement 2
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/8-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 8 
AND SITE_ID IN (275849);

-- Statement 3
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/12-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 12 
AND SITE_ID IN (275849);

-- Statement 4
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/15-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 15 
AND SITE_ID IN (275849);

-- Statement 5
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/

In [2]:
import pandas as pd
import boto3
import requests
import os
from urllib.parse import urlparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from openai import OpenAI

load_dotenv()

print(os.getenv('OPEN_AI_API_KEY'))

NameError: name 'load_dotenv' is not defined

In [6]:
import pandas as pd
import boto3
import requests
import os
from urllib.parse import urlparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from openai import OpenAI
import base64

# Read the CSV
df = pd.read_csv('/Users/elliottoates/Desktop/streamlit-image-review/Result_14.csv')
#df['AVAILABLE_IMAGE_SITE_ID'] = df['AVAILABLE_IMAGE_SITE_ID'].astype('Int64')

# Find subcategories with no available images at all
subcategories_with_no_images = df[
    (df['MISSING_IMAGE_SITE_ID'].notna()) & 
    (df['AVAILABLE_IMAGE_SITE_ID'].isna()) & 
    (df['AVAILABLE_IMAGE_URL'].isna() | (df['AVAILABLE_IMAGE_URL'] == ''))
]

# Get unique subcategory-category pairs that need images
unique_subcategories = subcategories_with_no_images[['DEAL_VOUCHER_SUB_CATEGORY_ID', 'SUB_CATEGORY_NAME', 'CATEGORY_NAME']].drop_duplicates()

# Filter out rows where sub_category_name contains 'DONOT'
unique_subcategories = unique_subcategories[~unique_subcategories['SUB_CATEGORY_NAME'].str.contains('DONOT', case=False, na=False)]

print(f"Found {len(unique_subcategories)} unique subcategories that need new images generated")

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))

# Thread-safe counter and log
success_count = 0
pairings_log = []
lock = threading.Lock()

def generate_and_distribute_image(row):
    global success_count, pairings_log
    
    sub_category_id = row['DEAL_VOUCHER_SUB_CATEGORY_ID']
    sub_category_name = row['SUB_CATEGORY_NAME']
    category_name = row['CATEGORY_NAME']
    
    try:
        # Generate image with OpenAI
        prompt = f"can you make an image for my {sub_category_name} subcategory button on my eccommerce website. The subcategory falls under the {category_name} category. Photorealistic please. No text. Leave decent space between main subject and edge of images."
        
        response = client.images.generate(
            model="gpt-image-1",
            prompt=prompt,
            n=1,
            size="1024x1024",
            quality="medium",
        )
        
        # Decode base64 image data
        image_data = base64.b64decode(response.data[0].b64_json)
        print(f"Image generated for sub_category_id={sub_category_id}")

        # Save the generated image
        temp_file = f"generated_image_{sub_category_id}_{threading.get_ident()}.jpg"
        
        with open(temp_file, 'wb') as f:
            f.write(image_data)
        
        # Initialize S3 client for this thread
        s3_client = boto3.client('s3')
        
        # Find all missing site IDs for this subcategory
        missing_sites = subcategories_with_no_images[
            subcategories_with_no_images['DEAL_VOUCHER_SUB_CATEGORY_ID'] == sub_category_id
        ]['MISSING_IMAGE_SITE_ID'].unique()
        
        uploaded_count = 0
        for site_id in missing_sites:
            # Generate the new URL for the missing site
            new_url = f"https://static.wowcher.co.uk/images/seo/subcategory/{sub_category_id}-{site_id}.jpg"
            
            # Extract S3 key from the new URL
            parsed_url = urlparse(new_url)
            s3_key = parsed_url.path.lstrip('/')
            
            # Upload to S3 with cache control and content type
            s3_client.upload_file(
                temp_file, 
                'static.wowcher.co.uk', 
                s3_key,
                ExtraArgs={
                    'CacheControl': 'no-cache',
                    'ContentType': 'image/jpeg'
                }
            )
            
            with lock:
                pairings_log.append({
                    'sub_category_id': sub_category_id,
                    'sub_category_name': sub_category_name,
                    'category_name': category_name,
                    'site_id': site_id,
                    'source_url': 'generated_image',
                    'destination_url': new_url,
                    'status': 'success'
                })
                uploaded_count += 1
        
        if uploaded_count > 0:
            print(f"Image distributed to {uploaded_count} site(s) for sub_category_id={sub_category_id}")
        
        success_count += uploaded_count
        
        return True
        
    except Exception as e:
        with lock:
            pairings_log.append({
                'sub_category_id': sub_category_id,
                'sub_category_name': sub_category_name,
                'category_name': category_name,
                'site_id': 'N/A',
                'source_url': 'N/A',
                'destination_url': 'N/A',
                'status': f'failed: {str(e)}'
            })
            print(f"Failed to generate/upload image for sub_category_id={sub_category_id}: {e}")
        
        return False
    
    finally:
        # Clean up temp file
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Process with 10 workers (fewer for API calls)
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit all tasks
    future_to_row = {executor.submit(generate_and_distribute_image, row): row for _, row in unique_subcategories.iterrows()}
    
    # Wait for completion
    for future in as_completed(future_to_row):
        try:
            future.result()
        except Exception as e:
            print(f"Worker thread failed: {e}")

# Save pairings log to CSV
pairings_df = pd.DataFrame(pairings_log)
log_filename = f'generated_image_pairings_noSub_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
pairings_df.to_csv(log_filename, index=False)

print(f"Summary: {success_count} images successfully uploaded")
print(f"Pairings log saved to {log_filename}")

Found 1 unique subcategories that need new images generated
Image generated for sub_category_id=240
Image distributed to 106 site(s) for sub_category_id=240
Summary: 106 images successfully uploaded
Pairings log saved to generated_image_pairings_noSub_20250804_120204.csv


In [7]:
import pandas as pd
import os
from datetime import datetime
import pandas as pd
import os
from datetime import datetime

def generate_sql_updates(pairings_df, output_file=None):
    """
    Generate SQL UPDATE statements for the image pairings.
    
    Args:
        pairings_df: DataFrame with columns ['sub_category_id', 'site_id', 'destination_url', 'status']
        output_file: Optional file path to save SQL statements
    """
    
    # Filter only successful uploads
    successful_uploads = pairings_df[pairings_df['status'] == 'success'].copy()
    
    if successful_uploads.empty:
        print("No successful uploads found to generate SQL for.")
        return
    
    # Group by sub_category_id to create IN clauses
    sql_statements = []
    
    for sub_category_id, group in successful_uploads.groupby('sub_category_id'):
        site_ids = group['site_id'].tolist()
        
        # Create the SQL statement with dynamic URL construction
        site_ids_str = ','.join(map(str, site_ids))
        sql = f"""UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/{sub_category_id}-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = {sub_category_id} 
AND SITE_ID IN ({site_ids_str});"""
        
        sql_statements.append(sql)
    
    # Print all SQL statements
    print(f"Generated {len(sql_statements)} SQL UPDATE statements:")
    print("=" * 80)
    
    for i, sql in enumerate(sql_statements, 1):
        print(f"-- Statement {i}")
        print(sql)
        print()
    
    # Save to file if specified
    if output_file:
        with open(output_file, 'w') as f:
            f.write(f"-- SQL UPDATE statements generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"-- Total statements: {len(sql_statements)}\n")
            f.write("=" * 80 + "\n\n")
            
            for i, sql in enumerate(sql_statements, 1):
                f.write(f"-- Statement {i}\n")
                f.write(sql + "\n\n")
        
        print(f"SQL statements saved to: {output_file}")
    
    return sql_statements

generate_sql_updates(pairings_df, 'sql_updates.sql')

Generated 1 SQL UPDATE statements:
-- Statement 1
UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE 
SET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/240-'|| SITE_ID||'.jpg' 
WHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 240 
AND SITE_ID IN (275774,275775,275776,275782,275783,275784,275785,275786,275787,275789,275790,275791,275797,275850,275851,275852,275855,275856,275857,275858,275859,275860,275861,275863,275864,275865,275866,275867,275868,275869,275870,275871,275872,275873,275874,275875,275876,275878,275879,275880,275883,275884,275885,275888,275889,275890,275892,275893,276146,276147,276148,276149,276158,276543,276545,276546,276547,276548,276550,276551,276552,276553,276554,276555,276556,276557,276558,276559,276560,276561,276562,276563,276564,276565,276566,276567,276568,276569,276570,276571,276572,276573,276574,276575,276576,276577,276578,276583,276584,276585,276586,276587,276588,276589,276590,276591,276592,276593,276594,276595,276596,276597,276598,276599,276602,276603);

SQL statements

["UPDATE DEAL_VOUCHER_SUB_CATEGORY_SITE \nSET IMAGE_URL = 'https://static.wowcher.co.uk/images/seo/subcategory/240-'|| SITE_ID||'.jpg' \nWHERE DEAL_VOUCHER_SUB_CATEGORY_ID = 240 \nAND SITE_ID IN (275774,275775,275776,275782,275783,275784,275785,275786,275787,275789,275790,275791,275797,275850,275851,275852,275855,275856,275857,275858,275859,275860,275861,275863,275864,275865,275866,275867,275868,275869,275870,275871,275872,275873,275874,275875,275876,275878,275879,275880,275883,275884,275885,275888,275889,275890,275892,275893,276146,276147,276148,276149,276158,276543,276545,276546,276547,276548,276550,276551,276552,276553,276554,276555,276556,276557,276558,276559,276560,276561,276562,276563,276564,276565,276566,276567,276568,276569,276570,276571,276572,276573,276574,276575,276576,276577,276578,276583,276584,276585,276586,276587,276588,276589,276590,276591,276592,276593,276594,276595,276596,276597,276598,276599,276602,276603);"]

In [None]:
import pandas as pd
import os
from pathlib import Path
import sys

# Read the deals CSV using absolute paths
df = pd.read_csv('/Users/elliottoates/Desktop/streamlit-image-review/stonegate/deals.csv')
dictionary = df.to_dict(orient='records')

# Get the list of image folders using absolute paths
images_dir = Path('/Users/elliottoates/Desktop/streamlit-image-review/stonegate/Images')
image_folders = [folder.name for folder in images_dir.iterdir() if folder.is_dir()]

# Function to normalize text for matching (remove spaces, underscores, and convert to lowercase)
def normalize_text(text):
    return text.strip().lower().replace(' ', '').replace('_', '')

# Function to find matching image folder for a pub_name
def find_image_folder(pub_name, image_folders):
    """
    Find the best matching image folder for a pub_name.
    Returns the folder path if found, None otherwise.
    """
    # Normalize pub_name
    normalized_pub_name = normalize_text(pub_name)
    
    # Create a mapping of normalized folder names to actual folder paths
    folder_mapping = {}
    for folder in image_folders:
        normalized_folder = normalize_text(folder)
        folder_mapping[normalized_folder] = str(images_dir / folder)
    
    # Direct match with normalized names
    if normalized_pub_name in folder_mapping:
        return folder_mapping[normalized_pub_name]
    
    # Partial match (if pub_name contains folder name or vice versa)
    for normalized_folder, folder_path in folder_mapping.items():
        if (normalized_pub_name in normalized_folder or 
            normalized_folder in normalized_pub_name):
            return folder_path
    
    return None

# Function to get image priority score for sorting
def get_image_priority(image_path):
    """
    Get priority score for image sorting.
    Lower score = higher priority.
    """
    filename = os.path.basename(image_path).lower()
    
    # Priority 0: Ends with "main" (highest priority)
    if filename.endswith('main'):
        return 0
    
    # Priority 1: Contains "main" but doesn't end with it
    if 'main' in filename and not filename.endswith('main'):
        return 1
    
    # Priority 2: All other images (will be sorted by quality later)
    return 2

# Function to estimate image quality based on file size
def get_image_quality_score(image_path):
    """
    Estimate image quality based on file size.
    Larger files generally mean higher quality.
    """
    try:
        file_size = os.path.getsize(image_path)
        return file_size
    except:
        return 0

# Function to get image files from a folder with relative paths and proper sorting
def get_images_from_folder(folder_path):
    """
    Get all image files from a folder, sorted by priority and quality.
    Returns a list of relative image file paths.
    """
    if not folder_path or not os.path.exists(folder_path):
        return []
    
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    image_files = []
    
    try:
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path):
                file_ext = os.path.splitext(file)[1].lower()
                if file_ext in image_extensions:
                    # Create relative path: Images/folder_name/filename
                    folder_name = os.path.basename(folder_path)
                    relative_path = f"Images/{folder_name}/{file}"
                    image_files.append(relative_path)
    except Exception as e:
        print(f"Error reading folder {folder_path}: {e}")
    
    # Sort images by priority and quality
    def sort_key(image_path):
        priority = get_image_priority(image_path)
        # Get the full path for quality scoring
        full_image_path = os.path.join(folder_path, os.path.basename(image_path))
        quality = get_image_quality_score(full_image_path)
        return (priority, -quality)  # Negative quality so larger files come first
    
    return sorted(image_files, key=sort_key)

# Add image folder paths and image files to each deal in the dictionary
for deal in dictionary:
    deal['image_folder_path'] = find_image_folder(deal['pub_name'], image_folders)
    deal['image_files'] = get_images_from_folder(deal['image_folder_path'])

# Get all deals without images
deals_without_images = [deal for deal in dictionary if not deal['image_files']]

# Print count of deals without images
print(f"Total deals without images: {len(deals_without_images)}")

# Print distinct pub names and count of pubs without images
distinct_pub_names_without_images = sorted(set(deal['pub_name'] for deal in deals_without_images))
print(f"Distinct pub names without images ({len(distinct_pub_names_without_images)} total):")
print("=" * 50)

# Print just the distinct pub names
for pub_name in distinct_pub_names_without_images:
    print(pub_name)

# Show example of sorted images for a deal with images
print("\n" + "=" * 50)
print("Example of sorted images for a deal with images:")
for deal in dictionary:
    if deal['image_files']:
        print(f"\nDeal: {deal['pub_name']}")
        print(f"Images ({len(deal['image_files'])} total):")
        for i, img in enumerate(deal['image_files'][:5]):  # Show first 5
            priority = get_image_priority(img)
            print(f"  Position {i}: {os.path.basename(img)} (Priority: {priority})")
        break

Total deals without images: 36
Distinct pub names without images (18 total):
Gassys Cardiff
Prince Of Teck Earls Court
Tank & Paddle Manchester Printworks
Temperance Fulham
The Bay & Bracket Victoria
The Block & Gasket Burgess Hill
The Block & Gasket Sale
The Boundary Reading
The Bridge Tap London
The Cannick Tapps London
The Centurion Colchester
The Cider Press Bristol
The Distillery Leicester
The Dockyard Portsmouth
The Faraday Epsom
The Garratt And Gauge Wimbledon
The Green Dragon Croydon
The Joiners London

Example of sorted images for a deal with images:


In [None]:
import pandas as pd
import os
from pathlib import Path
import sys

# Read the deals CSV using absolute paths
df = pd.read_csv('/Users/elliottoates/Desktop/streamlit-image-review/stonegate/deals.csv')
dictionary = df.to_dict(orient='records')

# Get the list of image folders using absolute paths
images_dir = Path('/Users/elliottoates/Desktop/streamlit-image-review/stonegate/Images')
image_folders = [folder.name for folder in images_dir.iterdir() if folder.is_dir()]

# Function to normalize text for matching (remove spaces, underscores, and convert to lowercase)
def normalize_text(text):
    return text.strip().lower().replace(' ', '').replace('_', '')

# Function to find matching image folder for a pub_name
def find_image_folder(pub_name, image_folders):
    """
    Find the best matching image folder for a pub_name.
    Returns the folder path if found, None otherwise.
    """
    # Normalize pub_name
    normalized_pub_name = normalize_text(pub_name)
    
    # Create a mapping of normalized folder names to actual folder paths
    folder_mapping = {}
    for folder in image_folders:
        normalized_folder = normalize_text(folder)
        folder_mapping[normalized_folder] = str(images_dir / folder)
    
    # Direct match with normalized names
    if normalized_pub_name in folder_mapping:
        return folder_mapping[normalized_pub_name]
    
    # Partial match (if pub_name contains folder name or vice versa)
    for normalized_folder, folder_path in folder_mapping.items():
        if (normalized_pub_name in normalized_folder or 
            normalized_folder in normalized_pub_name):
            return folder_path
    
    return None

# Function to get image priority score for sorting
def get_image_priority(image_path):
    """
    Get priority score for image sorting.
    Lower score = higher priority.
    """
    filename = os.path.basename(image_path).lower()
    
    # Priority 0: Ends with "main" (highest priority)
    if filename.endswith('main'):
        return 0
    
    # Priority 1: Contains "main" but doesn't end with it
    if 'main' in filename and not filename.endswith('main'):
        return 1
    
    # Priority 2: All other images (will be sorted by quality later)
    return 2

# Function to estimate image quality based on file size
def get_image_quality_score(image_path):
    """
    Estimate image quality based on file size.
    Larger files generally mean higher quality.
    """
    try:
        file_size = os.path.getsize(image_path)
        return file_size
    except:
        return 0

# Function to get image files from a folder with relative paths and proper sorting
def get_images_from_folder(folder_path):
    """
    Get all image files from a folder, sorted by priority and quality.
    Returns a list of relative image file paths.
    """
    if not folder_path or not os.path.exists(folder_path):
        return []
    
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    image_files = []
    
    try:
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path):
                file_ext = os.path.splitext(file)[1].lower()
                if file_ext in image_extensions:
                    # Create relative path: Images/folder_name/filename
                    folder_name = os.path.basename(folder_path)
                    relative_path = f"Images/{folder_name}/{file}"
                    image_files.append(relative_path)
    except Exception as e:
        print(f"Error reading folder {folder_path}: {e}")
    
    # Sort images by priority and quality
    def sort_key(image_path):
        priority = get_image_priority(image_path)
        # Get the full path for quality scoring
        full_image_path = os.path.join(folder_path, os.path.basename(image_path))
        quality = get_image_quality_score(full_image_path)
        return (priority, -quality)  # Negative quality so larger files come first
    
    return sorted(image_files, key=sort_key)

# Add image folder paths and image files to each deal in the dictionary
for deal in dictionary:
    deal['image_folder_path'] = find_image_folder(deal['pub_name'], image_folders)
    deal['image_files'] = get_images_from_folder(deal['image_folder_path'])

# Get all deals without images
deals_without_images = [deal for deal in dictionary if not deal['image_files']]

# Print count of deals without images
print(f"Total deals without images: {len(deals_without_images)}")

# Print distinct pub names and count of pubs without images
distinct_pub_names_without_images = sorted(set(deal['pub_name'] for deal in deals_without_images))
print(f"Distinct pub names without images ({len(distinct_pub_names_without_images)} total):")
print("=" * 50)

# Print just the distinct pub names
for pub_name in distinct_pub_names_without_images:
    print(pub_name)

# Show example of sorted images for a deal with images
print("\n" + "=" * 50)
print("Example of sorted images for a deal with images:")
for deal in dictionary:
    if deal['image_files']:
        print(f"\nDeal: {deal['pub_name']}")
        print(f"Images ({len(deal['image_files'])} total):")
        for i, img in enumerate(deal['image_files'][:5]):  # Show first 5
            priority = get_image_priority(img)
            print(f"  Position {i}: {os.path.basename(img)} (Priority: {priority})")
        break

Total deals without images: 36
Distinct pub names without images (18 total):
Gassys Cardiff
Prince Of Teck Earls Court
Tank & Paddle Manchester Printworks
Temperance Fulham
The Bay & Bracket Victoria
The Block & Gasket Burgess Hill
The Block & Gasket Sale
The Boundary Reading
The Bridge Tap London
The Cannick Tapps London
The Centurion Colchester
The Cider Press Bristol
The Distillery Leicester
The Dockyard Portsmouth
The Faraday Epsom
The Garratt And Gauge Wimbledon
The Green Dragon Croydon
The Joiners London

Example of sorted images for a deal with images:
