In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity
import requests
from PIL import Image
from io import BytesIO
from torchvision import models, transforms
import torch
import ast
import aiohttp
import asyncio
from tqdm.asyncio import tqdm as tqdm_asyncio
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
tqdm.pandas()

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
maxmara = pd.read_csv('maxmara_cleaned.csv')
netaporter = pd.read_csv('netaporter_cleaned.csv')
luisaviaroma = pd.read_csv('luisaviaroma_cleaned.csv')

In [9]:
# Initialize model for multi-GPU
def initialize_model():
    model = models.resnet50(pretrained=True)
    #model.fc = torch.nn.Identity()  # Remove classification layer
    model.eval()
    # Wrap model in DataParallel to utilize all GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model = model.to(device)  # Move model to GPU(s)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return model, transform

model, transform = initialize_model()

# Download an image from a URL
def download_image(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:
        print(f"Failed to download image: {url}, error: {e}")
    return None

async def fetch_image(session, url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        async with session.get(url, timeout=10) as response:
            if response.status == 200:
                content = await response.read()
                return Image.open(BytesIO(content)).convert("RGB")
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return None

async def download_images_async(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_image(session, url) for url in urls]
        images = await asyncio.gather(*tasks)
    return [img for img in images if img is not None]

# Use this wrapper to call the async function synchronously
def download_images_parallel(urls):
    return asyncio.run(download_images_async(urls))

def batch_feature_extraction(images, batch_size=32):
    batched_features = []
    for i in range(0, len(images), batch_size):
        batch = images[i:i + batch_size]
        image_tensors = [transform(image).unsqueeze(0) for image in batch]
        image_tensors = torch.cat(image_tensors).to(device)
        with torch.no_grad():
            features = model(image_tensors)  # Utilize multi-GPU here
        batched_features.append(features.cpu().numpy())
    return np.vstack(batched_features)

# Parallelized image downloading
def download_images_parallel(urls):
    with ThreadPoolExecutor() as executor:
        images = list(executor.map(download_image, urls))
    return [img for img in images if img is not None]

# Compute image similarity scores
def compute_similarity(maxmara_images, netaporter_images):
    maxmara_features = batch_feature_extraction(maxmara_images)
    netaporter_features = batch_feature_extraction(netaporter_images)
    scores = cosine_similarity(maxmara_features, netaporter_features)
    return scores

def compute_similarity_in_chunks(matrix_a, matrix_b, chunk_size=1000):
    results = []
    for i in range(0, matrix_a.shape[0], chunk_size):
        chunk_a = matrix_a[i:i + chunk_size]
        similarity = cosine_similarity(chunk_a, matrix_b)
        results.append(similarity)
    return np.vstack(results)

def compute_similarity_in_chunks(matrix_a, matrix_b, chunk_size=1000):
    results = []
    for i in tqdm(range(0, matrix_a.shape[0], chunk_size), desc="Computing similarity"):
        chunk_a = matrix_a[i:i + chunk_size]
        similarity = cosine_similarity(chunk_a, matrix_b)
        results.append(similarity)
    return np.vstack(results)



In [8]:
print('max mara')
maxmara['features'] = maxmara['images'].progress_apply(
    lambda x: batch_feature_extraction(download_images_parallel(ast.literal_eval(x)))
)
print('net a porter')
netaporter['features'] = netaporter['images'].progress_apply(
    lambda x: batch_feature_extraction(download_images_parallel(ast.literal_eval(x)))
)
print('luisa via roma')
luisaviaroma['features'] = luisaviaroma['images'].progress_apply(
    lambda x: batch_feature_extraction(download_images_parallel(ast.literal_eval(x)))
)

max mara


100%|██████████████████████████████████████████████████████████████████████████████| 2510/2510 [15:10<00:00,  2.76it/s]


net a porter


 61%|█████████████████████████████████████████████████▏                              | 234/381 [01:41<01:03,  2.30it/s]


KeyboardInterrupt: 

In [None]:
maxmara.to_csv('maxmara_featured.csv',index=False)
netaporter.to_csv('netaporter_featured.csv',index=False)
luisaviaroma.to_csv('luisaviaroma_featured.csv',index=False)

In [11]:
maxmara_features = np.vstack(maxmara['features'].tolist())
netaporter_features = np.vstack(netaporter['features'].tolist())
luisaviaroma_features = np.vstack(luisaviaroma['features'].tolist())

In [12]:
similarity_matrix_neta = cosine_similarity(maxmara_features, netaporter_features)
similarity_matrix_luisa = cosine_similarity(maxmara_features, luisaviaroma_features)

In [27]:
similarity_results = []

for i, maxmara_row in tqdm(maxmara.iterrows()):
    for j, netaporter_row in netaporter.iterrows():
        avg_score = similarity_matrix_neta[i][j]
        similarity_results.append({
            'maxmara_index': i,
            'retailer_index': j,
            'retailer_name': 'netaporter',
            'image_similarity_score': avg_score
        })

similarity_df_neta = pd.DataFrame(similarity_results)

2510it [00:33, 74.02it/s]


In [28]:
similarity_results = []

for i, maxmara_row in tqdm(maxmara.iterrows()):
    for j, netaporter_row in luisaviaroma.iterrows():
        avg_score = similarity_matrix_luisa[i][j]
        similarity_results.append({
            'maxmara_index': i,
            'retailer_index': j,
            'retailer_name': 'luisaviaroma',
            'image_similarity_score': avg_score
        })

similarity_df_luisa = pd.DataFrame(similarity_results)

2510it [01:11, 35.12it/s]


In [29]:
image_similarity_score = pd.concat([similarity_df_neta, similarity_df_luisa], ignore_index=True)

In [31]:
image_similarity_score.to_csv('image_similarity_score.csv',index=False)