In [None]:
import torch
import clip
from PIL import Image
import numpy as np
import os
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint in use: ViT-L/14 at 336px input resolution.
# Alternative checkpoint kept for reference:
#   model, preprocess = clip.load("ViT-B/16", device=device)
model, preprocess = clip.load("ViT-L/14@336px", device=device)

In [None]:
def generate_clip_embeddings(df, root_dir = "/mnt/e/Google_Photos/InnoJam_Photos"):
    """
    Generate CLIP image embeddings for the photos listed in a DataFrame.

    Relies on the module-level ``model``, ``preprocess`` and ``device``
    objects created when the CLIP checkpoint was loaded.

    Args:
        df (pd.DataFrame): must contain a 'photo_name' column; each value is
            joined onto ``root_dir`` to locate the image file.
        root_dir (str): folder where the photos are stored.

    Returns:
        pd.DataFrame: the same ``df`` object, modified in place, with an added
        'clip_embedding' column. Each entry is a list of floats (the
        unit-normalized image feature vector), or None when the file does not
        exist or could not be processed. Failures are reported with ``print``
        and never raised, so one bad photo does not abort the whole run.
    """
    embeddings = []

    for photo_name in df["photo_name"]:
        photo_file = os.path.join(root_dir, photo_name)

        if not os.path.exists(photo_file):
            print(f"⚠️ File not found: {photo_file}")
            embeddings.append(None)
            continue

        try:
            # Open through a context manager so the file handle is released
            # as soon as the image has been converted to a tensor; otherwise
            # a long run keeps one open handle per photo.
            with Image.open(photo_file) as img:
                image = preprocess(img).unsqueeze(0).to(device)

            with torch.no_grad():
                image_features = model.encode_image(image)
                # Normalize to unit vector
                image_features /= image_features.norm(dim=-1, keepdim=True)

            # The batch holds a single image; keep its vector as a plain list.
            embeddings.append(image_features.cpu().numpy().tolist()[0])
        except Exception as e:
            print(f"❌ Error processing {photo_file}: {e}")
            embeddings.append(None)

    df["clip_embedding"] = embeddings
    return df


def build_metadata_text(row):
    """
    Build a short caption such as "Photo in Paris in France with Alice and Bob"
    from one row of photo metadata.

    Args:
        row (pd.Series or dict): may contain any of the optional keys
            'city', 'country' and 'names_list'. 'names_list' may be a real
            list of names, the string repr of a list ("['Alice','Bob']"),
            or a single plain name.

    Returns:
        str: "Photo" followed by one "in <place>" fragment per available
        location field and a "with <names>" fragment when at least one name
        is present; just "Photo" when nothing is available.
    """
    # Local import: only this helper needs ast, and it keeps the function
    # self-contained when the cell is run on its own.
    import ast

    parts = []

    # location
    for column in ("city", "country"):
        if column in row and pd.notna(row[column]):
            parts.append(f"in {row[column]}")

    # people (list or single string)
    names = []
    if "names_list" in row:
        people_val = row["names_list"]
        if pd.api.types.is_list_like(people_val):
            # Check for list-likes BEFORE pd.notna: on a list pd.notna returns
            # an array, whose truth value is ambiguous for 2+ elements.
            names = [str(person) for person in people_val]
        elif pd.notna(people_val):
            # could be string like "['Alice','Bob']" → try to parse
            try:
                parsed = ast.literal_eval(people_val)
            except Exception:
                parsed = None
            if isinstance(parsed, (list, tuple)):
                names = [str(person) for person in parsed]
            else:
                # Not a list literal: treat the raw value as one name.
                raw_name = str(people_val).strip()
                if raw_name:
                    names = [raw_name]

    if len(names) == 1:
        parts.append(f"with {names[0]}")
    elif names:
        parts.append(f"with {', '.join(names[:-1])} and {names[-1]}")

    return "Photo " + " ".join(parts) if parts else "Photo"

In [4]:
# Load the photo metadata table (one row per photo) from the Excel export.
df_in = pd.read_excel(io="../data/google_photos_metadata_with_location.xlsx")

In [None]:
# Generate CLIP image embeddings and CLIP text embeddings of the metadata,
# then save both as row-aligned .npy matrices.

df_2 = generate_clip_embeddings(df_in)

# generate_clip_embeddings stores None for photos that were missing or failed
# to load. Those rows cannot be stacked into a matrix, so drop them up front;
# this keeps the two saved arrays row-aligned with each other and with df_2.
has_embedding = df_2["clip_embedding"].notna()
if not has_embedding.all():
    print(f"Skipping {int((~has_embedding).sum())} photo(s) without an image embedding")
    df_2 = df_2.loc[has_embedding].reset_index(drop=True)
if df_2.empty:
    raise ValueError("No photo produced an image embedding; nothing to save")

df_2["metadata_text"] = df_2.apply(build_metadata_text, axis=1)
texts = df_2["metadata_text"].tolist()

# Tokenize and encode in batches to avoid GPU memory issues
batch_size = 64
all_embeddings = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    # truncate=True: a caption with many names can exceed CLIP's token
    # context window, which would otherwise raise and abort the whole run.
    text_tokens = clip.tokenize(batch_texts, truncate=True).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)  # normalize
    all_embeddings.append(text_features.cpu().numpy())

# Stack into single numpy array (one row per photo)
metadata_embeddings = np.vstack(all_embeddings)

# Add back to DataFrame (one vector per row)
df_2["metadata_embedding"] = list(metadata_embeddings)

image_embeddings = np.vstack(df_2["clip_embedding"].tolist())

# Row i of both matrices must describe row i of df_2.
assert metadata_embeddings.shape[0] == image_embeddings.shape[0] == len(df_2)

np.save("../data/metadata_embeddings_l14_336.npy", metadata_embeddings)
np.save("../data/image_embeddings_l14_336.npy", image_embeddings)