In [None]:
import os
import torch

import numpy as np
import pandas as pd
import plotly.express as px

from PIL import Image
from sklearn.manifold import TSNE
from torchvision.models import resnet50
from torchvision.transforms import transforms

In [None]:
# Read the training CSV into a DataFrame; later cells use its
# 'landmark_id' column.
train_csv_path = "data/train.csv"
train_df = pd.read_csv(train_csv_path)

In [None]:
# Tally how many rows of train_df reference each landmark id;
# value_counts() orders the result from most to least frequent.
landmark_ids = train_df['landmark_id']
landmark_counts = landmark_ids.value_counts()

# Peek at the most frequent landmarks.
print(landmark_counts.head())

In [None]:
def extract_image_stats(image_path):
    """Return basic size and colour statistics for one image file.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        A tuple ``(width, height, aspect_ratio, colors)`` where ``width`` and
        ``height`` are the image size in pixels, ``aspect_ratio`` is
        ``width / height`` and ``colors`` is a length-3 NumPy array holding
        the mean R, G and B values over all pixels.
    """
    # The context manager closes the underlying file handle once the pixel
    # data has been copied out by convert().
    with Image.open(image_path) as img:
        width, height = img.size
        # Force exactly three channels: grayscale, palette and RGBA files
        # would otherwise break (or silently scramble) the reshape below.
        rgb = img.convert('RGB')
    pixels = np.array(rgb).reshape(-1, 3)
    aspect_ratio = width / height
    colors = pixels.mean(axis=0)  # Mean RGB
    return width, height, aspect_ratio, colors


In [None]:
# Directory whose images are sampled for the quick statistics below.
image_dir = 'data/train/0/0/0/'

# Only the first 100 directory entries are processed (limit for demo).
sample_names = os.listdir(image_dir)[:100]

image_stats = []
for img_name in sample_names:
    image_path = os.path.join(image_dir, img_name)
    image_stats.append(extract_image_stats(image_path))

In [None]:
# Show only the first few (width, height, aspect_ratio, mean_rgb) tuples
# instead of dumping the whole list into the cell output.
image_stats[:5]

In [None]:
def get_embedding(image_path):
    """Run one image through the global ``model`` and return a flat vector.

    Relies on the module-level ``transform`` and ``model`` objects, which
    must be defined before this function is called.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        A 1-D NumPy array: the model output for the single-image batch,
        flattened.
    """
    # Close the file handle as soon as the RGB copy has been made rather
    # than leaving it to the garbage collector.
    with Image.open(image_path) as img_file:
        img = img_file.convert('RGB')
    img_t = transform(img).unsqueeze(0)  # add the batch dimension
    # NOTE(review): the vector is whatever `model` returns from its final
    # layer; if penultimate-layer features are wanted, the classifier head
    # has to be removed where `model` is created - confirm intent.
    with torch.no_grad():  # inference only, no gradient tracking
        embedding = model(img_t).numpy().flatten()
    return embedding

In [None]:
# Pretrained ResNet-50 switched to evaluation mode.
model = resnet50(pretrained=True)
model.eval()

# Preprocessing applied to every image before it is fed to the model:
# fixed 224x224 resize, tensor conversion, then per-channel normalisation.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)

# One vector per image, for the first 100 entries of image_dir.
embeddings = []
for img_name in os.listdir(image_dir)[:100]:
    embeddings.append(get_embedding(os.path.join(image_dir, img_name)))

In [None]:
# Preview the first few vectors as an array (NumPy abbreviates long rows)
# instead of dumping every full-length vector into the cell output.
np.asarray(embeddings)[:5]

In [None]:
# Build an explicit two-column frame so the plot does not depend on how
# value_counts() names its result: pandas >= 2.0 calls the values 'count',
# older versions reuse the source column name, which makes y='count' fail.
counts_df = landmark_counts.rename_axis('landmark_id').reset_index(name='count')
# Treat ids as labels rather than numbers on the x axis.
counts_df['landmark_id'] = counts_df['landmark_id'].astype(str)

fig = px.histogram(counts_df, x='landmark_id', y='count', nbins=50)
fig.show()

In [None]:
# The 15 most frequent landmarks (value_counts() sorts descending).
top15 = landmark_counts.head(15)

# Explicit column names keep the plot independent of how value_counts()
# names its result: pandas >= 2.0 calls the values 'count', older versions
# reuse the source column name, which makes y='count' fail.
top15_df = top15.rename_axis('landmark_id').reset_index(name='count')
# Treat ids as labels rather than numbers on the x axis.
top15_df['landmark_id'] = top15_df['landmark_id'].astype(str)

fig = px.bar(top15_df, x='landmark_id', y='count')
fig.show()

In [None]:
# One row per sampled image; 'mean_rgb' holds the per-image colour vector.
stats_columns = ['width', 'height', 'aspect_ratio', 'mean_rgb']
stats_df = pd.DataFrame(image_stats, columns=stats_columns)

# Split the colour vector into one scalar column per channel.
for channel_idx, channel_col in enumerate(('mean_r', 'mean_g', 'mean_b')):
    stats_df[channel_col] = stats_df['mean_rgb'].apply(
        lambda rgb, i=channel_idx: rgb[i]
    )


In [None]:
# Distribution of width / height ratios across the sampled images.
fig = px.histogram(
    stats_df,
    x='aspect_ratio',
    nbins=50,
)
fig.show()

In [None]:
# Mean red against mean green intensity, one point per sampled image.
fig = px.scatter(
    data_frame=stats_df,
    x='mean_r',
    y='mean_g',
)
fig.show()


In [None]:
# Preview the first rows rather than rendering the whole frame.
stats_df.head()

In [None]:
# Stack the per-image vectors into a single array.
embeddings = np.array(embeddings)

# Project to two dimensions; the fixed random_state keeps the layout
# repeatable between runs.
tsne = TSNE(random_state=42, n_components=2)
embeddings_2d = tsne.fit_transform(embeddings)

# Columns 0 and 1 of the projection become the x and y axes.
fig = px.scatter(
    embeddings_2d,
    x=0,
    y=1,
)
fig.show()

In [None]:
# Preview the first few projected points instead of printing every row.
embeddings_2d[:5]