In [None]:
import os
import torch

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots


In [None]:
# Switch on the TF32 flag on both backends that expose it: CUDA matmul and cuDNN.
for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    backend.allow_tf32 = True

In [None]:
# Load the joined feature table that every later cell reads from.
train_df = pd.read_parquet(path="data/joined_features_all.parquet")

In [None]:
# Peek at five random rows. The fixed seed keeps the displayed rows identical
# across "Restart Kernel -> Run All", so the saved output stays reproducible.
train_df.sample(5, random_state=42)

In [None]:
# Rows per landmark_id; value_counts() returns them sorted by descending count.
landmark_counts = train_df.loc[:, "landmark_id"].value_counts()
# Show the five most frequent landmark IDs.
landmark_counts.head(5)

In [None]:
# Distinct landmark IDs in order of first appearance (no sorting is applied).
unique_landmarks = pd.unique(train_df["landmark_id"])
# Number of distinct landmarks.
len(unique_landmarks)

There are 81313 unique landmarks in the dataset.

In [None]:
# Bar chart of the 25 most frequent landmark IDs.
top25 = landmark_counts.head(25)
fig = px.bar(
    x=top25.index.astype(str),
    y=top25.values,
    labels={'x': 'Landmark ID', 'y': 'Count'},
    title='Top 25 Landmark IDs by Count: 138982 has 3 times as much as next highest landmark'
)
# Plotly auto-detects digit-only strings as numbers by default, which would
# place the bars on a linear axis at the ID values. Force a categorical axis so
# the bars are evenly spaced and keep the descending-count order of `top25`.
fig.update_xaxes(type='category')
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# write_image does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.write_image('images/landmark_count.png')

Landmark 138982 has 3 times as many images as the next most frequent landmark, which is concerning: an imbalance that large could bias a model toward that landmark. We should keep an eye on it and may need to reduce the number of samples from that landmark ID.

In [None]:
# Distribution of the `aspect_ratio` column across all rows of train_df.
fig = px.histogram(
    train_df,
    x='aspect_ratio',
    nbins=20,
    labels={'aspect_ratio': 'Aspect Ratio', 'count': 'Count'},
    title='Aspect ratio (proportions) is similar for most images'
)
fig.update_layout(height=1000, width=1600)
fig.show()

# write_image does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.write_image('images/aspect_ratio_histogram.png')

In [None]:

# Pairwise scatter plots of the mean colour-channel columns for every row of
# train_df: one subplot per (x, y) channel pair.
channel_labels = {'mean_r': 'Mean R', 'mean_g': 'Mean G', 'mean_b': 'Mean B'}
channel_pairs = [('mean_r', 'mean_g'), ('mean_g', 'mean_b'), ('mean_r', 'mean_b')]

fig = make_subplots(
    rows=1,
    cols=len(channel_pairs),
    subplot_titles=[
        f"{channel_labels[x_col]} vs {channel_labels[y_col]}"
        for x_col, y_col in channel_pairs
    ],
)

for col, (x_col, y_col) in enumerate(channel_pairs, start=1):
    # Without a `color` argument px.scatter produces a single trace, so
    # data[0] holds all the points for this panel.
    fig.add_trace(px.scatter(train_df, x=x_col, y=y_col).data[0], row=1, col=col)
    fig.update_xaxes(title_text=channel_labels[x_col], row=1, col=col)
    fig.update_yaxes(title_text=channel_labels[y_col], row=1, col=col)

fig.update_layout(
    title_text="Pairwise relationships among Mean R, Mean G, and Mean B across all images. Looks noisy at first",
)

fig.show()

# write_image does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.write_image('images/color_channel_pairwise_all_landmarks.png', height=500, width=1200)


In [None]:
# Take the first five entries of unique_landmarks and keep only the rows of
# train_df that belong to them.
selected_ids = unique_landmarks[:5]
# .copy() gives an independent frame rather than a slice of train_df.
five_landmarks_df = train_df.loc[
    lambda frame: frame["landmark_id"].isin(selected_ids)
].copy()


In [None]:
# Pairwise scatter plots of the mean colour-channel columns for the rows in
# five_landmarks_df, coloured by landmark: one subplot per (x, y) channel pair.
channel_labels = {'mean_r': 'Mean R', 'mean_g': 'Mean G', 'mean_b': 'Mean B'}
channel_pairs = [('mean_r', 'mean_g'), ('mean_g', 'mean_b'), ('mean_r', 'mean_b')]

# Plot the IDs as strings so Plotly Express assigns one discrete colour (and
# one trace) per landmark. If landmark_id is stored as a number it would
# otherwise be mapped onto a continuous colour scale.
landmarks_for_plot = five_landmarks_df.assign(
    landmark_id=five_landmarks_df['landmark_id'].astype(str)
)

fig = make_subplots(
    rows=1,
    cols=len(channel_pairs),
    subplot_titles=[
        f"{channel_labels[x_col]} vs {channel_labels[y_col]}"
        for x_col, y_col in channel_pairs
    ],
)

for col, (x_col, y_col) in enumerate(channel_pairs, start=1):
    panel = px.scatter(landmarks_for_plot, x=x_col, y=y_col, color='landmark_id')
    # Copy every trace: a discrete colour yields one trace per landmark, so
    # taking only data[0] would silently drop all landmarks but the first.
    for trace in panel.data:
        # Show each landmark in the legend once (first panel only) instead of
        # repeating the same entry for every subplot.
        trace.showlegend = col == 1
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=channel_labels[x_col], row=1, col=col)
    fig.update_yaxes(title_text=channel_labels[y_col], row=1, col=col)

fig.update_layout(
    title_text="Color channels across 5 landmarks. Indicates color channels could be leverage for landmark identification",
    legend_title_text='landmark_id',
)

fig.show()

# write_image does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.write_image('images/color_channel_pairwise_5_landmarks.png', height=500, width=1200)

In [None]:

# 3D scatter of the three mean colour-channel columns for the rows in
# five_landmarks_df, one colour per landmark.
fig = px.scatter_3d(
    # Cast the IDs to strings so they are coloured as discrete categories; a
    # numeric landmark_id column would otherwise get a continuous colour scale.
    five_landmarks_df.assign(
        landmark_id=five_landmarks_df['landmark_id'].astype(str)
    ),
    x='mean_r',
    y='mean_g',
    z='mean_b',
    color='landmark_id',
    title='3D Scatter of mean RGB for 5 Landmark IDs'
)
fig.update_layout(width=1200, height=900)
fig.show()
