In [None]:
import torch

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots


In [None]:
# Turn on the TF32 switches for CUDA matmul and for cuDNN.
# NOTE(review): no torch computation appears in the visible cells — confirm
# these flags are still needed by this notebook.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
# Load the feature table used by every later cell.
# The path is relative to the notebook's working directory.
train_df = pd.read_parquet("data/joined_features_all.parquet")

In [None]:
# Preview five random rows. The seed is fixed so the displayed rows are the
# same on every Restart & Run All.
train_df.sample(5, random_state=42)

In [None]:
# Make local_binary_pattern entries numpy ndarrays
def to_ndarray(v):
    """Convert one ``local_binary_pattern`` entry to a 1-D float ndarray.

    Parameters
    ----------
    v : str, list, tuple, numpy.ndarray, None or NaN
        A string such as ``"[0.1, 0.2 0.3]"`` (numbers separated by commas
        and/or whitespace, optionally wrapped in square brackets), an
        already-parsed sequence/array, or a missing value.

    Returns
    -------
    numpy.ndarray or None
        The parsed values as floats. Missing input (``None``/NaN) is returned
        as ``None`` so that a later ``dropna`` can still remove the row.

    Raises
    ------
    ValueError
        If a token in the string cannot be parsed as a float.
    TypeError
        If ``v`` is of an unsupported type.
    """
    if isinstance(v, str):
        # Drop the surrounding brackets, then treat commas like whitespace so
        # both "1, 2" and "1 2" style separators are accepted.
        tokens = v.strip().strip('[]').replace(',', ' ').split()
        return np.array([float(tok) for tok in tokens], dtype=float)
    if isinstance(v, (list, tuple, np.ndarray)):
        # Already parsed (e.g. the cell applying this function was re-run):
        # normalise to a float array instead of failing on a missing .strip().
        return np.asarray(v, dtype=float)
    if v is None or pd.isna(v):
        return None
    raise TypeError(
        f"cannot convert {type(v).__name__} to a local_binary_pattern array"
    )

In [None]:
# Parse every local_binary_pattern entry with to_ndarray.
# This overwrites the column on train_df in place.
train_df['local_binary_pattern'] = train_df['local_binary_pattern'].apply(to_ndarray)

In [None]:
# Store landmark_id as strings (overwrites the column in place).
# Presumably so the IDs behave as discrete labels in the plots below — TODO confirm.
train_df['landmark_id'] = train_df['landmark_id'].astype(str)

In [None]:
# Number of rows per landmark_id; value_counts orders the result from most
# to least frequent, so head() shows the largest landmarks.
landmark_counts = train_df['landmark_id'].value_counts()
landmark_counts.head()

In [None]:
# Distinct landmark IDs, in order of first appearance in train_df (not sorted
# by frequency). The cell output is how many distinct IDs there are.
unique_landmarks = train_df['landmark_id'].unique()
len(unique_landmarks)

There are 81313 unique landmarks in the dataset.

In [None]:
# Bar chart of the 25 most frequent landmark IDs (landmark_counts is already
# ordered by descending count, so head(25) is the top 25).
top25 = landmark_counts.head(25)
fig = px.bar(
    x=top25.index.astype(str),
    y=top25.values,
    labels={'x': 'Landmark ID', 'y': 'Count'},
    # NOTE(review): the landmark ID and ratio in this title are hard-coded,
    # not computed from top25 — re-check them if the data changes.
    title='Top 25 Landmark IDs by Count: 138982 has 3 times as much as next highest landmark'
)
# Tilt the ID tick labels.
fig.update_layout(xaxis_tickangle=-45)
fig.show()
# NOTE(review): writes into images/ relative to the working directory —
# confirm that directory exists before running.
fig.write_image('images/landmark_count.png')

The fact that landmark 138982 has 3 times as many images as the next highest landmark is concerning, because such an imbalance could bias a model toward that landmark. We should keep an eye on it and may need to reduce the number of samples from that landmark ID.

In [None]:
# Histogram of the aspect_ratio column over all rows, using 20 bins.
fig = px.histogram(
    train_df,
    x='aspect_ratio',
    nbins=20,
    labels={'aspect_ratio': 'Aspect Ratio', 'count': 'Count'},
    title='Aspect ratio (proportions) is similar for most images'
)
# Same 1200x500 size for the on-screen figure and the exported PNG.
fig.update_layout(height=500, width=1200)
fig.show()
fig.write_image('images/aspect_ratio_histogram.png', height=500, width=1200)

In [None]:

# One panel per pair of colour-channel means: (x column, y column, x title, y title).
channel_pairs = [
    ('mean_r', 'mean_g', "Mean R", "Mean G"),
    ('mean_g', 'mean_b', "Mean G", "Mean B"),
    ('mean_r', 'mean_b', "Mean R", "Mean B"),
]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("Mean R vs Mean G", "Mean G vs Mean B", "Mean R vs Mean B")
)

for col_idx, (x_col, y_col, x_title, y_title) in enumerate(channel_pairs, start=1):
    # Without a colour grouping, px.scatter yields a single trace; move it
    # into the matching subplot and label that panel's axes.
    panel_trace = px.scatter(train_df, x=x_col, y=y_col).data[0]
    fig.add_trace(panel_trace, row=1, col=col_idx)
    fig.update_xaxes(title_text=x_title, row=1, col=col_idx)
    fig.update_yaxes(title_text=y_title, row=1, col=col_idx)

fig.update_layout(
    title_text="Pairwise relationships among Mean R, Mean G, and Mean B across all images. Looks noisy at first",
)

fig.show()
# The PNG is exported at a fixed 1200x500 regardless of the on-screen size.
fig.write_image('images/color_channel_pairwise_all_landmarks.png', height=500, width=1200)


In [None]:
# Take the first five landmark IDs from unique_landmarks and copy their rows
# into a separate frame. This selects five IDs by position in that array,
# not the five most frequent landmarks.
selected_ids = unique_landmarks[:5]
five_landmarks_df = train_df[train_df['landmark_id'].isin(selected_ids)].copy()

In [None]:
# One panel per pair of colour-channel means: (x column, y column, x title, y title).
channel_pairs = [
    ('mean_r', 'mean_g', "Mean R", "Mean G"),
    ('mean_g', 'mean_b', "Mean G", "Mean B"),
    ('mean_r', 'mean_b', "Mean R", "Mean B"),
]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("Mean R vs Mean G", "Mean G vs Mean B", "Mean R vs Mean B")
)

for col_idx, (x_col, y_col, x_title, y_title) in enumerate(channel_pairs, start=1):
    # Colouring by landmark_id gives one trace per landmark in each panel.
    panel_fig = px.scatter(five_landmarks_df, x=x_col, y=y_col, color='landmark_id')
    for tr in panel_fig.data:
        # Only the first panel contributes legend entries; the other panels
        # would otherwise repeat every landmark in the legend.
        if col_idx > 1:
            tr.showlegend = False
        fig.add_trace(tr, row=1, col=col_idx)
    fig.update_xaxes(title_text=x_title, row=1, col=col_idx)
    fig.update_yaxes(title_text=y_title, row=1, col=col_idx)

fig.update_layout(
    title_text="Color channels across 5 landmarks. Indicates color channels could be leverage for landmark identification",
)

fig.show()
# The PNG is exported at a fixed 1200x500 regardless of the on-screen size.
fig.write_image('images/color_channel_pairwise_5_landmarks.png', height=500, width=1200)


In [None]:

# 3-D scatter of the three colour-channel means for the five selected
# landmarks, one colour per landmark_id. Shown only; no image is exported.
fig = px.scatter_3d(
    five_landmarks_df,
    x='mean_r',
    y='mean_g',
    z='mean_b',
    color='landmark_id',
    title='3D Scatter of mean RGB for 5 Landmark IDs'
)
fig.update_layout(width=1200, height=900)
fig.show()


In [None]:
def mean_histogram(arrs):
    """Return the element-wise mean of a collection of equal-length histograms.

    Each item of ``arrs`` becomes one row of a 2-D array; the result is the
    mean of every column, i.e. one value per histogram bin.
    """
    return np.mean(np.vstack(arrs), axis=0)

In [None]:
# Plot the Mean Local Binary Pattern histogram
# Work on a copy that only holds rows with an LBP histogram present.
lbp_df = five_landmarks_df.dropna(subset=['local_binary_pattern']).copy()

# Number of bins, read off the first histogram.
first_hist = lbp_df.iloc[0]['local_binary_pattern']
n_bins = len(first_hist)

# Average the histograms of all images belonging to the same landmark.
mean_hists = (
    lbp_df
    .groupby('landmark_id', observed=True)['local_binary_pattern']
    .apply(mean_histogram)
)

# Long format for plotting: one row per (landmark, bin).
records = [
    {'landmark_id': landmark_id, 'bin': bin_idx, 'value': float(bin_value)}
    for landmark_id, hist in mean_hists.items()
    for bin_idx, bin_value in enumerate(hist)
]
plot_df = pd.DataFrame(records)

fig = px.line(
    plot_df,
    x='bin',
    y='value',
    color='landmark_id',
    markers=True,
    labels={'bin': 'Local Binary Pattern Bin', 'value': 'Mean Frequency', 'landmark_id': 'Landmark ID'},
    title='Mean Local Binary Pattern Histogram per Landmark. Bin 12 & 13 shows differences texture'
)
fig.update_layout(width=1200, height=700)
fig.show()
fig.write_image('images/lbp_mean_histogram_5_landmarks.png', width=1200, height=700)


Local Binary Patterns (LBP) is a texture descriptor. It describes texture by comparing the intensity of each pixel with the intensities of its neighboring pixels. For each neighbor, LBP assigns 1 if the neighbor's intensity is greater than that of the pixel currently being assessed; otherwise, it assigns 0. The algorithm then combines the binary values of all the neighboring pixels into a single value for the pixel being assessed. It does this for every pixel in the image, producing binary codes that together represent the texture of the image.

In [None]:
# Scatter of the x / y columns (labelled as the 2-D embedding coordinates)
# for the five selected landmarks, one colour per landmark_id.
fig = px.scatter(
    five_landmarks_df,
    x='x',
    y='y',
    color='landmark_id',
    labels={'x': 'Embedding 2d X', 'y': 'Embedding 2d Y', 'landmark_id': 'Landmark ID'},
    title='2d Embedding shows promise in clustering images by embeddings'
)
# Same 1200x700 size for the on-screen figure and the exported PNG.
fig.update_layout(width=1200, height=700)
fig.show()
fig.write_image('images/embedding_2d_scatter_5_landmarks.png', width=1200, height=700)

In [None]:
# Same embedding scatter, widened to the first 25 entries of unique_landmarks
# (selected by position in that array, not the 25 most frequent landmarks).
fig = px.scatter(
    train_df[train_df['landmark_id'].isin(unique_landmarks[:25])],
    x='x',
    y='y',
    color='landmark_id',
    labels={'x': 'Embedding 2d X', 'y': 'Embedding 2d Y', 'landmark_id': 'Landmark ID'},
    title='2d Embedding shows promise in clustering images by embeddings 25 landmarks'
)
# Same 1200x700 size for the on-screen figure and the exported PNG.
fig.update_layout(width=1200, height=700)
fig.show()
fig.write_image('images/embedding_2d_scatter_25_landmarks.png', width=1200, height=700)