In [1]:
import os
import torch

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots


In [2]:
# Switch on the allow_tf32 flag for CUDA matmul and for cuDNN (same order as before:
# chained targets are assigned left to right).
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

In [3]:
# Load the joined feature table (parquet) into a DataFrame.
train_df = pd.read_parquet(path="data/joined_features_all.parquet")

In [4]:
# Preview five random rows. The fixed seed keeps the displayed rows identical
# across Restart & Run All, so the saved output stays in sync with the code.
train_df.sample(5, random_state=42)

Unnamed: 0,id,landmark_id,image_path,width,height,aspect_ratio,mean_rgb,mean_r,mean_g,mean_b,x,y,local_binary_pattern
18655,e0157088fb7e6989,2415,e0157088fb7e6989.jpg,800,301,2.657807,[161.64229651 163.84006645 155.58647841],161.642297,163.840066,155.586478,20.792238,68.182381,"[0.04472176079734219,0.02489202657807309,0.018..."
129511,d7baf360d7a81268,16925,d7baf360d7a81268.jpg,800,535,1.495327,[109.23122897 128.75526636 151.6790771 ],109.231229,128.755266,151.679077,-46.011292,-54.657196,"[0.036299065420560744,0.025196261682242992,0.0..."
1428711,8de40209c7bc0743,183574,8de40209c7bc0743.jpg,800,531,1.506591,[155.39600753 161.0506662 150.21396657],155.396008,161.050666,150.213967,-26.586796,72.499077,"[0.03875941619585688,0.025143596986817326,0.01..."
153289,292cbcdb551fb090,20064,292cbcdb551fb090.jpg,533,800,0.66625,[119.50498827 127.30897983 118.21045028],119.504988,127.30898,118.21045,-62.145607,27.255575,"[0.060046904315197,0.031772983114446526,0.0237..."
287909,4fc840158e2d87f6,36748,4fc840158e2d87f6.jpg,800,600,1.333333,[121.15250833 122.26193333 116.68889375],121.152508,122.261933,116.688894,53.060154,47.026871,"[0.04049375,0.02191041666666667,0.019291666666..."


In [13]:
# Make local_binary_pattern entries numpy ndarrays
def to_ndarray(v):
    """Convert one ``local_binary_pattern`` entry into a 1-D float ndarray.

    Parameters
    ----------
    v : str, list, tuple, numpy.ndarray, None or float NaN
        Usually the serialized form, e.g. ``"[0.04,0.02,0.01]"``; the numbers
        may be separated by commas and/or whitespace. Already-parsed
        sequences are accepted too, so re-running the conversion cell is safe.

    Returns
    -------
    numpy.ndarray or None
        Float array of the parsed values, or ``None`` when the entry is
        missing (``None`` / NaN), so a later ``dropna`` can discard it.
    """
    # Missing entries stay missing instead of raising AttributeError on .strip().
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return None
    # Already parsed (e.g. the cell was executed a second time): just normalise.
    if isinstance(v, (list, tuple, np.ndarray)):
        return np.asarray(v, dtype=float)
    # Drop surrounding whitespace and brackets, then split on commas/whitespace.
    tokens = v.strip().strip('[]').replace(',', ' ').split()
    return np.array([float(tok) for tok in tokens], dtype=float)

In [14]:
# Parse every serialized LBP entry with to_ndarray, overwriting the column.
# NOTE: this is only safe to re-run if to_ndarray accepts already-parsed values.
train_df['local_binary_pattern'] = train_df['local_binary_pattern'].map(to_ndarray)

In [6]:
# Number of rows per landmark ID, most frequent first; show the top five.
landmark_counts = train_df['landmark_id'].value_counts(sort=True, ascending=False)
landmark_counts.head(n=5)

landmark_id
138982    6272
126637    2231
20409     1758
83144     1741
113209    1135
Name: count, dtype: int64

In [7]:
# Distinct landmark IDs, followed by how many of them there are.
unique_landmarks = pd.unique(train_df['landmark_id'])
len(unique_landmarks)

81313

There are 81313 unique landmarks in the dataset.

In [None]:
# Bar chart of the 25 most frequent landmark IDs.
top25 = landmark_counts.head(25)
fig = px.bar(
    x=top25.index.astype(str),
    y=top25.values,
    labels={'x': 'Landmark ID', 'y': 'Count'},
    title='Top 25 Landmark IDs by Count: 138982 has 3 times as much as next highest landmark'
)
# The IDs are numeric-looking strings, which Plotly's axis auto-detection treats as
# numbers (linear axis). Force a categorical axis so each landmark gets one evenly
# spaced bar, in the order of `top25`.
fig.update_xaxes(type='category')
fig.update_layout(xaxis_tickangle=-45)
fig.show()
# write_image does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.write_image('images/landmark_count.png')

Landmark 138982 has nearly three times as many images (6,272) as the next most frequent landmark (2,231), which is a concerning imbalance. We should keep an eye on it and may need to reduce the number of samples from that landmark ID.

In [None]:
# Distribution of image aspect ratios (the `aspect_ratio` column) in 20 bins.
fig = px.histogram(
    train_df,
    x='aspect_ratio',
    nbins=20,
    labels={'aspect_ratio': 'Aspect Ratio'},
    title='Aspect ratio (proportions) is similar for most images'
)
# Plotly Express does not route the histogram's implicit "count" axis through
# `labels`, so the y-axis title has to be set explicitly.
fig.update_yaxes(title_text='Count')
fig.update_layout(height=1000, width=1600)
fig.show()
# write_image does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.write_image('images/aspect_ratio_histogram.png')

In [None]:

# Three side-by-side scatter panels, one per pair of mean colour channels,
# drawn over every row of train_df.
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("Mean R vs Mean G", "Mean G vs Mean B", "Mean R vs Mean B")
)

# No `color=` argument is passed, so `.data[0]` is taken as the whole scatter.
trace_rg = px.scatter(train_df, x='mean_r', y='mean_g').data[0]
trace_gb = px.scatter(train_df, x='mean_g', y='mean_b').data[0]
trace_rb = px.scatter(train_df, x='mean_r', y='mean_b').data[0]

# (trace, x-axis title, y-axis title) for each panel, left to right.
panels = [
    (trace_rg, "Mean R", "Mean G"),
    (trace_gb, "Mean G", "Mean B"),
    (trace_rb, "Mean R", "Mean B"),
]
for col, (trace, x_title, y_title) in enumerate(panels, start=1):
    fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

fig.update_layout(
    title_text="Pairwise relationships among Mean R, Mean G, and Mean B across all images. Looks noisy at first",
)

fig.show()
# write_image does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.write_image('images/color_channel_pairwise_all_landmarks.png', height=500, width=1200)


In [15]:
# Take the first five entries of unique_landmarks and keep only their rows,
# as an independent copy of that slice of train_df.
selected_ids = unique_landmarks[0:5]
in_selection = train_df['landmark_id'].isin(selected_ids)
five_landmarks_df = train_df.loc[in_selection].copy()


In [None]:
# Landmark IDs are labels, not magnitudes: cast them to strings so Plotly Express
# colours them discretely and emits one trace per landmark.
pairwise_df = five_landmarks_df.assign(
    landmark_id=five_landmarks_df['landmark_id'].astype(str)
)

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("Mean R vs Mean G", "Mean G vs Mean B", "Mean R vs Mean B")
)

# (x column, y column, x-axis title, y-axis title) for each panel, left to right.
channel_panels = [
    ('mean_r', 'mean_g', "Mean R", "Mean G"),
    ('mean_g', 'mean_b', "Mean G", "Mean B"),
    ('mean_r', 'mean_b', "Mean R", "Mean B"),
]

for col, (x_col, y_col, x_title, y_title) in enumerate(channel_panels, start=1):
    panel = px.scatter(pairwise_df, x=x_col, y=y_col, color='landmark_id')
    # Copy every per-landmark trace; taking only `.data[0]` would keep a single
    # trace and lose the per-landmark colouring.
    for trace in panel.data:
        # Show each landmark once in the legend (first panel only).
        trace.showlegend = (col == 1)
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=x_title, row=1, col=col)
    fig.update_yaxes(title_text=y_title, row=1, col=col)

fig.update_layout(
    title_text="Color channels across 5 landmarks. Indicates color channels could be leverage for landmark identification",
    legend_title_text="Landmark ID",
)

fig.show()
# write_image does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.write_image('images/color_channel_pairwise_5_landmarks.png', height=500, width=1200)

In [None]:

# 3D scatter of the mean R/G/B values for the five selected landmarks.
# The IDs are cast to strings so each landmark gets its own discrete colour and
# legend entry instead of a position on a continuous colour scale.
fig = px.scatter_3d(
    five_landmarks_df.assign(landmark_id=five_landmarks_df['landmark_id'].astype(str)),
    x='mean_r',
    y='mean_g',
    z='mean_b',
    color='landmark_id',
    title='3D Scatter of mean RGB for 5 Landmark IDs'
)
fig.update_layout(width=1200, height=900)
fig.show()


In [16]:
def mean_histogram(arrs):
    """Return the element-wise mean of a collection of equal-length histograms.

    The inputs are stacked row-wise into a 2-D array and averaged over the
    rows, giving one mean value per bin.
    """
    return np.mean(np.vstack(arrs), axis=0)

In [21]:
# Plot the mean Local Binary Pattern histogram of each selected landmark.
# Rows without an LBP entry are dropped first.
lbp_df = five_landmarks_df.dropna(subset=['local_binary_pattern']).copy()

# Bin count taken from the first row (not used further in this cell).
first_hist = lbp_df.iloc[0]['local_binary_pattern']
n_bins = len(first_hist)
# Get histogram mean grouped by landmark
mean_hists = lbp_df.groupby('landmark_id', observed=True)['local_binary_pattern'].apply(mean_histogram)

# Long format for Plotly Express: one record per (landmark, bin).
records = [
    {'landmark_id': landmark_id, 'bin': i, 'value': float(v)}
    for landmark_id, hist in mean_hists.items()
    for i, v in enumerate(hist)
]
plot_df = pd.DataFrame(records)

fig = px.line(
    plot_df,
    x='bin',
    y='value',
    color='landmark_id',
    markers=True,
    labels={'bin': 'Local Binary Pattern Bin', 'value': 'Mean Frequency', 'landmark_id': 'Landmark ID'},
    title='Mean Local Binary Pattern Histogram per Landmark. Bin 12 & 13 shows differences texture'
)
fig.update_layout(width=1200, height=700)
fig.show()
# write_image does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.write_image('images/lbp_mean_histogram_5_landmarks.png')


Local Binary Patterns (LBP) is a texture descriptor. It works by comparing each pixel's intensity with the intensities of its neighboring pixels: a neighbor is assigned 1 if its intensity is greater than that of the pixel currently being assessed, and 0 otherwise. The algorithm then combines the binary values of all the neighboring pixels into a single value for the pixel being assessed. Repeating this for every pixel in the image produces a binary code representing the texture of the image.