In [1]:
import os
import torch

import numpy as np
import pandas as pd

from pathlib import Path
from PIL import Image
from sklearn.manifold import TSNE
from torchvision.models import resnet50, ResNet50_Weights
from skimage.feature import local_binary_pattern
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed


In [2]:

# Permit TF32 in CUDA matmuls and in cuDNN operations.
# NOTE(review): no visible cell moves a model or tensor to a CUDA device, so
# these flags may have no effect on the code shown here — confirm.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [3]:
# Load the training table from CSV into `train_df`.
train_df = pd.read_csv("data/train.csv")

In [4]:
# Number of rows per landmark_id; print the first few entries of the counts.
landmark_counts = train_df['landmark_id'].value_counts()
print(landmark_counts.head())

landmark_id
138982    6272
126637    2231
20409     1758
83144     1741
113209    1135
Name: count, dtype: int64


In [5]:
def get_all_image_paths(location):
    """Return every JPEG file below ``data/<location>`` as a sorted list of Paths.

    Args:
        location: Sub-directory of ``data`` to search recursively.

    Returns:
        list[Path]: Paths whose extension is ``.jpg`` in any letter case,
        in sorted order.
    """
    # rglob() yields entries in whatever order the filesystem reports them.
    # Callers slice this list by position into fixed-size chunks, so the order
    # must be reproducible for a chunk index to always mean the same images.
    return sorted(Path(f"data/{location}").rglob("*.[jJ][pP][gG]"))

In [6]:
# Collect the JPEG paths found under data/train.
train_images = get_all_image_paths('train')

In [7]:
# Number of image paths that were found.
len(train_images)

1580470

In [6]:
def extract_image_stats(image_path):
    """Compute size and mean-colour statistics for a single image file.

    Args:
        image_path: Path of the image to open.

    Returns:
        tuple: ``(file_name, width, height, aspect_ratio, mean_rgb_str,
        mean_r, mean_g, mean_b)`` where ``file_name`` is the base name of
        ``image_path``, ``aspect_ratio`` is ``width / height`` and
        ``mean_rgb_str`` is ``str()`` of the per-channel mean array.
    """
    # The context manager releases the file handle deterministically.
    with Image.open(image_path) as img:
        width, height = img.size
        # Force three channels: without this, grayscale/CMYK/RGBA files either
        # fail in reshape(-1, 3) or silently mix pixels into wrong "channels".
        pixels = np.array(img.convert('RGB')).reshape(-1, 3)
    aspect_ratio = width / height
    colors = pixels.mean(axis=0)  # Mean RGB
    return (
        os.path.basename(image_path),
        width,
        height,
        aspect_ratio,
        str(colors),
        colors[0],
        colors[1],
        colors[2],
    )


In [7]:
def image_stats_to_parquet(images, parquet_name):
    """Compute statistics for each image and save them to ``data/<parquet_name>.parquet``.

    Args:
        images: Iterable of image paths; each is passed to ``extract_image_stats``.
        parquet_name: File name (without extension) of the parquet to write.
    """
    column_names = [
        'image_path', 'width', 'height', 'aspect_ratio',
        'mean_rgb', 'mean_r', 'mean_g', 'mean_b',
    ]
    # One tuple per image, in the same order as `column_names`.
    rows = [extract_image_stats(path) for path in images]
    stats_frame = pd.DataFrame(rows, columns=column_names)
    stats_frame.to_parquet(f'data/{parquet_name}.parquet')


In [13]:
# Chunking configuration: the image list is processed in fixed-size slices.
CHUNK_SIZE = 100_000

total_images = len(train_images)
# Ceiling division, so a trailing partial slice still counts as one chunk.
num_chunks = -(-total_images // CHUNK_SIZE)

In [22]:

# Walk the image list in CHUNK_SIZE steps; each slice becomes one parquet file.
for idx, start in enumerate(range(0, total_images, CHUNK_SIZE)):
    end = min(start + CHUNK_SIZE, total_images)
    parquet_name = f"image_stats_{idx}"
    chunk = train_images[start:end]
    print(f"Processing chunk {idx + 1}/{num_chunks}: images {start} to {end - 1} -> {parquet_name}.parquet")
    image_stats_to_parquet(chunk, parquet_name)


Processing chunk 1/16: images 0 to 99999 -> image_stats_0.parquet
Processing chunk 2/16: images 100000 to 199999 -> image_stats_1.parquet
Processing chunk 3/16: images 200000 to 299999 -> image_stats_2.parquet
Processing chunk 4/16: images 300000 to 399999 -> image_stats_3.parquet
Processing chunk 5/16: images 400000 to 499999 -> image_stats_4.parquet
Processing chunk 6/16: images 500000 to 599999 -> image_stats_5.parquet
Processing chunk 7/16: images 600000 to 699999 -> image_stats_6.parquet
Processing chunk 8/16: images 700000 to 799999 -> image_stats_7.parquet
Processing chunk 9/16: images 800000 to 899999 -> image_stats_8.parquet
Processing chunk 10/16: images 900000 to 999999 -> image_stats_9.parquet
Processing chunk 11/16: images 1000000 to 1099999 -> image_stats_10.parquet
Processing chunk 12/16: images 1100000 to 1199999 -> image_stats_11.parquet
Processing chunk 13/16: images 1200000 to 1299999 -> image_stats_12.parquet
Processing chunk 14/16: images 1300000 to 1399999 -> imag

In [27]:
def get_all_parquets():
    """Return a list of every ``*.parquet`` file found recursively under ``data``."""
    data_root = Path("data")
    return [*data_root.rglob("*.parquet")]

def parse_rgb(v):
    """Convert a stored mean-RGB value into a list of floats.

    Args:
        v: Either a sequence (list, tuple or ndarray) of numbers, or a string
           such as ``"[1.0 2.0 3.0]"`` / ``"[1.0, 2.0, 3.0]"``.

    Returns:
        list: The parsed floats. A string containing a non-numeric token, or a
        value of any other type, yields ``[nan, nan, nan]``.
    """
    nan_triplet = [np.nan, np.nan, np.nan]

    if isinstance(v, str):
        # Drop surrounding whitespace and brackets, then accept either commas
        # or spaces as separators.
        tokens = v.strip().strip('[]').replace(',', ' ').split()
        try:
            return list(map(float, tokens))
        except ValueError:
            return nan_triplet

    if isinstance(v, (list, tuple, np.ndarray)):
        return [float(item) for item in v]

    return nan_triplet

def update_parquets():
    """Recompute ``mean_r``/``mean_g``/``mean_b`` from ``mean_rgb`` and rewrite each parquet in place.

    Every parquet returned by ``get_all_parquets()`` is read. Files that have a
    ``mean_rgb`` column get their three per-channel columns replaced with the
    values parsed by ``parse_rgb`` and are written back to the same path.
    Files without that column are left untouched.
    """
    for p in get_all_parquets():
        df = pd.read_parquet(p)
        # get_all_parquets() matches every *.parquet under data/, including
        # files this notebook writes with other schemas (e.g. the embedding
        # parquets only have image_path/embedding). Indexing 'mean_rgb' on
        # those would raise KeyError and abort the whole pass.
        if 'mean_rgb' not in df.columns:
            continue
        rgb_list = df['mean_rgb'].apply(parse_rgb)
        # .str[i] indexes into each list and yields NaN where the item is missing.
        df['mean_r'] = rgb_list.str[0]
        df['mean_g'] = rgb_list.str[1]
        df['mean_b'] = rgb_list.str[2]
        df.to_parquet(p)
update_parquets()

In [29]:
# Read one image-stats parquet back and display it.
df = pd.read_parquet('data/image_stats_1.parquet')
df

Unnamed: 0,image_path,width,height,aspect_ratio,mean_rgb,mean_r,mean_g,mean_b
0,10390cdc0658ef06.jpg,640,480,1.333333,[133.27492187 135.35043294 139.50327148],133.274922,135.350433,139.503271
1,10390e9f4cb822b1.jpg,800,533,1.500938,[66.7822303 64.66316604 58.24064493],66.782230,64.663166,58.240645
2,103911f72a2844bb.jpg,533,800,0.666250,[121.39125469 124.84773218 123.14964353],121.391255,124.847732,123.149644
3,10392045d063f550.jpg,800,550,1.454545,[127.37159773 114.27438636 98.03523182],127.371598,114.274386,98.035232
4,103924608827b51e.jpg,800,533,1.500938,[144.08233583 142.94427533 143.44078565],144.082336,142.944275,143.440786
...,...,...,...,...,...,...,...,...
99995,204676756dd16b0d.jpg,800,600,1.333333,[153.03366042 148.52583333 141.40353958],153.033660,148.525833,141.403540
99996,2046787128214206.jpg,800,600,1.333333,[123.47010625 125.82466667 114.92535833],123.470106,125.824667,114.925358
99997,20467dfe7d946ed3.jpg,800,531,1.506591,[123.2754096 113.42888653 101.74648776],123.275410,113.428887,101.746488
99998,20468398e0c989f5.jpg,798,800,0.997500,[137.24472901 137.24472901 137.24472901],137.244729,137.244729,137.244729


In [9]:
def get_embedding(image_path, model, transform):
    """Run one image through ``model`` and return its flattened output.

    Args:
        image_path: Path of the image to open; it is converted to RGB.
        model: Callable applied to the preprocessed, batched tensor.
        transform: Callable turning the PIL image into a tensor.

    Returns:
        tuple: ``(file_name, values)`` where ``file_name`` is the base name of
        ``image_path`` and ``values`` is the model output flattened to a list.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    # Add a leading batch dimension of size one.
    batch = transform(rgb_image).unsqueeze(0)
    with torch.no_grad():
        output = model(batch)
        vector = output.numpy().ravel()
    file_name = os.path.basename(image_path)
    return file_name, vector.tolist()

In [10]:
def embedding_to_parquet(images, parquet_name):
    """Embed each image and save the results to ``data/<parquet_name>.parquet``.

    Uses the notebook-level ``resnet`` model and ``resnet_transform``, which
    must be defined before this function is called.

    Args:
        images: Iterable of image paths.
        parquet_name: File name (without extension) of the parquet to write.
    """
    rows = []
    for path in images:
        rows.append(get_embedding(path, resnet, resnet_transform))
    frame = pd.DataFrame(rows, columns=['image_path', 'embedding'])
    frame.to_parquet(f"data/{parquet_name}.parquet")

In [76]:
# Pretrained ResNet-50 in eval mode, plus the preprocessing bundled with its weights.
weights = ResNet50_Weights.DEFAULT
resnet_transform = weights.transforms()
resnet = resnet50(weights=weights).eval()
# NOTE(review): the model is used with its classification head still attached,
# so the stored "embedding" is presumably the head's output rather than pooled
# backbone features — confirm this is intended.

# Walk the image list in CHUNK_SIZE steps; each slice becomes one parquet file.
for idx, start in enumerate(range(0, total_images, CHUNK_SIZE)):
    end = min(start + CHUNK_SIZE, total_images)
    chunk_paths = train_images[start:end]
    print(f"Processing embeddings chunk {idx + 1}/{num_chunks}: images {start} to {end - 1}")
    embedding_to_parquet(chunk_paths, f'embeddings_train_{idx}')

Processing embeddings chunk 1/16: images 0 to 99999
Processing embeddings chunk 2/16: images 100000 to 199999
Processing embeddings chunk 3/16: images 200000 to 299999
Processing embeddings chunk 4/16: images 300000 to 399999
Processing embeddings chunk 5/16: images 400000 to 499999
Processing embeddings chunk 6/16: images 500000 to 599999
Processing embeddings chunk 7/16: images 600000 to 699999
Processing embeddings chunk 8/16: images 700000 to 799999
Processing embeddings chunk 9/16: images 800000 to 899999
Processing embeddings chunk 10/16: images 900000 to 999999
Processing embeddings chunk 11/16: images 1000000 to 1099999
Processing embeddings chunk 12/16: images 1100000 to 1199999
Processing embeddings chunk 13/16: images 1200000 to 1299999
Processing embeddings chunk 14/16: images 1300000 to 1399999
Processing embeddings chunk 15/16: images 1400000 to 1499999
Processing embeddings chunk 16/16: images 1500000 to 1580469


In [8]:
def get_embedding_parquets():
    """Return the ``embeddings_*.parquet`` files under ``data`` whose name does not contain "2d"."""
    matches = []
    for candidate in Path("data").rglob("embeddings_*.parquet"):
        if "2d" in candidate.name:
            continue
        matches.append(candidate)
    return matches

def get_embedding_parquets_2d():
    """Return the ``embeddings_*_2d.parquet`` files found recursively under ``data``."""
    data_root = Path("data")
    return [*data_root.rglob("embeddings_*_2d.parquet")]

def get_image_stats_parquets():
    """Return the per-chunk ``image_*.parquet`` files found recursively under ``data``.

    Files whose stem ends in ``_all`` are excluded. The notebook writes the
    concatenation of these files to ``data/image_stats_all.parquet``, which
    also matches ``image_*.parquet``; returning it here would feed the
    aggregate back into itself and duplicate every row when the concatenation
    is re-run.
    """
    return [
        p for p in Path("data").rglob("image_*.parquet")
        if not p.stem.endswith("_all")
    ]

def get_local_binary_parquets():
    """Return the ``local_binary_path_*.parquet`` files found recursively under ``data``."""
    data_root = Path("data")
    return [*data_root.rglob("local_binary_path_*.parquet")]


In [14]:
# Show which embedding parquet files are currently on disk.
get_embedding_parquets()

[PosixPath('data/embeddings_train_0.parquet'),
 PosixPath('data/embeddings_train_1.parquet'),
 PosixPath('data/embeddings_train_10.parquet'),
 PosixPath('data/embeddings_train_11.parquet'),
 PosixPath('data/embeddings_train_12.parquet'),
 PosixPath('data/embeddings_train_13.parquet'),
 PosixPath('data/embeddings_train_14.parquet'),
 PosixPath('data/embeddings_train_15.parquet'),
 PosixPath('data/embeddings_train_2.parquet'),
 PosixPath('data/embeddings_train_3.parquet'),
 PosixPath('data/embeddings_train_4.parquet'),
 PosixPath('data/embeddings_train_5.parquet'),
 PosixPath('data/embeddings_train_6.parquet'),
 PosixPath('data/embeddings_train_7.parquet'),
 PosixPath('data/embeddings_train_8.parquet'),
 PosixPath('data/embeddings_train_9.parquet')]

In [None]:

def embedding_to_2d_parquet(e_file):
    """Project the embeddings stored in ``e_file`` to 2-D with t-SNE and save them.

    The result is written to ``data/<name>_2d.parquet``, where ``<name>`` is
    the part of the file's base name before its first dot. The output has the
    columns ``image_path``, ``x`` and ``y``.

    NOTE(review): a fresh TSNE is fit on each file, so coordinates produced
    from different files are presumably not directly comparable — confirm
    before plotting them in one space.

    Args:
        e_file: Path of a parquet with ``image_path`` and ``embedding`` columns.
    """
    source_df = pd.read_parquet(e_file)

    # Stack the per-row vectors into one float32 matrix.
    vectors = [np.asarray(values) for values in source_df['embedding']]
    X = np.stack(vectors).astype(np.float32)

    projected = TSNE(n_components=2, random_state=42).fit_transform(X)

    result = pd.DataFrame({
        'image_path': source_df['image_path'],
        'x': projected[:, 0],
        'y': projected[:, 1],
    })
    name = os.path.basename(e_file).split('.')[0]
    result.to_parquet(f'data/{name}_2d.parquet')


# Project every embedding parquet to 2-D, one file at a time.
embedding_files = get_embedding_parquets()
for embedding_file in embedding_files:
    print(f"Processing embeddings file {os.path.basename(embedding_file)}")
    embedding_to_2d_parquet(embedding_file)

In [9]:
def extract_local_binary_patterns(image_path):
    """Compute a normalised local-binary-pattern histogram for one image.

    The image is converted to grayscale, LBP codes are computed with 24
    points at radius 3 using the 'uniform' method, and the codes are binned
    with edges 0, 1, ..., 26 (26 bins) as a density histogram.

    Args:
        image_path: Path of the image to open.

    Returns:
        tuple: ``(file_name, histogram)`` where ``file_name`` is the base name
        of ``image_path`` and ``histogram`` is a list of floats.
    """
    n_points, radius = 24, 3
    bins = 27  # number of bin edges; yields bins - 1 histogram bins

    gray_pixels = np.array(Image.open(image_path).convert('L'))  # grayscale
    codes = local_binary_pattern(gray_pixels, n_points, radius, method='uniform')

    bin_edges = np.arange(0, bins)
    histogram, _ = np.histogram(codes.ravel(), bins=bin_edges, density=True)

    file_name = os.path.basename(image_path)
    return file_name, histogram.tolist()

In [10]:
def local_binary_pattern_to_parquet(chunk, parquet_name):
    """Compute LBP histograms for ``chunk`` in worker processes and save them.

    The results are written to ``data/<parquet_name>.parquet`` with the
    columns ``image_path`` and ``local_binary_pattern``.

    Args:
        chunk: Iterable of image paths.
        parquet_name: File name (without extension) of the parquet to write.
    """
    with ProcessPoolExecutor() as pool:
        # map() yields results in the same order as `chunk`.
        rows = [row for row in pool.map(extract_local_binary_patterns, chunk)]
    frame = pd.DataFrame(rows, columns=['image_path', 'local_binary_pattern'])
    frame.to_parquet(f"data/{parquet_name}.parquet")


In [11]:
# Smoke-check the extractor on the first image before running it on everything.
extract_local_binary_patterns(train_images[0])

('0000059611c7d079.jpg',
 [0.039375,
  0.022866666666666667,
  0.019114583333333334,
  0.014377083333333334,
  0.011895833333333333,
  0.010091666666666667,
  0.0096,
  0.012060416666666667,
  0.013683333333333334,
  0.01654375,
  0.020560416666666668,
  0.03183958333333333,
  0.04687291666666667,
  0.0506625,
  0.020329166666666666,
  0.03127083333333333,
  0.015739583333333335,
  0.02439375,
  0.01446875,
  0.015054166666666667,
  0.01511875,
  0.018952083333333335,
  0.02148125,
  0.023475,
  0.10822916666666667,
  0.37194375])

In [None]:
def _process_lbp_chunk(args):
    """Log and process a single LBP chunk described by a task tuple.

    Args:
        args: ``(idx, start, end, chunk, parquet_name)`` — the chunk index,
            the slice bounds used for the log line, the image paths, and the
            parquet file name (without extension).

    Returns:
        The ``parquet_name`` that was written.
    """
    (idx, start, end, chunk, parquet_name) = args
    print(f"Processing chunk {idx + 1}/{num_chunks}: images {start} to {end - 1} -> {parquet_name}.parquet")
    local_binary_pattern_to_parquet(chunk, parquet_name)
    return parquet_name

# Build one task per chunk. Chunks whose parquet already exists are skipped, so
# an interrupted run can be resumed by re-executing this cell (delete a file to
# force that chunk to be recomputed).
tasks = []
for idx in range(num_chunks):
    start = idx * CHUNK_SIZE
    end = min(start + CHUNK_SIZE, total_images)
    chunk = train_images[start:end]
    parquet_name = f"local_binary_path_{idx}"
    if Path(f"data/{parquet_name}.parquet").exists():
        print(f"Skipping chunk {idx + 1}/{num_chunks}: {parquet_name}.parquet already exists")
        continue
    tasks.append((idx, start, end, chunk, parquet_name))

# Run the chunks one at a time. local_binary_pattern_to_parquet already opens
# its own ProcessPoolExecutor per chunk, so driving several chunks from a thread
# pool started multiple process pools concurrently (oversubscribing the CPU and
# creating worker processes from a multi-threaded parent). That is the likely
# cause of the chunks that previously got stuck.
for t in tasks:
    _process_lbp_chunk(t)


In [5]:
# Concatenate the parquets returned by get_image_stats_parquets() into one
# table, save it, and display it.
image_stats_all = pd.concat((pd.read_parquet(p) for p in get_image_stats_parquets()), ignore_index=True)
image_stats_all.to_parquet("data/image_stats_all.parquet")
image_stats_all

Unnamed: 0,image_path,width,height,aspect_ratio,mean_rgb,mean_r,mean_g,mean_b
0,0000059611c7d079.jpg,600,800,0.750000,[112.01350833 113.36333542 118.7390375 ],112.013508,113.363335,118.739037
1,000014b1f770f640.jpg,800,600,1.333333,[124.51520625 143.97102917 133.13739167],124.515206,143.971029,133.137392
2,000015f76534add3.jpg,533,800,0.666250,[152.57681285 154.80336773 142.70662054],152.576813,154.803368,142.706621
3,00001ae42cd00356.jpg,800,600,1.333333,[127.971075 133.14455417 119.51298542],127.971075,133.144554,119.512985
4,00001b2ba2c69ac5.jpg,618,800,0.772500,[133.7146157 133.42644822 128.71319377],133.714616,133.426448,128.713194
...,...,...,...,...,...,...,...,...
1580465,a1fe146c857ff1ea.jpg,800,600,1.333333,[111.412025 115.14696458 115.33845625],111.412025,115.146965,115.338456
1580466,a1fe3596deae1af3.jpg,800,600,1.333333,[139.20082917 130.98304583 117.84341042],139.200829,130.983046,117.843410
1580467,a1fe5fa2df16b806.jpg,800,533,1.500938,[135.36139306 140.05324812 135.07628752],135.361393,140.053248,135.076288
1580468,a1fe64583dc6e604.jpg,800,533,1.500938,[131.79981707 134.62416745 136.21815432],131.799817,134.624167,136.218154


In [8]:
# Concatenate the parquets returned by get_embedding_parquets_2d() into one
# table, save it, and display it.
embeddings_2d_all = pd.concat((pd.read_parquet(p) for p in get_embedding_parquets_2d()), ignore_index=True)
embeddings_2d_all.to_parquet("data/embeddings_train_2d_all.parquet")
embeddings_2d_all

Unnamed: 0,image_path,x,y
0,0000059611c7d079.jpg,22.981199,-2.083232
1,000014b1f770f640.jpg,18.305468,7.833691
2,000015f76534add3.jpg,-63.447765,-31.691349
3,00001ae42cd00356.jpg,-79.348366,28.682961
4,00001b2ba2c69ac5.jpg,68.682686,32.287830
...,...,...,...
1580465,a1fe146c857ff1ea.jpg,-13.779037,0.381376
1580466,a1fe3596deae1af3.jpg,33.778980,51.062721
1580467,a1fe5fa2df16b806.jpg,-41.233215,10.429104
1580468,a1fe64583dc6e604.jpg,76.945580,-23.476965


In [16]:
# Concatenate the parquets returned by get_local_binary_parquets() into one
# table, save it, and display it.
local_binary_all = pd.concat((pd.read_parquet(p) for p in get_local_binary_parquets()), ignore_index=True)
local_binary_all.to_parquet("data/local_binary_all.parquet")
local_binary_all

Unnamed: 0,image_path,local_binary_pattern
0,0000059611c7d079.jpg,"[0.039375, 0.022866666666666667, 0.01911458333..."
1,000014b1f770f640.jpg,"[0.049104166666666664, 0.033716666666666666, 0..."
2,000015f76534add3.jpg,"[0.04549484052532833, 0.02898452157598499, 0.0..."
3,00001ae42cd00356.jpg,"[0.060414583333333334, 0.025908333333333332, 0..."
4,00001b2ba2c69ac5.jpg,"[0.03507281553398058, 0.025335760517799354, 0...."
...,...,...
1580465,a1fe146c857ff1ea.jpg,"[0.0484125, 0.029875, 0.0194625, 0.0122625, 0...."
1580466,a1fe3596deae1af3.jpg,"[0.04268541666666667, 0.021360416666666666, 0...."
1580467,a1fe5fa2df16b806.jpg,"[0.04695590994371482, 0.025154784240150094, 0...."
1580468,a1fe64583dc6e604.jpg,"[0.034676360225140715, 0.016892589118198874, 0..."


In [17]:
# Add an image_path column ("<id>.jpg") so train_df shares a key with the
# feature tables, which store the image file's base name in image_path.
train_df['image_path'] = train_df['id'].astype(str) + '.jpg'
train_df

Unnamed: 0,id,landmark_id,image_path
0,17660ef415d37059,1,17660ef415d37059.jpg
1,92b6290d571448f6,1,92b6290d571448f6.jpg
2,cd41bf948edc0340,1,cd41bf948edc0340.jpg
3,fb09f1e98c6d2f70,1,fb09f1e98c6d2f70.jpg
4,25c9dfc7ea69838d,7,25c9dfc7ea69838d.jpg
...,...,...,...
1580465,72c3b1c367e3d559,203092,72c3b1c367e3d559.jpg
1580466,7a6a2d9ea92684a6,203092,7a6a2d9ea92684a6.jpg
1580467,9401fad4c497e1f9,203092,9401fad4c497e1f9.jpg
1580468,aacc960c9a228b5f,203092,aacc960c9a228b5f.jpg


In [19]:
# Reload the three aggregated feature tables from disk and join them with
# train_df on image_path (train_df must already carry that column).
image_stats_all = pd.read_parquet('data/image_stats_all.parquet')
embeddings_2d_all = pd.read_parquet('data/embeddings_train_2d_all.parquet')
local_binary_all = pd.read_parquet('data/local_binary_all.parquet')

# Keep a single row per image_path in every table before merging.
dfs = [
    frame.drop_duplicates(subset=['image_path'])
    for frame in (train_df, image_stats_all, embeddings_2d_all, local_binary_all)
]

# Inner joins: only images present in all four tables survive.
joined_df, *feature_parts = dfs
for df_part in feature_parts:
    joined_df = joined_df.merge(df_part, on='image_path', how='inner')

joined_df.to_parquet("data/joined_features_all.parquet")
joined_df


Unnamed: 0,id,landmark_id,image_path,width,height,aspect_ratio,mean_rgb,mean_r,mean_g,mean_b,x,y,local_binary_pattern
0,17660ef415d37059,1,17660ef415d37059.jpg,533,800,0.666250,[126.00759381 119.0244606 113.26428471],126.007594,119.024461,113.264285,6.526572,-59.742741,"[0.05229596622889306, 0.027049718574108816, 0...."
1,92b6290d571448f6,1,92b6290d571448f6.jpg,534,800,0.667500,[97.5096559 93.19100421 86.4288764 ],97.509656,93.191004,86.428876,14.416226,-57.622620,"[0.06176498127340824, 0.027186329588014983, 0...."
2,cd41bf948edc0340,1,cd41bf948edc0340.jpg,800,512,1.562500,[89.43367188 83.05516602 74.40758057],89.433672,83.055166,74.407581,8.051970,-53.929276,"[0.05924560546875, 0.03013427734375, 0.0246411..."
3,fb09f1e98c6d2f70,1,fb09f1e98c6d2f70.jpg,532,800,0.665000,[107.91263863 106.76824483 109.26745771],107.912639,106.768245,109.267458,4.627127,-54.376423,"[0.050286654135338345, 0.02482142857142857, 0...."
4,25c9dfc7ea69838d,7,25c9dfc7ea69838d.jpg,800,600,1.333333,[132.4216875 137.05765 144.99947083],132.421687,137.057650,144.999471,-8.277364,-25.763720,"[0.036810416666666665, 0.022747916666666666, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1580465,72c3b1c367e3d559,203092,72c3b1c367e3d559.jpg,800,533,1.500938,[128.88117026 127.38813086 116.54751407],128.881170,127.388131,116.547514,7.850999,29.970839,"[0.049305816135084425, 0.03048780487804878, 0...."
1580466,7a6a2d9ea92684a6,203092,7a6a2d9ea92684a6.jpg,800,532,1.503759,[127.55681156 124.83408835 114.69496711],127.556812,124.834088,114.694967,-0.866286,31.105888,"[0.04551691729323308, 0.025406484962406016, 0...."
1580467,9401fad4c497e1f9,203092,9401fad4c497e1f9.jpg,800,533,1.500938,[129.93018058 130.93818246 127.94207083],129.930181,130.938182,127.942071,-10.209307,-1.973596,"[0.039880393996247654, 0.023184803001876173, 0..."
1580468,aacc960c9a228b5f,203092,aacc960c9a228b5f.jpg,800,533,1.500938,[142.47123358 142.97529784 136.52950516],142.471234,142.975298,136.529505,70.599495,50.996376,"[0.03226547842401501, 0.020316604127579737, 0...."
