In [1]:
import glob
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import itertools
import time
import pandas as pd
import os
from pathlib import Path
from datetime import datetime
from tqdm.auto import tqdm
import wandb

## EDA

In [32]:
# Collect every JPEG under the image folders. The result is sorted because glob returns
# files in arbitrary order, and a stable order keeps re-runs of the notebook reproducible.
paths = sorted(glob.glob("../dataset/images*/*.jpg"))
# Bare file names (no directory part); os.path.basename uses the platform's separator.
names = [os.path.basename(path) for path in paths]

In [33]:
def get_dimensions(image_path: str):
    """Return the ``(width, height)`` of the image stored at ``image_path``.

    A file that cannot be opened as an image yields the sentinel ``(1, 1)`` so that a
    bulk scan keeps going; callers that compute ``width / height`` therefore see an
    aspect ratio of 1.0 for unreadable files.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        The ``(width, height)`` tuple read from the opened image's ``size`` attribute,
        or ``(1, 1)`` when opening the file raised an exception.
    """
    try:
        # The context manager closes the underlying file as soon as the size is read
        # instead of leaving that to garbage collection.
        with Image.open(image_path) as im:
            width, height = im.size
    except Exception:
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt / SystemExit
        # still stop a long-running scan.
        return (1, 1)
    return (width, height)

In [30]:
# (width, height) for every collected image path, in the same order as `paths`.
dimensions = [get_dimensions(image_path) for image_path in paths]
# Width-to-height ratio of each image.
aspect_ratio = [width / height for width, height in dimensions]

In [None]:
# One row per image file: its name, its (width, height) tuple and its aspect ratio.
dims = pd.DataFrame(
    dict(
        name=names,
        dimensions=dimensions,
        aspect_ratio=aspect_ratio,
    )
)

In [None]:
# Share of images for each exact (width, height) pair.
dimension_share = dims["dimensions"].value_counts(normalize=True)
dimension_share

dimensions
(800, 600)     0.421238
(600, 800)     0.261531
(525, 700)     0.066371
(933, 700)     0.058657
(1280, 960)    0.013475
                 ...   
(952, 625)     0.000005
(519, 768)     0.000005
(1200, 810)    0.000005
(828, 317)     0.000005
(1400, 908)    0.000005
Name: proportion, Length: 4740, dtype: float64

In [None]:
# Share of images for each distinct width / height ratio.
aspect_ratio_share = dims["aspect_ratio"].value_counts(normalize=True)
aspect_ratio_share

aspect_ratio
1.333333    0.443474
0.750000    0.350599
1.332857    0.058662
1.500586    0.011182
1.000000    0.010357
              ...   
1.512915    0.000005
0.919558    0.000005
1.835000    0.000005
1.408083    0.000005
1.541850    0.000005
Name: proportion, Length: 4092, dtype: float64

## Removing broken images

In [2]:
# Root folders, relative to the notebook's working directory.
IMAGE_PATH = Path("../dataset")
DATA_PATH = Path("../data")
COMP_DATA_PATH = Path("../data")

# CSV inputs: the test set plus the train / validation splits.
VAL_SPLIT = DATA_PATH.joinpath("val_split_20perval_grouped_stratified.csv")
TRAIN_SPLIT = DATA_PATH.joinpath("train_split_20perval_grouped_stratified.csv")
TEST_SET = COMP_DATA_PATH.joinpath("test-data.csv")

# Glob pattern, relative to IMAGE_PATH, that matches the image files.
IMG_GLOB = "images*/*.jpg"
def bind_fs(df, path: Path, glob: str):
    """Replace every file name in ``df`` with the matching file found under ``path``.

    Args:
        df: DataFrame whose cells hold bare file names.
        path: Directory that is searched.
        glob: Pattern handed to ``path.glob`` to enumerate candidate files.

    Returns:
        A DataFrame with the same shape and labels as ``df`` in which each cell is the
        ``Path`` of the file carrying that name, or ``None`` when no file matched.
        When several matched files share a name, the one yielded last by ``path.glob``
        is used.
    """
    mapping = {x.name: x for x in path.glob(glob)}
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is deprecated
    # since then; fall back to the old name only on older pandas versions.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(lambda x: mapping.get(x))
# Validation and training splits: resolve the file names stored in the image_url* columns
# to files on disk.
val_df = pd.read_csv(VAL_SPLIT)
val_df[["image_path1", "image_path2"]] = bind_fs(val_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

train_df = pd.read_csv(TRAIN_SPLIT)
train_df[["image_path1", "image_path2"]] = bind_fs(train_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

# Test set: keep only the part after the last "/" of each URL before resolving it.
# The vectorised .str accessor replaces the deprecated DataFrame.applymap and leaves
# missing values as NaN instead of raising.
test_df = pd.read_csv(TEST_SET)
url_columns = ["image_url1", "image_url2"]
test_df[url_columns] = test_df[url_columns].apply(lambda col: col.str.rsplit("/", n=1).str[-1])
test_df[["image_path1", "image_path2"]] = bind_fs(test_df.filter(like="image_url"), IMAGE_PATH, IMG_GLOB)

In [None]:
def filter_broken_imgs(row):
    """Tell whether both images referenced by ``row`` can be opened.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``"image_path1"`` and
            ``"image_path2"``.

    Returns:
        ``True`` when ``Image.open`` succeeds for both paths, ``False`` when opening
        either of them raises an exception.
    """
    try:
        # Context managers close both files immediately, including the first one when
        # opening the second fails.
        with Image.open(row["image_path1"]), Image.open(row["image_path2"]):
            pass
    except Exception:
        # Deliberately broad: any failure to open a path marks the row as broken.
        # Unlike a bare ``except``, KeyboardInterrupt / SystemExit still propagate.
        return False
    return True

In [None]:
# Row-wise boolean masks: True where filter_broken_imgs accepted the pair.
train_openable = train_df.apply(filter_broken_imgs, axis=1)
val_openable = val_df.apply(filter_broken_imgs, axis=1)

# Keep only the accepted rows of each split.
train_df_filtered = train_df[train_openable]
val_df_filtered = val_df[val_openable]

In [None]:
# Row counts before / after filtering, train first and then validation.
tuple(len(frame) for frame in (train_df, train_df_filtered, val_df, val_df_filtered))

(72487, 71385, 18151, 17872)

In [None]:
# Persist the filtered splits. The targets are built from DATA_PATH so they stay in sync
# with the DATA_PATH-based locations the filtered splits are read back from later on.
train_df_filtered.to_csv(DATA_PATH / "train_split_filtered.csv", index=False)
val_df_filtered.to_csv(DATA_PATH / "val_split_filtered.csv", index=False)

## Dataloader

In [3]:
# Execute the helper script inside this kernel.
# NOTE(review): SiameseNetworkDataset, SiameseNetwork and ContrastiveLoss are used below
# without being defined or imported in this notebook, so presumably they come from this
# script — confirm.
%run ./12-siamese-network/code.py

In [4]:
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import ToTensor
import torch
import torch.nn.functional as F
import torch.nn as nn

In [5]:
# Number of passes over the training set.
EPOCHS = 5
# Number of image pairs per batch.
BATCH_SIZE = 4
# Size passed to transforms.Resize for every image.
IMG_SIZE = (512, 512)

In [6]:
# Locations of the filtered train / validation CSV files inside DATA_PATH.
VAL_SPLIT_FILTERED_PATH = DATA_PATH.joinpath("val_split_filtered.csv")
TRAIN_SPLIT_FILTERED_PATH = DATA_PATH.joinpath("train_split_filtered.csv")

In [7]:
# Load the filtered train and validation splits, in that order.
train_split_filtered, val_split_filtered = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_SPLIT_FILTERED_PATH, VAL_SPLIT_FILTERED_PATH)
)

In [8]:
def _resize_to_tensor():
    """Build a fresh pipeline that resizes an image to IMG_SIZE and converts it to a tensor."""
    steps = [transforms.Resize(IMG_SIZE), transforms.ToTensor()]
    return transforms.Compose(steps)


# Train and validation use the same steps but each gets its own Compose instance.
train_transforms = _resize_to_tensor()
val_transforms = _resize_to_tensor()

In [9]:
# Wrap each filtered split in a SiameseNetworkDataset together with its transform pipeline.
train_dataset, val_dataset = (
    SiameseNetworkDataset(split_frame, transform=pipeline)
    for split_frame, pipeline in (
        (train_split_filtered, train_transforms),
        (val_split_filtered, val_transforms),
    )
)

In [10]:
# train_dataset = SiameseNetworkDataset(train_split_filtered.iloc[:64], transform=train_transforms)
# val_dataset = SiameseNetworkDataset(val_split_filtered.iloc[:64], transform=val_transforms)

In [11]:
# Sanity check: shape of the first element of training sample #5.
sample_first_item = train_dataset[5][0]
sample_first_item.shape

torch.Size([3, 512, 512])

In [12]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation is not shuffled: a fixed order keeps evaluation (and therefore the choice of
# the best checkpoint) reproducible, and shuffling brings no benefit when only measuring.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
# for batch in train_dataloader:
#     print(batch[0].shape, batch[1].shape, batch[2].shape)
#     break

## Network

In [14]:
# Instance used for the architecture printout and the single-batch shape check below;
# `net` is created again (and moved to `device`) in the Train section.
net = SiameseNetwork()

In [15]:
# Display the module's repr (its layer structure).
net

SiameseNetwork(
  (cnn): Sequential(
    (0): Conv2d(3, 16, kernel_size=(11, 11), stride=(1, 1), padding=same)
    (1): ReLU()
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=same)
    (5): ReLU()
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (9): ReLU()
    (10): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (13): ReLU()
    (14): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_sta

In [16]:
# Smoke test: push exactly one batch through the network and show the tensor shapes.
for batch in itertools.islice(train_dataloader, 1):
    img1, img2, label = batch
    print(img1.shape, img2.shape, label.shape)
    output = net(img1, img2)
    first_shape, second_shape = output[0].shape, output[1].shape
    print(first_shape, second_shape)

torch.Size([4, 3, 512, 512]) torch.Size([4, 3, 512, 512]) torch.Size([4])
torch.Size([4, 1]) torch.Size([4, 1])


## Train

In [17]:
# Start a Weights & Biases run under the "csc_hackathon_lun" project.
# NOTE(review): no wandb.log call is visible in this notebook, so nothing appears to be
# reported to the run — confirm whether metric logging is intended.
wandb.init(project="csc_hackathon_lun")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikita-fordui[0m ([33mcsc_hackathon_lun[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [19]:
# Fresh model on the selected device, with its loss and an Adam optimizer (lr = 0.0005).
net = SiameseNetwork()
net = net.to(device)
loss_fn = ContrastiveLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=5e-4)

In [20]:
def train_one_epoch(epoch_index):
    """Run one optimisation pass over ``train_dataloader`` and return the mean batch loss.

    Relies on the notebook-level ``net``, ``loss_fn``, ``optimizer``, ``device`` and
    ``train_dataloader``; the caller is expected to have put ``net`` in training mode.

    Args:
        epoch_index: Zero-based index of the epoch; only used to label the progress bar.

    Returns:
        The sum of the per-batch loss values divided by the number of batches, or ``0.``
        when the loader yields no batch.
    """
    n_batches = len(train_dataloader)
    running_loss = 0.

    progress = tqdm(train_dataloader, total=n_batches, desc='epoch {}'.format(epoch_index + 1))
    for input1, input2, labels in progress:
        # Every batch is a pair of inputs plus its labels; move all of them to the device.
        input1 = input1.to(device)
        input2 = input2.to(device)
        labels = labels.to(device)

        # Gradients accumulate by default, so reset them for every batch.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, weight update.
        output1, output2 = net(input1, input2)
        loss = loss_fn(output1, output2, labels)
        loss.backward()
        optimizer.step()

        # .item() keeps a plain float instead of a graph-attached tensor.
        running_loss += loss.item()

    # Average over ALL batches (the sum covers n_batches of them, not n_batches - 1).
    return running_loss / n_batches if n_batches else 0.

In [21]:
# Train for EPOCHS epochs; after each one measure the validation loss and keep a checkpoint
# whenever it improves on the best value seen so far.
best_vloss = 1_000_000.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
n_val_batches = len(val_dataloader)

for epoch_number in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    net.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    net.eval()
    running_vloss = 0.0

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vinput1, vinput2, vlabels in val_dataloader:
            vinput1 = vinput1.to(device)
            vinput2 = vinput2.to(device)
            vlabels = vlabels.to(device)
            voutput1, voutput2 = net(vinput1, vinput2)
            # .item() accumulates a plain float, so avg_vloss / best_vloss are numbers
            # rather than device tensors.
            running_vloss += loss_fn(voutput1, voutput2, vlabels).item()

    # NaN when there is nothing to validate on: it never compares as an improvement.
    avg_vloss = running_vloss / n_val_batches if n_val_batches else float('nan')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(net.state_dict(), model_path)

EPOCH 1:


  0%|          | 0/17847 [00:00<?, ?it/s]

  return F.conv2d(input, weight, bias, self.stride,


KeyboardInterrupt: 