# Training Dataset Creation

In [1]:
import os
import sys
import pathlib

from torch.utils.data import DataLoader
from torchvision.io import read_image
from torchvision.transforms import Resize
import matplotlib.pyplot as plt
import pandas as pd
import torch

sys.path.insert(0, '..')

## Walk through Train Directory

In [2]:
paths = []

# Recursively collect every file under the training directory whose suffix is exactly '.jpg'.
for folder, _subfolders, names in os.walk('../data/Training Data'):
    for name in names:
        if pathlib.Path(name).suffix != '.jpg':
            continue
        # normcase lowercases the path and uses backslashes on Windows; it is a no-op elsewhere.
        jpg_path = os.path.normcase(os.path.join(folder, name))
        paths.append(jpg_path)
        print(jpg_path)

..\data\training data\backyard\seattle_3269390_1.jpg
..\data\training data\backyard\seattle_3269390_2.jpg
..\data\training data\backyard\seattle_3269390_3.jpg
..\data\training data\backyard\seattle_6362362_1.jpg
..\data\training data\backyard\seattle_6362362_2.jpg
..\data\training data\backyard\seattle_6424982_1.jpg
..\data\training data\backyard\seattle_6424982_2.jpg
..\data\training data\basement\seattle_9550869_1.jpg
..\data\training data\basement\seattle_9550869_2.jpg
..\data\training data\bathroom\boston_15543904_1.jpg
..\data\training data\bathroom\boston_15543904_2.jpg
..\data\training data\bathroom\boston_15843621_2.jpg
..\data\training data\bathroom\boston_1584362_1.jpg
..\data\training data\bathroom\boston_16559025_1.jpg
..\data\training data\bathroom\boston_16559025_2.jpg
..\data\training data\bathroom\boston_1696090_1.jpg
..\data\training data\bathroom\boston_1696090_2.jpg
..\data\training data\bathroom\boston_1783990_1.jpg
..\data\training data\bathroom\boston_1783990_2.jp

In [3]:
# Raw string: in a normal literal '\t' would be read as a TAB and '\d', '\k', '\s' are
# invalid escape sequences, so the Windows-style path must not be escape-processed.
os.path.basename(r'..\data\training data\kitchen\seattle_5680462_4.jpg')

'seattle_5680462_4.jpg'

## Functions that Determine Label

In [4]:
def is_duplicate(path_1, path_2):
    """Tell whether two image paths refer to the same city and room id.

    Only the file names are inspected; they are expected to look like
    ``{city}_{roomid}_{match number}.jpg``. The directory part of each path
    is ignored, and a path compared with itself counts as a duplicate.

    Args:
        path_1: Path of the first image.
        path_2: Path of the second image.

    Returns:
        True when both the city and the room id match, False otherwise.

    Raises:
        IndexError: If the cities match but a file name contains no
            underscore, so no room id field exists.
    """
    name_a, name_b = (os.path.basename(p) for p in (path_1, path_2))
    # {city}_{roomid}_{match number}.jpg
    fields_a = name_a.split('_')
    fields_b = name_b.split('_')
    if fields_a[0] != fields_b[0]:
        # Different city: the room id is never looked at.
        return False
    return fields_a[1] == fields_b[1]

def get_class(path_1, path_2):
    """Return the integer label of an image pair: 1 for duplicates, 0 otherwise."""
    return 1 if is_duplicate(path_1, path_2) else 0

In [5]:
# Raw strings keep the backslashes literal ('\t' would otherwise become a TAB and
# '\d', '\k', '\s' are invalid escape sequences).
# Same city and room id, different match number -> duplicate.
assert is_duplicate(r'..\data\training data\kitchen\seattle_5680462_1.jpg', r'..\data\training data\kitchen\seattle_5680462_2.jpg')
# Same city but a different room id -> not a duplicate.
assert not is_duplicate(r'..\data\training data\kitchen\seattle_5680462_1.jpg', r'..\data\training data\kitchen\seattle_6362362_1.jpg')

## Training Dataset Creation (Pandas)

In [6]:
def create_training_dataset(paths):
    """Build the pairwise training table for a collection of image paths.

    Every ordered pair ``(path1, path2)`` drawn from ``paths`` becomes one
    row, including each path paired with itself and both orderings of two
    different paths.

    Args:
        paths: Iterable of image paths.

    Returns:
        A ``pandas.DataFrame`` with the columns ``image1``, ``image2`` and
        ``class``, where ``class`` is the value of ``get_class`` for the pair.
    """
    # Outer element varies slowest, exactly like two nested for-loops.
    pairs = [(first, second) for first in paths for second in paths]
    return pd.DataFrame({
        'image1': [first for first, _ in pairs],
        'image2': [second for _, second in pairs],
        'class': [get_class(first, second) for first, second in pairs]
    })

In [7]:
# Build the full pairwise table from the collected paths; it holds len(paths) ** 2 rows.
train_df = create_training_dataset(paths)
# Display (rows, columns) as the cell output.
train_df.shape

(398161, 3)

In [8]:
# Fixed random_state so the displayed sample is identical after Restart & Run All.
train_df.sample(20, random_state=0)

Unnamed: 0,image1,image2,class
315212,..\data\training data\living-room\boston_28542...,..\data\training data\kitchen\boston_17318669_...,0
337939,..\data\training data\living-room\boston_53913...,..\data\training data\kitchen\boston_20309505_...,0
59482,..\data\training data\bedroom\boston_1783990_1...,..\data\training data\bedroom\boston_4938870_2...,0
372615,..\data\training data\living-room\seattle_2788...,..\data\training data\kitchen\boston_1291216_1...,0
186603,..\data\training data\house-exterior\boston_40...,..\data\training data\living-room\boston_16288...,0
294187,..\data\training data\living-room\boston_18670...,..\data\training data\bedroom\boston_319826579...,0
330386,..\data\training data\living-room\boston_45582...,..\data\training data\kitchen\boston_4090224_1...,0
144305,..\data\training data\bedroom\seattle_6362362_...,..\data\training data\kitchen\seattle_9550869_...,0
234000,..\data\training data\kitchen\boston_28817183_...,..\data\training data\living-room\boston_48508...,0
32929,..\data\training data\bedroom\boston_12179994_...,..\data\training data\bedroom\boston_22327141_...,0


In [9]:
# Count how many pairs fall into each class (0 = not a duplicate, 1 = duplicate).
train_df['class'].value_counts()

0    394396
1      3765
Name: class, dtype: int64

## Import `DuplicateImageDataset` Class and Compare

- The code above has been ported to the `DuplicateImageDataset` class, which inherits from `torch.utils.data.Dataset`.

In [10]:
from scripts.dataset_duplicate_image import DuplicateImageDataset

In [11]:
# Instantiate the project's Dataset over the same training directory, without transforms.
dataset = DuplicateImageDataset('../data/Training Data')

In [12]:
# The Dataset must expose exactly as many pairs as the pandas reference table.
assert len(dataset) == len(train_df)

In [13]:
# Spot-check one arbitrarily chosen index: element 1 of the dataset item must be equal,
# value for value, to the image read from column position 1 ('image2') of the same row.
# NOTE(review): element 1 is presumably the second image of the pair - confirm against
# DuplicateImageDataset.__getitem__.
assert torch.all(torch.eq(dataset[133445][1], read_image(train_df.iloc[133445, 1])))

## Wrap `DuplicateImageDataset` in DataLoader

In [14]:
# Resize every image to 960 x 1280 (height x width).
# NOTE(review): presumably needed so images of differing sizes can be stacked into one batch - confirm.
train_dataset = DuplicateImageDataset('../data/Training Data', transforms=[Resize((960, 1280), antialias=True)])
# Shuffled batches of 64 pairs.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [15]:
# Pull a single batch and print the tensor sizes as a sanity check of the DataLoader output.
train_img_1s, train_img_2s, train_labels = next(iter(train_dataloader))
print('Image 1s shape:', train_img_1s.size())
print('Image 2s shape:', train_img_2s.size())
print('Labels shape', train_labels.size())

Image 1s shape: torch.Size([64, 3, 960, 1280])
Image 2s shape: torch.Size([64, 3, 960, 1280])
Labels shape torch.Size([64])
