# Dataset Creation

In [1]:
import os
from typing import Optional

import pandas as pd
import webdataset as wds
from PIL import Image

## Setup

In [2]:
# Base directory from which every other path in this notebook is derived.
root = '.'

# CSV file loaded with pd.read_csv in the "Load Data" section.
label_file = f'{root}/proj_labels.csv'
# Directory handed to prepare_img() as the location of the image files.
image_dir = f'{root}/map-proj-v3_2'
# Directory used in the shard file name patterns of the ShardWriters.
data_folder = f'{root}/data'

## Load Data

In [18]:
# Read the label table; later cells use its 'image', 'label' and 'type' columns.
df = pd.read_csv(label_file)

For some odd reason, the entries for some of the labels are duplicated in the label file, so we're dropping the duplicates.

In [19]:
# Drop rows that are exact duplicates (every column is compared) and renumber the index from 0.
# NOTE(review): the preview below shows an 'Unnamed: 0' column header. If that is a
# per-row id read from the CSV (rather than just the rendered index), no two rows
# compare equal and nothing is dropped -- confirm that len(df) actually shrinks here.
df = df.drop_duplicates().reset_index(drop=True)

In [20]:
# Preview the first rows to sanity-check the table after de-duplication.
df.head()

Unnamed: 0,image,label,type
0,ESP_013049_0950_RED-0067.jpg,7,train
1,ESP_013049_0950_RED-0067-fv.jpg,7,train
2,ESP_013049_0950_RED-0067-brt.jpg,7,train
3,ESP_013049_0950_RED-0067-r90.jpg,7,train
4,ESP_013049_0950_RED-0067-r180.jpg,7,train


In [21]:
# List the distinct split names; get_writer() routes samples based on these values.
df['type'].unique()

array(['train', 'val', 'test'], dtype=object)

## Write Dataset

In [22]:
def get_baseimg(img: str) -> str:
    """Return the file name of the base image that ``img`` belongs to.

    The extension is removed, the remaining stem is split on ``-`` and the
    first two pieces are joined back together with a ``.jpg`` extension, e.g.
    ``ESP_013049_0950_RED-0067-fv.jpg`` -> ``ESP_013049_0950_RED-0067.jpg``.

    Args:
        img: Image file name containing at least one ``-`` in its stem.

    Returns:
        ``'<first piece>-<second piece>.jpg'``.

    Raises:
        IndexError: If the stem contains no ``-``.
    """
    stem = os.path.splitext(img)[0]
    pieces = stem.split('-')
    prefix, tile = pieces[0], pieces[1]
    return f'{prefix}-{tile}.jpg'

def get_transform(img: str) -> Optional[str]:
    """Return the transform tag encoded in an image file name, if any.

    The extension is removed and the text after the last ``-`` in the stem is
    inspected. A purely numeric suffix (e.g. ``0067``) means the file is an
    untransformed image; anything else (e.g. ``fv``, ``brt``, ``r90``) is
    returned as the transform tag.

    Args:
        img: Image file name, e.g. ``ESP_013049_0950_RED-0067-fv.jpg``.

    Returns:
        The transform tag, or ``None`` when the last ``-``-separated piece of
        the stem consists only of digits. A stem without any ``-`` is treated
        as a single piece and judged by the same rule.
    """
    stem = os.path.splitext(img)[0]
    suffix = stem.split('-')[-1]
    return None if suffix.isdigit() else suffix

In [23]:
def prepare_img(img_dir: str, img_file: str, label: int) -> dict:
    """Build the sample dict for one image, ready to be passed to a shard writer.

    Args:
        img_dir: Directory that contains ``img_file``.
        img_file: File name of the image, e.g. ``ESP_013049_0950_RED-0067-fv.jpg``.
        label: Class label stored in the sample metadata.

    Returns:
        A dict with three entries:
        ``__key__`` -- ``img_file`` without its extension;
        ``jpg`` -- the image converted to mode ``'L'``;
        ``json`` -- metadata with the transform tag (``ttype``), the base image
        name (``base``), the ``label`` and the original file name (``image``).
    """
    # Open the file in a context manager so its handle is released
    # deterministically. convert() returns a converted copy of the image, so
    # the result stays usable after the source file has been closed.
    with Image.open(os.path.join(img_dir, img_file)) as source:
        grayscale = source.convert('L')

    return {
        '__key__': os.path.splitext(img_file)[0],
        'jpg': grayscale,
        'json': {
            'ttype': get_transform(img_file),
            'base': get_baseimg(img_file),
            'label': label,
            'image': img_file
        }
    }

In [24]:
# Each ShardWriter starts writing its first shard as soon as it is constructed
# (see the "# writing ..." lines printed by this cell), so the output directory
# must exist beforehand. Create it so a fresh run does not fail.
os.makedirs(data_folder, exist_ok=True)

# One writer per dataset split; a new shard is started after 15,000 samples.
# NOTE(review): compress=True is requested while the shard names end in '.tar' --
# confirm whether the shards are gzip-compressed and should be named '.tar.gz'.
train_writer = wds.ShardWriter(f'file:{data_folder}/train-%06d.tar', maxcount=15_000, compress=True)
val_writer = wds.ShardWriter(f'file:{data_folder}/val-%06d.tar', maxcount=15_000, compress=True)
test_writer = wds.ShardWriter(f'file:{data_folder}/test-%06d.tar', maxcount=15_000, compress=True)

# writing file:./data/train-000000.tar 0 0.0 GB 0
# writing file:./data/val-000000.tar 0 0.0 GB 0
# writing file:./data/test-000000.tar 0 0.0 GB 0


In [25]:
# Hacky, but simpler than building a class to hold the writers and perform the routing.
def get_writer(type: str):
    """Return the shard writer responsible for a dataset split.

    Args:
        type: Split name; one of ``'train'``, ``'val'`` or ``'test'``. (The
            parameter name shadows the builtin but is kept so existing calls
            keep working.)

    Returns:
        The module-level ``train_writer``, ``val_writer`` or ``test_writer``.

    Raises:
        ValueError: If ``type`` is not one of the three known split names.
    """
    if type == 'train':
        return train_writer
    if type == 'val':
        return val_writer
    if type == 'test':
        return test_writer

    # Fail loudly rather than silently routing an unexpected value (a typo,
    # a missing entry, ...) into the test shards.
    raise ValueError(f'unknown dataset split: {type!r}')

In [26]:
# Turn every row of the label table into a sample and hand it to the writer
# that matches the row's split.
for _, record in df.iterrows():
    sample = prepare_img(image_dir, record['image'], record['label'])
    shard_writer = get_writer(record['type'])
    shard_writer.write(sample)

## Cleanup

In [27]:
# Close the three shard writers in the order train, val, test.
for open_writer in (train_writer, val_writer, test_writer):
    open_writer.close()