In [1]:
from glob import glob
from collections import Counter
import pandas as pd
import cv2
import os
import numpy as np
from tqdm import tqdm
from functools import partial
from joblib import Parallel, delayed
import sys


# Dataset names. NOTE(review): this list is not referenced by any cell visible
# in this notebook, and the later call uses 'RTSD' (upper-case) rather than
# 'rtsd' -- confirm which spelling matches the directory on disk.
datasets = ['ice', 'rtsd']
# Names of image parent directories whose crops process_path() routes to the
# 'val' split; crops from every other directory go to 'train'.
val_folders = ['2018-02-16_1515_left', '2018-03-16_1424_left', '2018-03-23_1352_right']


def _write_crop(fname, crop):
    """Write ``crop`` to ``fname`` as an image, creating parent directories as needed."""
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    cv2.imwrite(fname, crop)


def process_path(tpath, dataset):
    """Cut classification crops out of the image described by one annotation file.

    For every row of the tab-separated annotation file two crops are written
    under ``classification_data/<train|val>/``:

    * the annotated box, stored under the first two dot-separated components
      of the row's ``class`` value (``other`` when the class is missing);
    * one randomly placed background patch, stored under ``other``.

    The split is ``val`` when the image's parent directory name is listed in
    the module-level ``val_folders``, otherwise ``train``.

    Output file names are derived from the annotation path and the row number,
    so they are unique across annotation files and a re-run overwrites its own
    files instead of adding duplicates. The position and size of the
    background patch come from numpy's global RNG and are not reproducible
    unless that RNG is seeded by the caller.

    Parameters
    ----------
    tpath : str
        Path to a ``.tsv`` annotation file located below a directory named
        ``annotations``. It must provide the columns ``class``, ``xtl``,
        ``ytl``, ``xbr`` and ``ybr``; an optional ``ext`` column gives the
        image file extension (default ``jpg``).
    dataset : str
        Dataset directory. The image is looked up at
        ``<dataset>/images/<path of tpath below 'annotations'>.<ext>``; a
        relative ``dataset`` is resolved against the current working directory.

    Raises
    ------
    FileNotFoundError
        If the image file does not exist.
    OSError
        If the image exists but cannot be decoded.
    ValueError
        If a box has no area inside the image, or the image is too small to
        hold the random background patch.
    """
    start = tpath.find('annotations') + len('annotations') + 1
    rel_stem = os.path.splitext(tpath[start:])[0]
    # Flattened, per-annotation-file identifier used to build output names.
    task = os.path.join(dataset, rel_stem).replace(os.sep, '_')

    t = pd.read_csv(tpath, dtype=str, sep='\t')
    # Guard against an empty table: there is no first row to read 'ext' from.
    ext = t['ext'].iloc[0] if 'ext' in t and len(t) else 'jpg'

    impath = os.path.join(dataset, 'images', rel_stem + '.' + ext)
    if not os.path.exists(impath):
        raise FileNotFoundError('image for {} not found: {}'.format(tpath, impath))
    img = cv2.imread(impath)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('could not decode image: {}'.format(impath))
    img_h, img_w = img.shape[:2]
    folder = 'val' if os.path.basename(os.path.dirname(impath)) in val_folders else 'train'

    for idx, (_, row) in enumerate(t.iterrows()):
        # A missing class is read as NaN (a float), not a string.
        raw_class = row['class']
        class_name = '.'.join(raw_class.split('.')[:2]) if isinstance(raw_class, str) else 'other'

        xtl, ytl, xbr, ybr = (int(.5 + float(v)) for v in (row.xtl, row.ytl, row.xbr, row.ybr))
        # Clamp to the image: a negative index would wrap around in numpy
        # slicing and silently produce a wrong or empty crop.
        xtl, xbr = (min(max(v, 0), img_w) for v in (xtl, xbr))
        ytl, ybr = (min(max(v, 0), img_h) for v in (ytl, ybr))
        if xbr <= xtl or ybr <= ytl:
            raise ValueError('row {} of {} has no area inside the image: {}'.format(
                idx, tpath, (xtl, ytl, xbr, ybr)))
        _write_crop(
            'classification_data/{}/{}/{}_{}.jpg'.format(folder, class_name, task, idx),
            img[ytl:ybr, xtl:xbr])

        # Random background patch: width in [21, 64), height within +/-50% of it.
        w = np.random.randint(21, 64)
        h = w + np.random.randint(-int(.5 * w), +int(0.5 * w))
        if img_w - w - 1 <= 0 or img_h - h - 1 <= 0:
            raise ValueError('image {} with shape {} is too small for a {}x{} background patch'.format(
                impath, img.shape, w, h))
        bg_x = np.random.randint(0, img_w - w - 1)
        bg_y = np.random.randint(0, img_h - h - 1)
        # The '_bg' suffix keeps the patch distinct from a box crop whose class is also 'other'.
        _write_crop(
            'classification_data/{}/{}/{}_{}_bg.jpg'.format(folder, 'other', task, idx),
            img[bg_y:bg_y + h, bg_x:bg_x + w])

def gather_examples(dataset, root='/media/grisha/hdd/icevision', n_jobs=13):
    """Run ``process_path`` in parallel over every annotation file of a dataset.

    Annotation files are every ``*.tsv`` found below any ``annotations``
    directory under ``<root>/<dataset>``.

    Note that ``process_path`` builds the image path from ``dataset`` alone,
    so images are looked up relative to the current working directory rather
    than under ``root``.

    Parameters
    ----------
    dataset : str
        Dataset directory name below ``root``; also forwarded to
        ``process_path``.
    root : str, optional
        Directory containing the dataset directories. Defaults to the
        location that was previously hard-coded.
    n_jobs : int, optional
        Number of joblib workers. Pass ``1`` to run serially, which is
        convenient when debugging ``process_path``.
    """
    pattern = os.path.join(root, dataset, '**/annotations/**/*.tsv')
    # Sorted so that the processing order does not depend on the file system.
    tpaths = sorted(glob(pattern, recursive=True))
    Parallel(n_jobs=n_jobs)(
        delayed(process_path)(tpath, dataset=dataset) for tpath in tqdm(tpaths)
    )

In [2]:
# Extract the classification crops for the 'ice' dataset.
gather_examples('ice')

100%|██████████| 6787/6787 [00:36<00:00, 183.47it/s]


In [3]:
# Extract the classification crops for the RTSD dataset.
# NOTE(review): the `datasets` list spells this 'rtsd'; the directory name is
# case-sensitive on most file systems -- confirm 'RTSD' is the on-disk name.
gather_examples('RTSD')

100%|██████████| 59188/59188 [01:07<00:00, 882.19it/s]


In [4]:
from shutil import move, rmtree

# Make sure every class has validation images.
# Classes that already received validation crops are left untouched.
# For the others, about 5% of the training images (at least one) are moved to
# the validation folder; classes with fewer than 5 training images are dropped
# from both splits.
# The file lists are sorted and shuffled with a fixed seed so the split is the
# same on every run over the same files.
split_rng = np.random.RandomState(0)
for folder in sorted(glob("classification_data/train/*")):
    # Build the val path from the class name only: a substring replace over
    # the whole path would also rewrite 'train' inside class or file names.
    valfolder = os.path.join("classification_data/val", os.path.basename(folder))
    os.makedirs(valfolder, exist_ok=True)

    if len(glob(os.path.join(valfolder, '*'))) != 0:
        continue

    train_imgs = sorted(glob(os.path.join(folder, '*')))
    if len(train_imgs) < 5:
        rmtree(folder)
        rmtree(valfolder)
        continue

    split_rng.shuffle(train_imgs)
    to_take = int(max(1, 0.05 * len(train_imgs)))
    for path in train_imgs[:to_take]:
        move(path, os.path.join(valfolder, os.path.basename(path)))

In [5]:
# Remove validation classes that have no matching training folder and print
# each removed path. `rmtree` is the name imported from shutil in the
# previous cell.
for folder in glob("classification_data/val/*"):
    # Build the train path from the class name only: a substring replace over
    # the whole path would also rewrite 'val' inside a class name.
    trainfolder = os.path.join("classification_data/train", os.path.basename(folder))

    if not os.path.exists(trainfolder):
        rmtree(folder)
        print(folder)

classification_data/val/1.3
classification_data/val/6.18
classification_data/val/6.12
