In [169]:
import pandas as pd
import os
import numpy as np
import urllib.request
from shutil import copy, rmtree
from tqdm import tqdm_notebook

In [83]:
# When True, the existing train and test folders are removed before the split is copied again.
CLEAN_TRAIN_TEST_FOLDERS = True

In [104]:
# Every path used by the notebook hangs off this single root directory.
main_folder = 'data'

# images: downloaded pictures; train / test: destination folders of the split.
data_folder, train_folder, test_folder = (
    os.path.join(main_folder, sub_dir) for sub_dir in ('images', 'train', 'test')
)

# Download Images

In [48]:
# Read the three metadata tables from the configured root folder (main_folder)
# instead of repeating the literal 'data/' prefix, so the location is defined once.

# Image index; later cells read its `image_name` and `image_url` columns.
boxable = pd.read_csv(os.path.join(main_folder, 'train-images-boxable.csv'))
# Class table without a header row, hence the integer column labels 0 and 1.
descr = pd.read_csv(os.path.join(main_folder, 'classes-description.csv'), header=None)
# One row per bounding box (ImageID, LabelName, XMin, XMax, YMin, YMax, ...).
bbox_annotation = pd.read_csv(os.path.join(main_folder, 'train-annotations-bbox.csv'))

In [142]:
# Display the class table: column 0 holds the label code, column 1 the readable class name.
descr

Unnamed: 0,0,1
0,/m/061hd_,Infant bed
1,/m/06m11,Rose
2,/m/03120,Flag
3,/m/01kb5b,Flashlight
4,/m/0120dh,Sea turtle
...,...,...
495,/m/05vtc,Potato
496,/m/02w3r3,Paper towel
497,/m/054xkw,Lifejacket
498,/m/01bqk0,Bicycle wheel


In [143]:
# Label codes (column 0) of the first three rows of the class table.
classes = descr[0].head(3).tolist()
classes

['/m/061hd_', '/m/06m11', '/m/03120']

In [54]:
# Peek at the annotation table: one row per bounding box, coordinates given as fractions of the image size.
bbox_annotation.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0
2,000002b66c9c498e,xclick,/m/01g317,1,0.151562,0.310937,0.198437,0.590625,1,0,0,0,0
3,000002b66c9c498e,xclick,/m/01g317,1,0.25625,0.429688,0.651563,0.925,1,0,0,0,0
4,000002b66c9c498e,xclick,/m/01g317,1,0.257812,0.346875,0.235938,0.385938,1,0,0,0,0


In [64]:
number_pic = 500   # number of annotation rows per class that are considered for download
clas = '/m/03120'  # NOTE(review): not read by the loop below; kept in case another cell uses it

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(data_folder, exist_ok=True)

# Build the file-name -> URL lookup once instead of filtering `boxable` for every image.
# keep='first' mirrors the earlier `.iloc[0]` choice when a name occurs more than once.
url_by_name = (
    boxable.drop_duplicates('image_name', keep='first')
           .set_index('image_name')['image_url']
)

for label in classes:
    images_id = bbox_annotation.loc[bbox_annotation.LabelName == label, 'ImageID'].tolist()

    # Slicing cannot run past the end of the list when a class has fewer than
    # number_pic rows (indexing up to number_pic raised IndexError in that case).
    # An image has one annotation row per box, so the same ID repeats; dict.fromkeys
    # drops the repeats while keeping the order, so each file is fetched only once.
    # The set of downloaded files is the same as before, it may hold fewer than
    # number_pic distinct images.
    unique_ids = list(dict.fromkeys(images_id[:number_pic]))

    for img_id in tqdm_notebook(unique_ids, desc='NUMBER OF PICTURES'):
        img = img_id + '.jpg'
        # A name missing from `boxable` raises KeyError here.
        urllib.request.urlretrieve(url_by_name[img], os.path.join(data_folder, img))

HBox(children=(IntProgress(value=0, description='NUMBER OF PICTURES', max=500, style=ProgressStyle(description…




# Create train and test data

In [173]:
# os.listdir returns entries in arbitrary order, so sort them to get a stable
# starting point, and keep only the downloaded .jpg files (anything else in the
# folder would otherwise be copied into the split as if it were an image).
images = sorted(
    os.path.join(data_folder, img_path)
    for img_path in os.listdir(data_folder)
    if img_path.endswith('.jpg')
)

# Shuffle with a dedicated, seeded generator: the same set of images always gives
# the same train/test split, so re-running the notebook cannot move an image from
# one split to the other.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(images)

In [174]:
# The first 90 % of the shuffled paths become the training set, the remainder the test set.
border = int(len(images)*0.9)
train, test = images[:border], images[border:]

In [175]:
# Report how many image paths landed in each split.
print(f'Train images size: {len(train)} \n Test images size: {len(test)}')

Train images size: 709 
 Test images size: 79


In [172]:
# Start from empty train/test folders so files from an earlier split cannot linger.
if CLEAN_TRAIN_TEST_FOLDERS:
    for folder in (train_folder, test_folder):
        # rmtree raises FileNotFoundError for a missing path (e.g. on the very first
        # run), so only remove folders that are actually there.
        if os.path.isdir(folder):
            rmtree(folder)

In [176]:
# Create whichever destination folder is still missing.
for folder in (train_folder, test_folder):
    if not os.path.exists(folder):
        os.mkdir(folder)

len_train, len_test = len(train), len(test)

# Copy every image path of a split into its folder, with one progress bar per split.
copy_jobs = (
    (train, train_folder, 'TRAIN COPY'),
    (test, test_folder, 'TEST COPY'),
)
for split_paths, target_folder, caption in copy_jobs:
    for img_path in tqdm_notebook(split_paths, desc=caption):
        copy(img_path, target_folder)

HBox(children=(IntProgress(value=0, description='TRAIN COPY', max=709, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='TEST COPY', max=79, style=ProgressStyle(description_width='in…




In [177]:
# Turn each image path into its bare image ID (file name without extension).
# os.path.basename uses the separator of the running platform, whereas splitting on
# '\\' only worked on Windows: elsewhere the whole path was kept, no ImageID matched
# and the label tables came out empty.
train = [os.path.splitext(os.path.basename(img))[0] for img in train]
test = [os.path.splitext(os.path.basename(img))[0] for img in test]

In [178]:
# Both label tables start from the annotation rows of the selected classes;
# the row mask is computed once and applied twice.
in_selected_classes = bbox_annotation['LabelName'].isin(classes)
y_train = bbox_annotation[in_selected_classes]
y_test = bbox_annotation[in_selected_classes]

In [180]:
# Keep only the annotation rows whose image ID belongs to the matching split.
y_train = y_train.loc[y_train['ImageID'].isin(train)]
y_test = y_test.loc[y_test['ImageID'].isin(test)]

In [181]:
# One shared mapping from the annotation column names to the names used in the label files.
column_names = {
    'ImageID': 'filename',
    'LabelName': 'class',
    'XMin': 'xmin',
    'XMax': 'xmax',
    'YMin': 'ymin',
    'YMax': 'ymax',
}

y_train = y_train.rename(columns=column_names)
y_test = y_test.rename(columns=column_names)

In [182]:
# Append constant width and height columns (in that order) to both label tables.
# NOTE(review): every row gets 1024 x 1024; the real image dimensions are never read
# from the files -- confirm that whatever consumes these labels is fine with that.
y_train = y_train.assign(width=1024, height=1024)
y_test = y_test.assign(width=1024, height=1024)

In [183]:
# Turn the image IDs back into file names. The suffix is added only when it is not
# already present, so running this cell a second time no longer produces '.jpg.jpg'.
y_train['filename'] = [img if img.endswith('.jpg') else img + '.jpg' for img in y_train['filename']]
y_test['filename'] = [img if img.endswith('.jpg') else img + '.jpg' for img in y_test['filename']]

In [184]:
# Check the finished training labels: renamed columns, '.jpg' file names, added width/height.
y_train.head()

Unnamed: 0,filename,Source,class,Confidence,xmin,xmax,ymin,ymax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside,width,height
1896,0000f8604a4e2cfe.jpg,xclick,/m/03120,1,0.72375,0.86,0.154145,0.838083,1,0,0,0,0,1024,1024
3846,00027f4e7a1c370f.jpg,xclick,/m/03120,1,0.005757,0.306743,0.431875,0.650625,1,1,0,0,0,1024,1024
3847,00027f4e7a1c370f.jpg,xclick,/m/03120,1,0.148026,0.999178,0.0,0.278125,1,1,0,0,0,1024,1024
3848,00027f4e7a1c370f.jpg,xclick,/m/03120,1,0.904605,0.999178,0.42125,0.633125,1,1,0,0,0,1024,1024
4005,0002ae796e1f8eb5.jpg,xclick,/m/03120,1,0.046414,0.191983,0.532399,0.628722,1,0,0,1,0,1024,1024


In [185]:
# Write each label table, without its index column, next to the image folders.
for labels, csv_name in ((y_train, 'train_labels.csv'), (y_test, 'test_labels.csv')):
    labels.to_csv(os.path.join(main_folder, csv_name), index=False, index_label=False)