In [25]:
import pandas as pd
import numpy as np
import ast
import os
from pascal_voc_writer import Writer

## Setup

In [26]:
# File with object annotations
da = pd.read_csv('DaycaresAnnotations_original.csv')

# Filepaths to folders
filepath_to_daycares = 'Daycares/'

# Name of file that will have annotations
save_to_file = 'annotate.txt'

## Parse

In [27]:
# Annotated output
output_annot = []

###########################################################################
# DAYCARES
for r in range(len(da)):
    # Check if it has an object
    if da.loc[r, 'region_shape_attributes'] != "{}":
        # filename
        filepath = da.loc[r, 'filename']

        # Get rect attributes
        rect = ast.literal_eval(da.loc[r, 'region_shape_attributes'])
        
        # x-min
        xmin = int(rect['x'])
        # x-max
        xmax = int(rect['x'] + rect['width'])
        # y-min
        ymin = int(rect['y'])
        # y-max
        ymax = int(rect['y'] + rect['height'])
        
        # class_name ("1" = Daycare, "0" = NonDaycare)
        classname = "1"
        
        current_annot = filepath + "," + str(xmin) + "," + str(ymin) + "," + str(xmax) + "," + str(ymax) + "," + classname
        
        output_annot.append(current_annot)

# Remove duplicates
output_annot = list(set(output_annot))

## Check output

In [28]:
len(output_annot)

2539

In [29]:
output_annot # xmin, ymin, xmax, ymax

['1910_20160117.jpg,605,209,686,318,1',
 '1505_20110904.jpg,1176,391,1262,452,1',
 '1910_20120812.jpg,605,209,686,318,1',
 '18470_20180531.jpg,690,277,772,364,1',
 '1714_20161009.jpg,178,231,274,299,1',
 '1892_20140201.jpg,677,606,696,634,1',
 '18926_20131004.jpg,640,463,727,552,1',
 '18927_20160719.jpg,627,679,707,767,1',
 '1781_20170808.jpg,682,519,778,591,1',
 '1752_20160414.jpg,754,422,825,534,1',
 '14716_20150615.jpg,1113,863,1253,941,1',
 '19117_20151128.jpg,287,582,406,681,1',
 '1742_20150419.jpg,762,403,843,502,1',
 '1713_20160625.jpg,814,474,875,569,1',
 '19310_20171112.jpg,658,303,738,398,1',
 '1980_20160313.jpg,649,345,733,470,1',
 '19310_20150419.jpg,656,294,736,389,1',
 '1481_20150615.jpg,689,469,750,548,1',
 '1853_20140316.jpg,664,381,770,457,1',
 '2080_20140407.jpg,660,202,743,306,1',
 '1980_20140120.jpg,649,345,733,470,1',
 '1747_20130331.jpg,638,482,709,588,1',
 '18926_20140902.jpg,653,463,740,552,1',
 '19212_20150519.jpg,748,571,851,657,1',
 '1693_20130201.jpg,694,584

## Save to file

In [30]:
# Save to FILE
with open(save_to_file, 'w') as f:
    for item in output_annot:
        f.write("%s\n" % item)

### Cleaning

In [31]:
ann = pd.read_csv(save_to_file,header=None)

In [32]:
ann

Unnamed: 0,0,1,2,3,4,5
0,1910_20160117.jpg,605,209,686,318,1
1,1505_20110904.jpg,1176,391,1262,452,1
2,1910_20120812.jpg,605,209,686,318,1
3,18470_20180531.jpg,690,277,772,364,1
4,1714_20161009.jpg,178,231,274,299,1
5,1892_20140201.jpg,677,606,696,634,1
6,18926_20131004.jpg,640,463,727,552,1
7,18927_20160719.jpg,627,679,707,767,1
8,1781_20170808.jpg,682,519,778,591,1
9,1752_20160414.jpg,754,422,825,534,1


In [33]:
# Remove negatives
ann = ann.loc[ann[1] > 0]
ann = ann.loc[ann[2] > 0]
ann = ann.loc[ann[3] > 0]
ann = ann.loc[ann[4] > 0]
ann = ann.loc[ann[5] > 0]
ann[5] = 'daycare'
ann.shape

(2528, 6)

In [34]:
# Remove duplicates
dup = ann.groupby(0).count()[5]
ann = ann.loc[ann[0].isin(dup[dup < 2].index)]
ann.shape

(2439, 6)

In [35]:
print(ann.shape[0])
print(ann.shape[0]*0.8)

2439
1951.2


In [36]:
ann.iloc[0:3,]

Unnamed: 0,0,1,2,3,4,5
0,1910_20160117.jpg,605,209,686,318,daycare
1,1505_20110904.jpg,1176,391,1262,452,daycare
2,1910_20120812.jpg,605,209,686,318,daycare


### Sampling

In [37]:
import random
random.seed(1)
total_size = ann.shape[0]
train_prop = 0.80
train = ann.iloc[random.sample(range(total_size), int(total_size*train_prop))]
test = pd.concat([ann,train]).drop_duplicates(keep=False)

In [38]:
dc = [f for f in os.listdir(filepath_to_daycares) if f.endswith('.jpg')]
len(dc)

2540

In [39]:
import os

for r in range(train.shape[0]):
    f = train.iloc[r, 0] # filename
    os.system("cp '" + os.getcwd() + "/" + filepath_to_daycares + f + "' '" + os.getcwd() + "/train/" + f + "' ")
    
for r in range(test.shape[0]):
    f = test.iloc[r, 0] # filename
    os.system("cp '" + os.getcwd() + "/" + filepath_to_daycares + f + "' '" + os.getcwd() + "/test/" + f + "' ")

In [40]:
train = [f for f in os.listdir("train/") if f.endswith('.jpg')]
test = [f for f in os.listdir("test/") if f.endswith('.jpg')]

In [41]:
ann[ann[0].isin(train+test)].shape

(2439, 6)

# END

Files needed for SSD specifically

In [42]:
test = [f for f in os.listdir("test/") if f.endswith('.jpg')]
# Save to FILE
with open('test_rec.txt', 'w') as f:
    for item in test:
        f.write("%s\n" % item)

In [43]:
with open('annotations.txt', 'w') as f:
    for r in range(ann.shape[0]):
        output = ann.iloc[r, 0] + ",1501,944,daycare"
        output += "," + str(ann.iloc[r, 1])
        output += "," + str(ann.iloc[r, 2])
        output += "," + str(ann.iloc[r, 3])
        output += "," + str(ann.iloc[r, 4])
        f.write("%s\n" % output)

Manually add header on 'annotations.txt' and make it .csv

In [47]:
ann[ann[0] == '1781_20100420.jpg']

Unnamed: 0,0,1,2,3,4,5
2228,1781_20100420.jpg,682,519,778,591,daycare
