In [1]:
import pandas as pd
import numpy as np
import ast
import os
from pascal_voc_writer import Writer

## Setup

In [2]:
# File with object annotations
da = pd.read_csv('DaycaresAnnotations_original.csv')

# Filepaths to folders
filepath_to_daycares = 'Daycares/'

# Name of file that will have annotations
save_to_file = 'annotate.txt'

## Parse

In [3]:
# Annotated output
output_annot = []

###########################################################################
# DAYCARES
for r in range(len(da)):
    # Check if it has an object
    if da.loc[r, 'region_shape_attributes'] != "{}":
        # filename
        filepath = da.loc[r, 'filename']

        # Get rect attributes
        rect = ast.literal_eval(da.loc[r, 'region_shape_attributes'])
        
        # x-min
        xmin = int(rect['x'] - np.floor(rect['width']/2))
        # x-max
        xmax = int(rect['x'] + np.floor(rect['width']/2))
        # y-min
        ymin = int(rect['y'] - np.floor(rect['height']/2))
        # y-max
        ymax = int(rect['y'] + np.floor(rect['height']/2))
        
        # class_name ("1" = Daycare, "0" = NonDaycare)
        classname = "1"
        
        current_annot = filepath + "," + str(xmin) + "," + str(ymin) + "," + str(xmax) + "," + str(ymax) + "," + classname
        
        output_annot.append(current_annot)

# Remove duplicates
output_annot = list(set(output_annot))

## Check output

In [4]:
len(output_annot)

2539

In [5]:
output_annot # xmin, ymin, xmax, ymax

['1781_20100420.jpg,634,483,730,555,1',
 '17566_20151112.jpg,542,410,624,500,1',
 '1752_20170616.jpg,724,365,794,477,1',
 '18929_20141114.jpg,469,648,571,724,1',
 '1481_20150615.jpg,659,430,719,508,1',
 '1765_20160305.jpg,738,160,818,220,1',
 '1913_20170818.jpg,780,614,888,696,1',
 '19309_20171112.jpg,890,382,964,476,1',
 '19310_20161107.jpg,486,-24,568,80,1',
 '19194_20150627.jpg,832,391,922,463,1',
 '14716_20160313.jpg,1066,824,1206,902,1',
 '19117_20160311.jpg,228,533,346,631,1',
 '1431_20131030.jpg,1052,612,1102,678,1',
 '1771_20140426.jpg,741,370,841,446,1',
 '12753_20131203.jpg,652,368,784,444,1',
 '13078_20170805.jpg,107,400,235,498,1',
 '13337_20140314.jpg,675,97,785,183,1',
 '19260_20180506.jpg,781,350,869,412,1',
 '1980_20150513.jpg,607,283,691,407,1',
 '1771_20170725.jpg,740,369,840,445,1',
 '17471_20171113.jpg,801,394,881,478,1',
 '1412_20171013.jpg,565,494,631,578,1',
 '19260_20160720.jpg,781,350,869,412,1',
 '1994_20151006.jpg,688,475,794,559,1',
 '2018_20180719.jpg,458,2

## Save to file

In [6]:
# Save to FILE
with open(save_to_file, 'w') as f:
    for item in output_annot:
        f.write("%s\n" % item)

### Cleaning

In [7]:
ann = pd.read_csv(save_to_file,header=None)

In [8]:
ann

Unnamed: 0,0,1,2,3,4,5
0,1781_20100420.jpg,634,483,730,555,1
1,17566_20151112.jpg,542,410,624,500,1
2,1752_20170616.jpg,724,365,794,477,1
3,18929_20141114.jpg,469,648,571,724,1
4,1481_20150615.jpg,659,430,719,508,1
5,1765_20160305.jpg,738,160,818,220,1
6,1913_20170818.jpg,780,614,888,696,1
7,19309_20171112.jpg,890,382,964,476,1
8,19310_20161107.jpg,486,-24,568,80,1
9,19194_20150627.jpg,832,391,922,463,1


In [9]:
# Remove negatives
ann = ann.loc[ann[1] > 0]
ann = ann.loc[ann[2] > 0]
ann = ann.loc[ann[3] > 0]
ann = ann.loc[ann[4] > 0]
ann = ann.loc[ann[5] > 0]
ann[5] = 'daycare'
ann.shape

(2417, 6)

In [10]:
# Remove duplicates
dup = ann.groupby(0).count()[5]
ann = ann.loc[ann[0].isin(dup[dup < 2].index)]
ann.shape

(2364, 6)

In [11]:
print(ann.shape[0])
print(ann.shape[0]*0.8)

2364
1891.2


In [12]:
ann.iloc[0:3,]

Unnamed: 0,0,1,2,3,4,5
0,1781_20100420.jpg,634,483,730,555,daycare
1,17566_20151112.jpg,542,410,624,500,daycare
2,1752_20170616.jpg,724,365,794,477,daycare


### Sampling

In [13]:
import random
random.seed(1)
total_size = ann.shape[0]
train_prop = 0.80
train = ann.iloc[random.sample(range(total_size), int(total_size*train_prop))]
test = pd.concat([ann,train]).drop_duplicates(keep=False)

In [14]:
dc = [f for f in os.listdir(filepath_to_daycares) if f.endswith('.jpg')]
len(dc)

2540

In [15]:
import os

for r in range(train.shape[0]):
    f = train.iloc[r, 0] # filename
    os.system("cp '" + os.getcwd() + "/" + filepath_to_daycares + f + "' '" + os.getcwd() + "/train/" + f + "' ")
    
for r in range(test.shape[0]):
    f = test.iloc[r, 0] # filename
    os.system("cp '" + os.getcwd() + "/" + filepath_to_daycares + f + "' '" + os.getcwd() + "/test/" + f + "' ")

In [16]:
train = [f for f in os.listdir("train/") if f.endswith('.jpg')]
test = [f for f in os.listdir("test/") if f.endswith('.jpg')]

In [17]:
ann[ann[0].isin(train+test)].shape

(2364, 6)

# END