In [1]:
import tarfile
import os
import pickle

import cv2

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from mpl_toolkits.mplot3d.axes3d import Axes3D
import numpy as np
import pandas as pd

import xml.etree.ElementTree as ET



%matplotlib inline

In [2]:
def parse_file(filename):
    """Read a PASCAL VOC XML annotation file and list the objects it declares.

    Parameters
    ----------
    filename : path of the XML file, handed to ``ET.parse``.

    Returns
    -------
    list of dict
        One dict per ``<object>`` element, in document order, with keys:
        ``'name'`` - text of the object's ``<name>`` element;
        ``'bbox'`` - ``[xmin, ymin, width, height]`` as ints, where
        ``width = xmax - xmin`` and ``height = ymax - ymin``.
    """
    def _coord(box, tag):
        # Go through float() first so values written as "12.0" are accepted,
        # then truncate to an integer pixel coordinate.
        return int(float(box.find(tag).text))

    annotations = []
    for node in ET.parse(filename).findall('object'):
        label = node.find('name').text
        box = node.find('bndbox')
        x_min = _coord(box, 'xmin')
        y_min = _coord(box, 'ymin')
        x_max = _coord(box, 'xmax')
        y_max = _coord(box, 'ymax')
        annotations.append({
            'name': label,
            'bbox': [x_min, y_min, x_max - x_min, y_max - y_min],
        })

    return annotations

In [18]:
# Dataset roots. The cells below concatenate these as
# data_path + (img_path | annotations_path) + <split directory>.
data_path = 'Data/'
img_path = 'images/'
annotations_path = 'annotations/'

# Split sub-directories, appended to both the image and the annotation root.
train_path, test_path, val_path = 'train/', 'test/', 'validation/'

Create a DataFrame that stores the bounding-box coordinates for each image.

In [30]:
def data_to_df(path_images, path_annotations):
    """
    Load the bounding-box annotations of every image in a directory into a dataframe.

    For each regular, non-hidden file in ``path_images`` the annotation file
    ``<path_annotations>/<image name without extension>.xml`` is parsed with
    ``parse_file`` and one row is emitted per annotated object.

    Parameters
    ----------
    path_images : directory containing the image files.
    path_annotations : directory containing the matching ``.xml`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename`` (image name without extension), ``xmin``,
        ``ymin``, ``xmax``, ``ymax`` (integer corner coordinates), indexed
        0..n-1. The columns are present even when no object is found.

    Notes
    -----
    A missing or malformed annotation file is not swallowed: the error raised
    while parsing it propagates to the caller.
    """
    columns = ['filename', 'xmin', 'ymin', 'xmax', 'ymax']

    # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and directories,
    # strip the extension whatever its length, and sort so that the row order
    # does not depend on the arbitrary order returned by os.listdir.
    stems = sorted(
        os.path.splitext(entry)[0]
        for entry in os.listdir(path_images)
        if not entry.startswith('.')
        and os.path.isfile(os.path.join(path_images, entry))
    )

    # Collect plain records and build the frame once: growing a DataFrame row
    # by row with .loc is quadratic and upcasts the integer coordinates to float.
    records = []
    for stem in stems:
        for obj in parse_file(os.path.join(path_annotations, stem + '.xml')):
            # parse_file stores boxes as [xmin, ymin, width, height];
            # convert back to corner coordinates.
            x, y, width, height = obj['bbox']
            records.append({
                'filename': stem,
                'xmin': x,
                'ymin': y,
                'xmax': x + width,
                'ymax': y + height,
            })

    return pd.DataFrame(records, columns=columns)

In [39]:
# Build the annotation table for the training split.
tr_anno = data_path + annotations_path + train_path
df_train = data_to_df(data_path + img_path + train_path, tr_anno)

# Save the table to disk. The file handle is not named `df`, so that no
# non-DataFrame object ends up bound to that name in the notebook namespace.
with open('df_train.pickle', 'wb') as pickle_file:
    pickle.dump(df_train, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
# Build the annotation table for the test split.
te_anno = data_path + annotations_path + test_path
df_test = data_to_df(data_path + img_path + test_path, te_anno)

# Save the table to disk. The file handle is not named `df`, so that no
# non-DataFrame object ends up bound to that name in the notebook namespace.
with open('df_test.pickle', 'wb') as pickle_file:
    pickle.dump(df_test, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# Build the annotation table for the validation split.
val_anno = data_path + annotations_path + val_path
df_val = data_to_df(data_path + img_path + val_path, val_anno)

# Save the table to disk. The file handle is not named `df`, so that no
# non-DataFrame object ends up bound to that name in the notebook namespace.
with open('df_val.pickle', 'wb') as pickle_file:
    pickle.dump(df_val, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# Number of distinct image names that have at least one row in each split's table.
print(f"Nb of train images : {df_train['filename'].nunique()}")
print(f"Nb of test images : {df_test['filename'].nunique()}")
print(f"Nb of validation images : {df_val['filename'].nunique()}")

Nb of train images : 532
Nb of test images : 33
Nb of validation images : 103


# Annotations for a potential R-CNN model

In [12]:
def formating(data_dir, row, class_name='Varroa'):
    """
    Build one annotation line in the format used for R-CNN training:
        filepath,x1,y1,x2,y2,class_name

    Parameters
    ----------
    data_dir : str
        Prefix prepended to the image name (expected to end with a path
        separator, since it is concatenated as-is).
    row : sequence or pandas.Series
        Its first five values, in order, are the image name without
        extension followed by xmin, ymin, xmax, ymax. Extra values are ignored.
    class_name : str, optional
        Label written as the last field. Defaults to 'Varroa'.

    Returns
    -------
    str
        ``<data_dir><name>.jpg,<xmin>,<ymin>,<xmax>,<ymax>,<class_name>`` with
        the four coordinates converted to ints.
    """
    # Read the values by position through plain iteration. Integer keys on a
    # label-indexed Series (row[0]) rely on a positional fallback that newer
    # pandas versions deprecate.
    filename, xmin, ymin, xmax, ymax = tuple(row)[:5]

    # Coordinates may arrive as floats (e.g. 12.0); write them as integers.
    coords = ','.join(str(int(value)) for value in (xmin, ymin, xmax, ymax))
    return data_dir + filename + '.jpg,' + coords + ',' + class_name

In [13]:
train_dir = data_path + img_path + train_path

# Image paths are written with a leading '../'
# NOTE(review): presumably because the training code runs one directory below
# this notebook - confirm.
rcnn_train_dir = '../' + train_dir

# One formatted annotation line per bounding-box row of df_train.
train_anno = pd.DataFrame()
train_anno['format'] = df_train.apply(
    lambda row: formating(rcnn_train_dir, row),
    axis=1,
)

In [14]:
# Preview the first rows of the formatted annotation lines.
train_anno.head()

Unnamed: 0,format
0,../Data/images/train/59023fbd579e52581ddede9f_...
1,../Data/images/train/59023fbd579e52581ddede9f_...
2,../Data/images/train/59023fbd579e52581ddede9f_...
3,../Data/images/train/59023fbd579e52581ddede9f_...
4,../Data/images/train/59023fbd579e52581ddede9f_...


In [15]:
# Write one annotation line per row, without header or index column.
# `header` and `index` are boolean switches, so pass False rather than None.
# The space separator does not occur inside a line as long as the image paths
# contain no spaces; a path with a space would make pandas quote the field.
train_anno.to_csv('annotate.txt', header=False, index=False, sep=' ')