# Data preparation


## File list
The directory structure of the actual annotations is quite odd.

```
-- Annotation DIR
    |
    |
    |__CAT 1
    |   |
    |   |_d1
    |   |_d2
    |   |...
    |   |_dN
    |
    |__CAT 2
    |   |
    |   |_d1
    |   |_d2
    |   |...
    |   |_dN
    |
    |__CAT 3
    |   |
    |   |_d1
    |   |_d2
    |   |...
    |   |_dN
```


Below is some path magic to extract the filenames of both the images and the XML annotations.

In [None]:
from pathlib import Path, PosixPath
import itertools
from utility import xml2df

* set the `PATH` to the annotations directory 

In [None]:
annotations = '/home/epinux/annotate2/'

* Generate a list with the image files

In [None]:
# Walk <annotations>/<category>/<sub-directory>/ and collect every JPEG as a
# string path. Every level is sorted (including the files themselves) so the
# resulting order does not depend on the filesystem's directory ordering.
jpeg_files = [
    str(image_path)
    for category_dir in sorted(Path(annotations).glob('*'))
    for sub_dir in sorted(category_dir.glob('*'))
    for image_path in sorted(sub_dir.glob('*.jpg'))
]

In [None]:
len(jpeg_files)

In [None]:
#jpeg_files

* Generate a list with the xml annotations files

In [None]:
# Walk <annotations>/<category>/<sub-directory>/ and collect every XML
# annotation as a string path. Every level is sorted (including the files
# themselves): a seeded random mask is later applied to this list by position,
# so a stable order is required for the subsample to be reproducible.
xml_files = [
    str(xml_path)
    for category_dir in sorted(Path(annotations).glob('*'))
    for sub_dir in sorted(category_dir.glob('*'))
    for xml_path in sorted(sub_dir.glob('*.xml'))
]

In [None]:
len(xml_files)

## Subsampling

Reduce the number of sampled images by applying a `reduce_factor` $R_f$ to the `xml` annotation file list.

Starting value:

$$
R_f=0.4 \quad \text{which will reduce the number of files to roughly 40\% of the total}
$$

In [None]:
import numpy as np
# Probability with which each annotation file is kept in the subsample.
reduce_factor = 0.4
# Fixed seed so the same random mask is drawn on every run.
np.random.seed(0)
# One random draw per file; True where the draw falls below reduce_factor.
# Files are kept independently, so the sample size is only approximately
# reduce_factor * len(xml_files).
msk = np.random.rand(len(xml_files), ) < reduce_factor
# Boolean-index the file list with the mask and turn the result back into a list.
sample = list(np.array(xml_files)[msk])

In [None]:
len(sample)

In [None]:
sample[:5]

## Image collection

Copy the images related to the annotations that are going to be used and store them in a zip file.

In [None]:
!mkdir -p results/samples

In [None]:
!mkdir -p results/samples/train

In [None]:
!mkdir -p results/samples/test

In [None]:
!mkdir -p results/samples/validation

In [None]:
from shutil import copyfile

# Copy every sampled annotation, together with the image that sits next to it
# (same path, .jpg extension), into the flat results/samples directory.
for src_xml in sample:
    # Swap only the file extension; a plain str.replace would also rewrite a
    # '.xml' that happens to appear in a directory name.
    src_img = str(Path(src_xml).with_suffix('.jpg'))
    dst_img = 'results/samples' / Path(src_img.split('/')[-1])
    dst_xml = 'results/samples' / Path(src_xml.split('/')[-1])
    try:
        # The image is copied first: if it is missing, the annotation is not
        # copied either, so no annotation ends up without its image.
        copyfile(src_img, dst_img)
        copyfile(src_xml, dst_xml)
    except OSError as err:
        # Best effort: report the pair that could not be copied and carry on.
        print('skipping: ', src_xml, err)

In [None]:
xml_list = !ls results/samples | grep '.xml'
jpg_list = !ls results/samples | grep '.jpg'
xml_list[:5], jpg_list[:5]
len(xml_list), len(jpg_list)

In [None]:
!mkdir -p results/samples/sd_annotation

## Annotation Parsing

Parse each `xml` file and store the results as a `pandas.DataFrame`.


In [None]:
records = xml2df(xml_list)

In [None]:
records.shape

Get an idea of which labels are in all the annotations

In [None]:
list(records['class'].unique())

In [None]:
records.head()

In [None]:
records.describe()

In [None]:
%matplotlib inline

Simple statistical description of the sample

In [None]:
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Bar chart with one bar per annotation class; bar height is the number of
# rows in `records` carrying that class.
class_counts = records['class'].value_counts()
plt.figure(figsize=(20, 10))
class_counts.plot(kind='bar')
plt.tight_layout()
plt.show()
plt.close()

In [None]:
# import holoviews as hv
# hv.extension('bokeh')
# bars = hv.Bars(records['class'].value_counts(), hv.Dimension('index'), 'class').options(width=900, height=500, xrotation= 38)
# bars

Extract only the `sand dollars` annotations

In [None]:
sand_dollars = records[records['class']=="sand dollar"]

In [None]:
sand_dollars.head()

## Train, Test & Validation

Split the sample into training ($70\%$) and testing ($30\%$) datasets; $20\%$ of the testing set is then held out as a validation set.

In [None]:
len(sand_dollars['filename'].unique())

In [None]:
import numpy as np

# Fixed seed so the split is the same on every run.
np.random.seed(0)

# The split is done per image (unique file name), not per annotation row, so
# all boxes of one image end up in the same subset.
image_names = sand_dollars['filename'].unique()

# Each image goes to the training set with probability 0.7.
train_mask = np.random.rand(len(image_names)) < 0.7
train = list(image_names[train_mask])

# Of the remaining images, each goes to validation with probability 0.2;
# the rest form the test set.
held_out = image_names[~train_mask]
validation_msk = np.random.rand(len(held_out)) < 0.2
validation = list(held_out[validation_msk])
test = list(held_out[~validation_msk])


In [None]:
len(train), len(test), len(validation)

In [None]:
#train

## Generate new `xml` annotation files 

These new files will contain only Sand Dollar annotations.

In [None]:
grouped = sand_dollars.groupby('filename')

In [None]:
sand_dollars['filename'].unique().shape

In [None]:
from utility import df2xml

In [None]:
df2xml(grouped, output_dir='results/samples/sd_annotation')

In [None]:
xml = !ls results/samples/sd_annotation | grep xml
len(xml)

In [None]:
import shutil

# Move every training image and its sand-dollar-only annotation into
# results/samples/train. Failures are reported and skipped (best effort).
for i in train:
    xmlfile = i.replace('.jpg', '.xml')
    try:
        shutil.move('results/samples/'+i , 'results/samples/train/'+i)
    except OSError as err:
        # Only filesystem errors are tolerated; anything else should surface.
        print('skipping :', i, err)
    try:
        shutil.move('results/samples/sd_annotation/'+xmlfile , 'results/samples/train/'+xmlfile)
    except OSError as err:
        print('skipping :', xmlfile, err)

In [None]:
# Move every test image and its sand-dollar-only annotation into
# results/samples/test. Failures are reported and skipped (best effort).
for i in test:
    xmlfile = i.replace('.jpg', '.xml')
    try:
        shutil.move('results/samples/'+i , 'results/samples/test/'+i)
    except OSError as err:
        # Only filesystem errors are tolerated; anything else should surface.
        print('skipping :', i, err)
    try:
        shutil.move('results/samples/sd_annotation/'+xmlfile , 'results/samples/test/'+xmlfile)
    except OSError as err:
        print('skipping :', xmlfile, err)

In [None]:
# Move every validation image and its sand-dollar-only annotation into
# results/samples/validation. Failures are reported and skipped (best effort).
for i in validation:
    xmlfile = i.replace('.jpg', '.xml')
    try:
        shutil.move('results/samples/'+i , 'results/samples/validation/'+i)
    except OSError as err:
        # Only filesystem errors are tolerated; anything else should surface.
        print('skipping :', i, err)
    try:
        shutil.move('results/samples/sd_annotation/'+xmlfile , 'results/samples/validation/'+xmlfile)
    except OSError as err:
        print('skipping :', xmlfile, err)
        

In [None]:
!mkdir -p results/sample_extra_noSD

In [None]:
xml_tomove = !ls results/samples | grep ".xml"
img_tomove = !ls results/samples | grep ".jpg"

# Whatever is still directly inside results/samples at this point was not
# assigned to train/test/validation; park those annotations in
# results/sample_extra_noSD.
for i in xml_tomove:
    try:
        shutil.move('results/samples/'+i , 'results/sample_extra_noSD/'+i)
    except OSError as err:
        # Only filesystem errors are tolerated; anything else should surface.
        print('skipping :', i, err)

# Same for the leftover images.
for i in img_tomove:
    try:
        shutil.move('results/samples/'+i , 'results/sample_extra_noSD/'+i)
    except OSError as err:
        print('skipping :', i, err)

In [None]:
import os
import zipfile

def zipdir(path, ziph):
    """Add every file found under ``path`` to an open zip archive.

    The directory tree is traversed with ``os.walk`` and each regular file is
    written under the same path it was found at (``path`` joined with its
    location in the tree), so the directory prefix is kept inside the archive.

    Args:
        path: root directory whose files are archived.
        ziph: writable ``zipfile.ZipFile`` handle; it is not closed here.
    """
    file_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
    )
    for file_path in file_paths:
        ziph.write(file_path)

# Pack the whole results/samples tree into a single compressed archive.
# The context manager guarantees the archive is finalised and closed even if
# zipdir raises part-way through.
with zipfile.ZipFile('results/samples.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('results/samples/', zipf)

## Create TF Records

In [None]:
# Boolean row mask: True where the annotation's image belongs to the training split.
# NOTE(review): the name `filter` shadows Python's built-in filter() for the
# rest of the session; it is kept because the next cell reads it.
filter = sand_dollars["filename"].isin(train)

In [None]:
train_df = sand_dollars[filter]

In [None]:
# for converting the csv/pandas dataframe into TFRecord format
# https://stackoverflow.com/questions/41402332/tensorflow-create-a-tfrecords-file-from-csv

import pandas as pd
import tensorflow as tf
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# The function takes in the all of the features of a single annotation instance as a list, 
# and then also the label as its own variable
# it creates a TFRecord (see below cell for how the format looks, similar to XML)

def create_tf_example(features, label):
    """Build a ``tf.train.Example`` describing one bounding-box annotation.

    Args:
        features: indexable sequence read by position: 0 is the file name
            (a string), 1 and 2 are the image width and height, and 3 to 6
            are xmin, ymin, xmax and ymax. The numeric entries are passed
            through ``int()``.
        label: class label string, stored under the ``'class'`` key.

    Returns:
        The populated ``tf.train.Example``.
    """

    def _bytes_feature(text):
        # Strings are stored UTF-8 encoded in a single-element bytes list.
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[text.encode('utf-8')]))

    def _int64_feature(number):
        # Numbers are coerced with int() and stored in a single-element int64 list.
        return tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(number)]))

    feature_map = {
        'filename': _bytes_feature(features[0]),
        'width': _int64_feature(features[1]),
        'height': _int64_feature(features[2]),
        'class': _bytes_feature(label),
        'xmin': _int64_feature(features[3]),
        'ymin': _int64_feature(features[4]),
        'xmax': _int64_feature(features[5]),
        'ymax': _int64_feature(features[6]),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))


In [None]:
# Convert every row of the training dataframe (one row per annotation) into a
# tf.train.Example and append it to "dataset.tfrecords" in the working directory.
# Columns are read by position: position 3 is taken as the class label and all
# other columns are passed on as features.
# NOTE(review): assumes the column order is filename, width, height, class,
# xmin, ymin, xmax, ymax - confirm against what xml2df produces.
# NOTE(review): tf.python_io is the TensorFlow 1.x location of TFRecordWriter;
# under TensorFlow 2.x it is tf.io.TFRecordWriter - confirm the installed version.
with tf.python_io.TFRecordWriter("dataset.tfrecords") as writer:
    for _, row in train_df.iterrows():
        # All of the features in the row, minus the class label.
        # .iloc makes the positional access explicit instead of relying on
        # integer keys falling back to positions on a label-indexed Series.
        features = np.array(list(row.iloc[0:3].values) + list(row.iloc[4:].values))
        # Just the class label.
        label = row.iloc[3]
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# The with-block closes the writer on exit; no explicit close() is needed.

In [None]:
!gist 01_data_preparation.ipynb