# Data preparation


## File list
The directory structure of the actual annotations is quite odd.
The cells below use some `pathlib` magic to extract the filenames of both the images and the `xml` annotations.

In [None]:
from pathlib import Path, PosixPath

In [None]:
import itertools

In [None]:
# set the path to where the annotations are

In [None]:
import os

# Root directory that holds the annotations. Export ANNOTATIONS_DIR to point
# the notebook at another location without editing it; when the variable is
# not set the default path below is used.
annotations = os.environ.get('ANNOTATIONS_DIR', '/home/epinux/annotate2/')

In [None]:
# The images sit two directory levels below the root:
#   <annotations>/<level 1>/<level 2>/*.jpg
# Every level is sorted, including the innermost glob, so the order of the
# list does not depend on the order in which the filesystem enumerates files.
jpeg_files = [
    str(image_path)
    for first_level in sorted(Path(annotations).glob('*'))
    for second_level in sorted(first_level.glob('*'))
    for image_path in sorted(second_level.glob('*.jpg'))
]

In [None]:
len(jpeg_files)

In [None]:
# The annotation files sit two directory levels below the root:
#   <annotations>/<level 1>/<level 2>/*.xml
# Every level is sorted, including the innermost glob. A stable order matters
# here because the seeded random mask used for subsampling is applied by
# position, so a filesystem-dependent order would select different files.
xml_files = [
    str(xml_path)
    for first_level in sorted(Path(annotations).glob('*'))
    for second_level in sorted(first_level.glob('*'))
    for xml_path in sorted(second_level.glob('*.xml'))
]

## Subsampling

Reduce the number of `xml` annotation files by a reduction factor $R_f$.
Starting value:

$$
R_f=0.125 \quad \text{which will reduce the amount of files to about 12.5\% of the total}
$$

In [None]:
import numpy as np
# Fraction of the annotation files to keep.
reduce_factor = 0.125
# Seed the global NumPy generator so the mask below is the same on every fresh run.
np.random.seed(0)
# One uniform draw in [0, 1) per file; a file is kept when its draw is below
# reduce_factor, so the sample size is only approximately
# reduce_factor * len(xml_files).
msk = np.random.rand(len(xml_files), ) < reduce_factor
# Apply the boolean mask to the file list and turn the result back into a list.
sample = list(np.array(xml_files)[msk])

In [None]:
len(sample)

In [None]:
sample[:5]

## Annotation Parsing

Parse each `xml` file and store the results as a `pandas.DataFrame`.


In [None]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

In [None]:
def xml_to_csv(xml_files):
    """Parse XML annotation files into one DataFrame with a row per object.

    Elements are looked up by tag name (``size/width``, ``size/height``,
    ``object/name``, ``object/bndbox/xmin`` ...), so files whose children are
    written in a different order are still read correctly.  When a tag is
    absent, the positional layout is used as a fallback: ``size[0]`` and
    ``size[1]`` for width and height, ``object[0]`` for the label and
    ``object[4][0..3]`` for the bounding box.

    Parameters
    ----------
    xml_files : iterable of str or path-like
        Paths of the annotation files to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename, width, height, class, xmin, ymin, xmax, ymax``.
        Files without any ``object`` element contribute no rows.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If a file is not well-formed XML.
    ValueError
        If a size or bounding-box value cannot be converted with ``int()``.
    """
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']

    def _child(parent, tag, position):
        # Prefer the child with the given tag; fall back to the child at `position`.
        node = parent.find(tag)
        # Compare with None explicitly: an Element without children is falsy.
        return parent[position] if node is None else node

    xml_list = []
    for xml_file in xml_files:
        root = ET.parse(xml_file).getroot()
        members = root.findall('object')
        if not members:
            # Nothing annotated in this file: skip it before reading
            # filename/size, which are only needed to build rows.
            continue
        filename = root.find('filename').text
        size = root.find('size')
        width = int(_child(size, 'width', 0).text)
        height = int(_child(size, 'height', 1).text)
        for member in members:
            bndbox = _child(member, 'bndbox', 4)
            xml_list.append((
                filename,
                width,
                height,
                _child(member, 'name', 0).text,
                int(_child(bndbox, 'xmin', 0).text),
                int(_child(bndbox, 'ymin', 1).text),
                int(_child(bndbox, 'xmax', 2).text),
                int(_child(bndbox, 'ymax', 3).text),
            ))
    return pd.DataFrame(xml_list, columns=column_name)

In [None]:
records = xml_to_csv(sample)

Get an idea of which labels are in all the annotations

In [None]:
list(records['class'].unique())

In [None]:
records.head()

In [None]:
records.describe()

In [None]:
%matplotlib inline

Simple statistical description of the sample

In [None]:
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Bar chart of the number of annotations per class label, drawn on an
# explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(20, 10))
class_counts = records['class'].value_counts()
class_counts.plot(kind='bar', ax=ax)
fig.tight_layout()
plt.show()
plt.close()

In [None]:
# import holoviews as hv
# hv.extension('bokeh')
# bars = hv.Bars(records['class'].value_counts(), hv.Dimension('index'), 'class').options(width=900, height=500, xrotation= 38)
# bars

Keep only the `sand dollar` annotations

In [None]:
# Rebinds `records` to the rows labelled "sand dollar"; every later cell works on this filtered frame.
records = records[records['class']=="sand dollar"]

## TensorFlow records


In [None]:
# for converting the csv/pandas dataframe into TFRecord format
# https://stackoverflow.com/questions/41402332/tensorflow-create-a-tfrecords-file-from-csv

import pandas as pd
import tensorflow as tf
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# The function takes in the all of the features of a single annotation instance as a list, and then also the label as its own variable
# it creates a TFRecord (see below cell for how the format looks, similar to XML)

def create_tf_example(features, label):
    """Build a ``tf.train.Example`` describing a single annotation.

    Parameters
    ----------
    features : sequence
        Read by position: ``[0]`` filename, ``[1]`` width, ``[2]`` height,
        ``[3]`` xmin, ``[4]`` ymin, ``[5]`` xmax, ``[6]`` ymax.  The filename
        is UTF-8 encoded; every other entry is passed through ``int()``.
    label : str
        Class label, stored UTF-8 encoded under the ``'class'`` key.

    Returns
    -------
    tf.train.Example
        Example with the keys ``filename, width, height, class, xmin, ymin,
        xmax, ymax``.
    """
    def _bytes_feature(text):
        # Single UTF-8 encoded string wrapped in a BytesList feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.encode('utf-8')]))

    def _int64_feature(number):
        # Single value coerced to int and wrapped in an Int64List feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(number)]))

    feature_map = {
        'filename': _bytes_feature(features[0]),
        'width': _int64_feature(features[1]),
        'height': _int64_feature(features[2]),
        'class': _bytes_feature(label),
        'xmin': _int64_feature(features[3]),
        'ymin': _int64_feature(features[4]),
        'xmax': _int64_feature(features[5]),
        'ymax': _int64_feature(features[6]),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))


In [None]:
# Convert every row of `records` into a serialized tf.train.Example and write
# it to dataset.tfrecords in the current working directory.
# NOTE(review): tf.python_io is the TensorFlow 1.x name; on TensorFlow 2.x the
# writer lives at tf.io.TFRecordWriter -- confirm which version this runs on.
with tf.python_io.TFRecordWriter("dataset.tfrecords") as writer:
    for index, row in records.iterrows():
        # All of the features in the row, minus the class label (position 3).
        # .iloc makes the positional access explicit instead of relying on
        # integer keys being treated as positions on a label-indexed Series.
        # np.array turns the mixed values into strings; create_tf_example
        # converts the numeric ones back with int().
        features = np.array(list(row.iloc[0:3].values) + list(row.iloc[4:].values))
        # just the class label
        label = row.iloc[3]
        # creates a TFRecord example
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# No explicit writer.close(): leaving the `with` block already closes the file.

In [None]:
example

## Train & Test

Split the sample into training ($70\%$) and testing ($30\%$) datasets

In [None]:
import numpy as np

# Draw the split mask from a dedicated, seeded generator so that re-running
# this cell always yields the same train/test partition, no matter how much of
# the global NumPy random stream earlier cells have consumed.
split_rng = np.random.RandomState(0)
# Each row goes to the training set with probability 0.7, so the split is
# approximately (not exactly) 70/30.
# NOTE(review): the split is per annotation row, so boxes from the same image
# can end up in both sets -- confirm whether a per-filename split is wanted.
msk = split_rng.rand(len(records)) < 0.7
train = records[msk]
test = records[~msk]

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Write both splits as CSV files into the current working directory;
# index=False leaves the DataFrame index out of the files.
test.to_csv('test.csv', index=False)
train.to_csv('train.csv', index=False)

In [None]:
!gist test.csv

In [None]:
!gist train.csv

In [None]:
!gist 01_data_preparation.ipynb