In [2]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import imgaug as ia
ia.seed(1)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa 
from PIL import Image
import imageio
import pandas as pd
import numpy as np
import re
import os
import glob

import xml.etree.ElementTree as ET
import shutil


In [5]:
def _xml_child(parent, tag, position):
    """Return the child of `parent` named `tag`, or the child at index `position` if no such tag exists."""
    node = parent.find(tag)
    # Compare against None explicitly: an Element without children is falsy.
    return node if node is not None else parent[position]


def process_folder(folder_path):
    """
    Build a DataFrame with one row per annotated object found in the .xml files of a folder.

    folder_path needs to be a string. A trailing slash is optional and a leading '~' is
    expanded to the user's home directory. In jupyter, tab-autocomplete to avoid confusion.
    Examples: './example/', '~/User/my/folder/is/here/'

    Each .xml file is expected to contain a 'filename' element, a 'size' element
    (width, height) and any number of 'object' elements, each holding a 'name' and a
    'bndbox' (xmin, ymin, xmax, ymax). Children are looked up by tag name; when a tag is
    absent the original positional layout is used instead (name first, bndbox fifth,
    coordinates in xmin/ymin/xmax/ymax order).

    Returns a DataFrame with the columns filename, width, height, class, xmin, ymin, xmax,
    ymax and is_tent (1 where class == 'tent', otherwise 0). Rows follow the sorted order
    of the .xml file paths. The frame is empty when the folder holds no annotated objects.
    """
    folder = os.path.expanduser(folder_path)
    print(f'Building datafram from {folder_path} contents.')

    # Only the number of images is reported, so the .png files of the requested folder
    # are counted by path instead of being decoded into memory.
    image_paths = glob.glob(os.path.join(folder, '*.png'))
    print(f'Library contains {len(image_paths)} images')

    # Sorted so that the row order does not depend on the file system's listing order.
    xml_paths = sorted(glob.glob(os.path.join(folder, '*.xml')))

    xml_list = []
    for xml_file in xml_paths:
        root = ET.parse(xml_file).getroot()
        for member in root.findall('object'):
            # filename/size are read per object (as before), so a file without any
            # 'object' element is skipped without needing those elements.
            size = root.find('size')
            box = _xml_child(member, 'bndbox', 4)
            contents = (root.find('filename').text,
                        int(_xml_child(size, 'width', 0).text),
                        int(_xml_child(size, 'height', 1).text),
                        _xml_child(member, 'name', 0).text,
                        int(_xml_child(box, 'xmin', 0).text),
                        int(_xml_child(box, 'ymin', 1).text),
                        int(_xml_child(box, 'xmax', 2).text),
                        int(_xml_child(box, 'ymax', 3).text)
                        )
            xml_list.append(contents)

    # lets user know how many annotation files were processed
    print(f'Processing {len(xml_paths)} .xml files')

    # sets up the df
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    df = pd.DataFrame(xml_list, columns=column_name)

    # this is the target feature, change as necessary
    df['is_tent'] = (df['class'] == 'tent').astype(int)
    return df

In [8]:
# Parse every annotation in the sample folder into one row per labelled object.
df = process_folder(folder_path='./Pete_sample/')

Building datafram from ./Pete_sample/ contents.
Library contains 4 images
Processing 4 .xml files


Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax,is_tent
0,Legoane Haiti 2010 earthq w:lat and long from ...,0,0,tent,812,829,894,897,1
1,Legoane Haiti 2010 earthq w:lat and long from ...,0,0,tent,698,802,743,841,1
2,Legoane Haiti 2010 earthq w:lat and long from ...,0,0,tent,261,590,327,660,1
3,Legoane Haiti 2010 earthq w:lat and long from ...,0,0,tent,104,203,213,267,1
4,Legoane Haiti 2010 earthq w:lat and long from ...,0,0,tent,104,276,155,324,1


In [10]:
# Keep only the four bounding-box corner columns of each annotation row.
box_columns = ['xmin', 'ymin', 'xmax', 'ymax']
bboxes = df[box_columns]

In [18]:
# Label and source-image columns, taken from the same rows as the boxes.
classes, filenames = df['class'], df['filename']

In [19]:
# Bounding-box coordinates as a plain array, one row per annotated object.
bbs = np.array(bboxes)

# Binarize the class labels so they can be stored next to the box coordinates.
encoder = LabelBinarizer()
classes_onehot = encoder.fit_transform( classes )

# Targets: the four box coordinates followed by the encoded class column(s).
Y = np.concatenate( [ bbs , classes_onehot ] , axis=1 )
# Inputs: the filename recorded for each annotation row.
X = np.array( filenames )

# Hold out 10% of the rows. The fixed random_state makes the split repeatable across
# kernel restarts (ia.seed above only seeds imgaug, not scikit-learn).
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.1, random_state=1 )