In [41]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
import imgaug as ia
ia.seed(1)
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
%matplotlib inline
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa 
from PIL import Image
import imageio
import pandas as pd
import numpy as np
import re
import os
import glob

import xml.etree.ElementTree as ET
import shutil

Using TensorFlow backend.


In [42]:
def process_folder(folder_path, target_class=1):
    """
    Build a dataframe of bounding-box annotations from the .xml files in a folder.

    Each <object> element found in each .xml file becomes one row.

    Parameters
    ----------
    folder_path : str
        Folder holding the .png images and their .xml annotation files. A
        trailing slash is optional and a leading '~' is expanded.
        Examples: './example/',
        '~/User/dsi/group_project/tent_satellite_imagery/tents/train/images'
    target_class : int or str, default 1
        Class label that counts as the positive ("tent") class. Labels read
        from XML are text, so the comparison is made on the string form.

    Returns
    -------
    pandas.DataFrame
        Columns: filename, width, height, class, xmin, ymin, xmax, ymax and
        the boolean target column is_tent.
    """
    folder = os.path.expanduser(folder_path)
    print(f'Building dataframe from {folder_path} contents.')

    # Only the number of images is reported, so the files are counted rather
    # than decoded; look in the requested folder, not a hard-coded one.
    image_paths = glob.glob(os.path.join(folder, '*.png'))
    print(f'Library contains {len(image_paths)} images')

    # Sorted so the row order is reproducible between runs and machines.
    xml_paths = sorted(glob.glob(os.path.join(folder, '*.xml')))

    xml_list = []
    for xml_file in xml_paths:
        root = ET.parse(xml_file).getroot()
        for member in root.findall('object'):
            # Children are read by position: size[0]/size[1] fill the width and
            # height columns, member[0] the class label, member[4][0..3] the box.
            # NOTE(review): assumes every annotation uses this child order
            # (looks like Pascal VOC) -- confirm against the labelling tool.
            size = root.find('size')
            box = member[4]
            contents = (root.find('filename').text,
                        int(size[0].text),
                        int(size[1].text),
                        member[0].text,
                        int(box[0].text),
                        int(box[1].text),
                        int(box[2].text),
                        int(box[3].text)
                        )
            xml_list.append(contents)

    # lets user know how many annotation files were processed
    print(f'Processing {len(xml_paths)} .xml files')

    # sets up the df
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    df = pd.DataFrame(xml_list, columns=column_name)

    # this is the target feature, change target_class as necessary.
    # 'class' holds XML text, so comparing it with the int 1 was always False;
    # compare string forms instead.
    df['is_tent'] = (df['class'].astype(str) == str(target_class))
    return df


In [43]:
# Parse every annotation under ./train/ into one row per labelled object.
df = process_folder(folder_path='./train/')

Building dataframe from ./train/ contents.
Library contains 2 images
Processing 2 .xml files


In [44]:
# Box corner coordinates for each row of df, in (xmin, ymin, xmax, ymax) order.
bboxes = df[
    [
        'xmin', 'ymin',
        'xmax', 'ymax',
    ]
]

In [45]:
# Per-row class label and source image filename, taken from df.
classes, filenames = df['class'], df['filename']

In [54]:
bbs = np.array(bboxes)
encoder = LabelBinarizer()
classes_onehot = encoder.fit_transform( classes )

Y = np.concatenate( [ bboxes , classes_onehot ] , axis=1 )
X = np.array( filenames )

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.1 )