In [69]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import imgaug as ia
ia.seed(1)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from imgaug import augmenters as iaa 
from PIL import Image
import imageio
import pandas as pd
import numpy as np
import re
import os
import glob
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, f1_score
import xml.etree.ElementTree as ET
import shutil
from sklearn.ensemble import RandomForestClassifier

In [70]:
def process_folder(folder_path):
    """
    folder path notation needs to be in string format. In jupyter, tab-autocomplete to avoid confusion. Examples: './example/',
    '~/User/my/folder/is/here/'
    
    """
    #starting with a blank image list. We will then append that with each png file. 
    print(f'Building datafram from {folder_path} contents.')
   
    images = []
    for index, file in enumerate(glob.glob('./Pete_sample/*.png')):
        images.append(imageio.imread(file))

    print(f'Library contains {len(images)} images')

    xml_list = []
    for xml_file in glob.glob(f'{folder_path}*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for member in root.findall('object'):
            contents = (root.find('filename').text,
                     int(root.find('size')[0].text),
                     int(root.find('size')[1].text),
                     member[0].text,
                     int(member[4][0].text),
                     int(member[4][1].text),
                     int(member[4][2].text),
                     int(member[4][3].text)
                     )
            xml_list.append(contents)
            
    #lets user know how many images were processed
    print(f'Processing {len(images)} .xml files') 
    
#     sets up and df
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    df = pd.DataFrame(xml_list, columns=column_name)
    
#     this is the target feature, change as necessary
    df['is_tent'] = (df['class']=='tent').astype(int)
    return df

In [71]:
df = process_folder('./label_img_pics/train/')

Building datafram from ./label_img_pics/train/ contents.
Library contains 4 images
Processing 4 .xml files


In [72]:
bboxes = df[['xmin', 'ymin', 'xmax', 'ymax']]

In [73]:
classes = df['class']
tent = df['is_tent']

In [78]:
bbs = np.array(bboxes)
encoder = LabelBinarizer()
classes_onehot = encoder.fit_transform( classes )

X = np.concatenate( [ bboxes ] , axis=1 )
Y = np.array(tent)

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.1 )

In [79]:
X_train[1]

array([1309,  299, 1390,  361])

In [80]:
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,

In [81]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print(f'Score on training set: {rfc.score(X_train, y_train)}')
print(f'Score on testing set: {rfc.score(X_test, y_test)}')

rfc_preds_train = rfc.predict(X_train)
rfc_preds_test = rfc.predict(X_test)
print(f'Score on training set: {f1_score(y_train, rfc_preds_train)}')
print(f'Score on testing set: {f1_score(y_test, rfc_preds_test)}')

Score on training set: 1.0
Score on testing set: 0.8461538461538461
Score on training set: 1.0
Score on testing set: 0.8928571428571429


In [83]:
df

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax,is_tent
0,image_1,2358,1350,tent,180,417,220,456,1
1,image_1,2358,1350,tent,264,453,306,507,1
2,image_1,2358,1350,tent,313,494,364,563,1
3,image_1,2358,1350,tent,418,500,484,556,1
4,image_1,2358,1350,tent,377,650,427,704,1
...,...,...,...,...,...,...,...,...,...
377,image_7.png,2358,1350,grass,1790,812,1856,888,0
378,image_7.png,2358,1350,grass,1566,824,1647,895,0
379,image_7.png,2358,1350,road,444,256,529,484,0
380,image_7.png,2358,1350,road,427,14,485,229,0
