Preparing labels and imagery for use with TensorFlow

In [1]:
# Takes a set of images and labels and prepares a directory structure that can be
# fed into Inception v3 (which can handle ImageNet 2012 inputs).
# This involves:
#  - finding the coordinates of the labelled points and the corresponding image,
#  - cropping at most 299 x 299 pixels around each labelled point,
#  - storing each cropped image in a directory named after the type of label.
import pandas as pd
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Root directory under which the cropped images are written.
# Set the MAX_WOODSIDE_TRAINING_PATH environment variable to point somewhere else;
# when it is unset the original external-volume location is used.
#training_path = '/Users/opizarro/training_data/max-woodside'
training_path = os.environ.get('MAX_WOODSIDE_TRAINING_PATH',
                               '/Volumes/LZD1601/training_data/max-woodside')

def maybe_makedir(dirname, force=False):
  """Create directory `dirname` (and any missing parents) if it is not there yet.

  Args:
    dirname: path of the directory to create.
    force: when True, do not take the "already present" shortcut even if the
      directory exists. An existing directory is left untouched.

  Returns:
    None.
  """
  if os.path.isdir(dirname) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping making dir' % (dirname))
    return
  print('Making dir %s.' % dirname)
  # os.makedirs raises OSError when the directory already exists, which is
  # exactly the case force=True is meant for, so only create what is missing.
  if not os.path.isdir(dirname):
    os.makedirs(dirname)
  return

# Make sure the root output directory exists.
maybe_makedir(training_path)

/Volumes/LZD1601/training_data/max-woodside already present - Skipping making dir


In [3]:
#survey_sheet = '/Users/opizarro/max-woodside/QN01/MSA157-40_QN01.xls'
#images_location = '/Users/opizarro/max-woodside/QN01/Stations'

def process_sheet(survey_sheet, images_location, debug_plots=False):
    """Cut a labelled patch out of the source image for every row of a survey sheet.

    For each row of the sheet:
      1) a directory named after the row's Species label is made under
         `training_path` (if it doesn't exist already),
      2) a square window of 2*hs pixels per side (hs <= 149, so at most 298)
         around the row's X/Y coordinates is cut out of the image,
      3) the cropped image is saved into the corresponding directory as
         <PicName>_<x>_<y>.jpg.

    Rows whose window would have hs <= 40 (point too close to an image border)
    are skipped, as are rows whose image cannot be read.

    Args:
        survey_sheet: path of the Excel sheet; it must provide 'Species' and
            'PicName' columns.
        images_location: directory containing the '<PicName>.JPG' images.
        debug_plots: when True, show the source image and the crop with
            matplotlib for every saved point not labelled 'Sand' or 'Turf'.

    Returns:
        None. Progress and warnings are printed; crops are written to disk.
    """
    df = pd.read_excel(survey_sheet)

    class_label_set = set(df['Species'])
    print(class_label_set)
    print('Number of classes %i' % len(class_label_set))

    for imclass in class_label_set:
        maybe_makedir(os.path.join(training_path, imclass))

    suffix = '.JPG'
    # Floor division keeps this an int on Python 3 as well (needed for slicing).
    halfsize = (299 - 1) // 2
    print("Number of images %i, number of entries %i"
          % (len(set(df['PicName'])), len(df['PicName'])))

    prior_imagename = ''
    image = None

    for index, entry in df.iterrows():
        # read image
        imagename = entry.PicName + suffix
        fullimagename = os.path.join(images_location, imagename)
        if fullimagename != prior_imagename:
            # only read image if it's a different one from the one we've been using
            image = cv2.imread(fullimagename)
            prior_imagename = fullimagename
            if image is None:
                # cv2.imread signals a missing/unreadable file by returning None
                print('WARNING: could not read image %s - skipping its entries'
                      % fullimagename)
        if image is None:
            continue

        # read label
        imlabel = entry.Species

        # find centre point and the image size recorded in the sheet
        # NOTE(review): columns are addressed by position (9, 10 -> x, y and
        # 3, 4 -> width, height); confirm this matches every sheet's layout.
        x = entry.iloc[9]
        y = entry.iloc[10]
        xdim = entry.iloc[3]
        ydim = entry.iloc[4]
        # check dimensions correspond
        if xdim != image.shape[1] or ydim != image.shape[0]:
            print('WARNING: actual image size and size in database not consistent')

        # largest half-width that keeps the window inside the recorded image size
        hs = min(x, y, halfsize, xdim - x, ydim - y)
        # require more than 40 pixels either side to have some context
        if hs > 40:
            crop_image = image[y-hs:y+hs, x-hs:x+hs]

            # save cropped image in corresponding directory
            crop_name = entry.PicName + '_' + str(x) + '_' + str(y) + '.jpg'
            fullcrop_name = os.path.join(training_path, imlabel, crop_name)
            if not cv2.imwrite(fullcrop_name, crop_image):
                print('WARNING: could not write crop %s' % fullcrop_name)

            if debug_plots and imlabel != 'Sand' and imlabel != 'Turf':
                cutstr = ('this dot %s has label %s') % (imagename, imlabel)
                titstr = ('x %i, y %i, xdim %i, ydim %i, hs %i') % (x, y, xdim, ydim, hs)
                plt.figure(1)
                plt.imshow(image)
                plt.title(titstr)
                plt.figure(2)
                plt.imshow(crop_image)
                plt.title(cutstr)

            if index % 1000 == 0:
                print("processing entry " + str(index) + "\r")

In [7]:
#survey_sheet = '/Users/opizarro/max-woodside/QN01/MSA157-40_QN01.xls'
#images_location = '/Users/opizarro/max-woodside/QN01/Stations'
# QN02 has a different directory and file structure
# QN08 has the same images as QN09 with the names of QN09
#transects = {'QN01','QN03','QN04','QN05','QN06','QN07','QN08','QN09','QN10','QN11','QN12'}
#transects = {'QN03','QN05','QN06','QN07','QN08','QN09','QN10','QN11','QN12'}
# A list (not a set) so the transects are processed in a fixed, repeatable order.
transects = ['QN05', 'QN06', 'QN07']
data_root = '/Volumes/LZD1601/max-woodside'
for tr in transects:
    # each transect directory holds its survey sheet and a 'Stations' image directory
    survey_sheet = os.path.join(data_root, tr, 'MSA157-40_' + tr + '.xlsx')
    images_location = os.path.join(data_root, tr, 'Stations')
    print(survey_sheet)
    process_sheet(survey_sheet, images_location)

/Volumes/LZD1601/max-woodside/QN05/MSA157-40_QN05.xlsx
set([u'Halophila spinulosa', u'Unknown', u'Turbinaria', u'Padina', u'Upright Fauna', u'Halophila', u'Halimeda', u'Non-Coral Fauna', u'Green Alga', u'Faviid', u'RedBrown Alga', u'Seagrass', u'Flora', u'Mobile', u'Sand', u'Rock', u'Turf', u'Alga', u'Ascidian', u'Rubble', u'Soft Coral', u'Sponge', u'Porites', u'Gorgonian', u'Foraminifera'])
Number of classes 25
/Volumes/LZD1601/training_data/max-woodside/Halophila spinulosa already present - Skipping making dir
/Volumes/LZD1601/training_data/max-woodside/Unknown already present - Skipping making dir
/Volumes/LZD1601/training_data/max-woodside/Turbinaria already present - Skipping making dir
/Volumes/LZD1601/training_data/max-woodside/Padina already present - Skipping making dir
/Volumes/LZD1601/training_data/max-woodside/Upright Fauna already present - Skipping making dir
/Volumes/LZD1601/training_data/max-woodside/Halophila already present - Skipping making dir
/Volumes/LZD1601/train