Preparing labels and imagery for use with TensorFlow

In [9]:
# Takes a set of images and labels and prepares a directory structure that can be
# fed into Inception v3 (which can handle ImageNet 2012 inputs).
# This involves:
#  - finding the coordinates of the labelled points and the corresponding image,
#  - cropping at most 299 x 299 pixels around each labelled point,
#  - storing each cropped image in a directory named after the type of label.
import pandas as pd
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import string
import glob

%matplotlib inline



In [10]:


#training_path = '/Users/opizarro/training_data/max-woodside'
# Output root: process_sheet() creates one sub-directory per class code under
# this path and writes the cropped patches into them.
training_path = '/Volumes/LZD1601/training_data/benthoz'

def maybe_makedir(dirname, force=False):
  """Create the directory `dirname` (including parents) if needed.

  Args:
    dirname: path of the directory to create.
    force: when False (default) an existing directory is reported and left
      alone. When True the creation is always attempted; an already existing
      directory is tolerated rather than treated as an error.

  Returns:
    None.

  Raises:
    OSError: if the directory cannot be created for a reason other than
      already existing as a directory (e.g. a regular file is in the way).
  """
  if os.path.isdir(dirname) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping making dir' % (dirname))
    return
  print('Making dir %s.' % dirname)
  try:
    os.makedirs(dirname)
  except OSError:
    # makedirs raises when the path already exists (force=True on an existing
    # directory, or another process created it in the meantime). That is fine
    # as long as the path really is a directory; anything else is a failure.
    if not os.path.isdir(dirname):
      raise
  return

# make sure the output root directory exists (no-op message if already present)
maybe_makedir(training_path)




/Volumes/LZD1601/training_data/benthoz already present - Skipping making dir


In [11]:
# annotation CSV that the next cell loads with pandas to preview its contents
survey_sheet = '/Volumes/LZD1601/benthoz2015_flat/annotation-Western Australia 2013-rawdbdump-2016-06-01.csv'

In [12]:
# Load the annotation sheet and preview its first rows (the preview also shows
# the column names). Only the last expression of a cell is displayed, so no
# other bare expressions are placed before it.
df = pd.read_csv(survey_sheet)
df.head()

Unnamed: 0.1,Unnamed: 0,image__id,code,web_location,label_id,modifiers,name,y,x,caab
0,0,2504879,MA,WA201304/r20130405_072727_coralpatches_01_15m_...,39,,Macroalgae,0.289112,0.560769,80300000
1,1,2504879,MA,WA201304/r20130405_072727_coralpatches_01_15m_...,39,,Macroalgae,0.663061,0.806883,80300000
2,2,2504879,MA,WA201304/r20130405_072727_coralpatches_01_15m_...,39,,Macroalgae,0.613801,0.856499,80300000
3,3,2504879,MA,WA201304/r20130405_072727_coralpatches_01_15m_...,39,,Macroalgae,0.954063,0.20704,80300000
4,4,2504879,MALCB,WA201304/r20130405_072727_coralpatches_01_15m_...,45,,Macroalgae: Large canopy-forming: Brown,0.975761,0.073148,80300902


In [13]:
#survey_sheet = '/Users/opizarro/max-woodside/QN01/MSA157-40_QN01.xls'
#images_location = '/Users/opizarro/max-woodside/QN01/Stations'

def process_sheet(survey_sheet, images_location):
    """Crop a square patch around every labelled point listed in a survey sheet.

    For each row of the sheet the image named by the basename of
    `web_location` is looked up in `images_location`, a square window of at
    most 2 * 149 pixels centred on the labelled point is cut out, and the crop
    is written to `<training_path>/<code>/<image>_<x>_<y>.jpg`.

    Relies on the module-level `training_path` and `maybe_makedir`.

    Args:
        survey_sheet: path of the annotation CSV. The columns read are
            `code`, `web_location`, `x` and `y`.
        images_location: directory that contains the image files.

    Returns:
        None. The results are the files written under `training_path`.
    """
    df = pd.read_csv(survey_sheet)

    # 1) make one directory per class code (if it doesn't exist already)
    class_label_set = set(df['code'])
    print(class_label_set)
    class_label_list = list(class_label_set)
    print('Number of classes %i' % len(class_label_list))
    print('Testing...')
    for imclass in class_label_list:
        print('*** imclass %s' % imclass)
        maybe_makedir(os.path.join(training_path, imclass))

    # nominal image size; only used to report images of a different size
    xdim = 1360
    ydim = 1024
    # integer half-width of the crop window (floor division keeps it an int
    # under both Python 2 and Python 3)
    halfsize = (299 - 1) // 2
    image_list = list(set(df['web_location']))
    print("Number of images %i, number of entries %i"
          % (len(image_list), len(df['web_location'])))

    prior_imagename = ''
    image = None

    # 2) cut out the window around each labelled point and
    # 3) save the cropped image into the directory of its class
    for index, entry in df.iterrows():
        imagename = os.path.basename(entry['web_location'])
        fullimagename = os.path.join(images_location, imagename)
        if not os.path.isfile(fullimagename):
            print('**** WARNING: could not find image %s' % fullimagename)
            continue

        if fullimagename != prior_imagename:
            # only read the image if it is a different one from the one we've
            # been using; consecutive rows usually refer to the same image
            image = cv2.imread(fullimagename)
            prior_imagename = fullimagename
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            print('**** WARNING: could not read image %s' % fullimagename)
            continue

        if pd.isnull(entry['x']) or pd.isnull(entry['y']):
            # no usable coordinates for this point
            continue

        height, width = image.shape[:2]
        # check dimensions correspond
        if xdim != width or ydim != height:
            print('WARNING: actual image size and size in database not consistent')

        # x and y are treated as fractions of the image width/height (the
        # sheet stores values between 0 and 1), so scale by the size of the
        # image actually loaded. int() is needed because round() may return a
        # float, which is neither a valid slice index nor wanted in file names.
        x = int(round(entry['x'] * width))
        y = int(round(entry['y'] * height))

        # largest half-width that keeps the window inside the image
        hs = min(halfsize, x, width - x, y, height - y)
        # require more than 40 pixels on each side to have some context
        if hs > 40:
            crop_image = image[y - hs:y + hs, x - hs:x + hs]

            # save cropped image in corresponding directory
            imlabel = entry['code']
            crop_name = imagename + '_' + str(x) + '_' + str(y) + '.jpg'
            fullcrop_name = os.path.join(training_path, imlabel, crop_name)
            cv2.imwrite(fullcrop_name, crop_image)

            if index % 100 == 0:
                print("processing entry " + str(index) + "\r")
            
    
    


In [16]:
# each campaign folder in benthoz2015 has a csv file and the related images

base_location = '/Volumes/LZD1601/benthoz2015'

campaign_list=os.listdir(base_location)
for campaign in campaign_list:
    full_campaign=os.path.join(base_location,campaign)
    if os.path.isdir(full_campaign):
        sheet_pattern=os.path.join(full_campaign,'*.csv')
        print(sheet_pattern)
        # glob returns a list of matching paths (possibly empty)
        image_sheet=glob.glob(sheet_pattern)
        if len(image_sheet)>0:
            for sheet in image_sheet:
                # process_sheet expects a single csv path, and the images sit
                # in the campaign folder itself, so pass its full path rather
                # than the bare folder name
                process_sheet(sheet, full_campaign)
        else:
            print('no csv found in %s' % campaign)
    else:
        print('%s is not a folder' % campaign)
        

    
   

BENTHOZ-2015-imagelist.csv is not a folder
._BENTHOZ-2015-imagelist.csv is not a folder
/Volumes/LZD1601/benthoz2015/PS201012/*.csv
no csv found in PS201012
/Volumes/LZD1601/benthoz2015/Batemans201011/*.csv
no csv found in Batemans201011
/Volumes/LZD1601/benthoz2015/SolitaryIs201208/*.csv
no csv found in SolitaryIs201208
/Volumes/LZD1601/benthoz2015/PS201211/*.csv
no csv found in PS201211
/Volumes/LZD1601/benthoz2015/Batemans201211/*.csv
no csv found in Batemans201211
/Volumes/LZD1601/benthoz2015/SEQueensland201010/*.csv
no csv found in SEQueensland201010
/Volumes/LZD1601/benthoz2015/Tasmania200810/*.csv
no csv found in Tasmania200810
/Volumes/LZD1601/benthoz2015/Tasmania200903/*.csv
no csv found in Tasmania200903
/Volumes/LZD1601/benthoz2015/Tasmania200906/*.csv
no csv found in Tasmania200906
/Volumes/LZD1601/benthoz2015/WA201104/*.csv
no csv found in WA201104
/Volumes/LZD1601/benthoz2015/WA201204/*.csv
no csv found in WA201204
/Volumes/LZD1601/benthoz2015/WA201304/*.csv
no csv found 