In [1]:
import multiprocess as mp

from glob import glob
import re
import pandas as pd
import numpy as np
import dill

import cv2

## Data import

Separate out the large (test) images from the main library for faster development work.

In [2]:
# Load the pickled table describing the large (test) label images.
large_images = pd.read_pickle(
    '../pkl/20_wine_label_analysis_large_labels.pkl'
)
large_images.shape

(4117, 5)

In [3]:
# Preview the first five rows of the large-image table.
large_images.head(n=5)

Unnamed: 0,height,image_name,width,area,basename
150,1200,../images/snooth_dot_com_151.png,874,1048800,snooth_dot_com_151
176,883,../images/snooth_dot_com_177.jpeg,1466,1294478,snooth_dot_com_177
227,897,../images/snooth_dot_com_228.jpeg,1254,1124838,snooth_dot_com_228
283,1123,../images/snooth_dot_com_284.jpeg,974,1093802,snooth_dot_com_284
405,1032,../images/snooth_dot_com_406.jpeg,1260,1300320,snooth_dot_com_406


In [4]:
all_images = pd.read_pickle(
    '../pkl/20_wine_label_analysis_all_labels.pkl'
)
# Keep only the rows whose basename is not already in the large-image table.
mask = ~all_images['basename'].isin(large_images['basename'])
all_images = all_images[mask]
all_images.shape

(47247, 5)

## RootSIFT Features

Compute RootSIFT descriptors for every label image and pickle them under `../sift/`.

In [44]:
class RootSIFT(object):
    """SIFT descriptor extractor that applies the RootSIFT (Hellinger kernel) transform.

    Adapted from
    http://www.pyimagesearch.com/2015/04/13/implementing-rootsift-in-python-and-opencv/
    """

    def __init__(self):
        """Create the underlying OpenCV SIFT descriptor extractor."""
        self.extractor = cv2.DescriptorExtractor_create("SIFT")

    def compute(self, image, kps, eps=1e-7):
        """Compute RootSIFT descriptors for the keypoints ``kps`` on ``image``.

        Parameters
        ----------
        image : passed unchanged to the wrapped extractor.
        kps : keypoints at which descriptors are computed.
        eps : float
            Small constant added to each row sum so the normalisation
            never divides by zero.

        Returns
        -------
        tuple
            ``(keypoints, descriptors)`` as returned by the extractor, with
            the descriptors transformed; ``([], None)`` when the extractor
            yields no keypoints.
        """
        keypoints, descriptors = self.extractor.compute(image, kps)

        # Nothing to describe: hand back an empty result.
        if not len(keypoints):
            return ([], None)

        # Hellinger kernel: L1-normalise every row (in place), then take the
        # element-wise square root.
        row_totals = descriptors.sum(axis=1, keepdims=True)
        descriptors /= (row_totals + eps)
        rooted = np.sqrt(descriptors)
        # A final L2 normalisation step is left disabled:
        #rooted /= (np.linalg.norm(rooted, axis=1, ord=2) + eps)

        return (keypoints, rooted)

In [None]:
def resize_image(image, height=None, width=None):
    """Resize ``image`` to a target height or width, keeping the aspect ratio.

    Parameters
    ----------
    image : array whose ``shape[:2]`` is ``(rows, cols)``.
    height : int, optional
        Target height in pixels. Takes precedence when both ``height`` and
        ``width`` are given.
    width : int, optional
        Target width in pixels; used only when ``height`` is None.

    Returns
    -------
    The result of ``cv2.resize`` using ``cv2.INTER_AREA`` interpolation.

    Raises
    ------
    ValueError
        If neither ``height`` nor ``width`` is supplied.
    """
    if height is None and width is None:
        # Without a target the size tuple was never assigned, which surfaced
        # as a confusing UnboundLocalError further down.
        raise ValueError('resize_image requires either height or width')

    (orig_height, orig_width) = image.shape[:2]
    # Floats keep the ratio from being truncated by Python 2 integer division.
    orig_height = float(orig_height)
    orig_width = float(orig_width)

    if height is not None:
        ratio = height / orig_height
        dim = (int(orig_width * ratio), height)
    else:
        ratio = width / orig_width
        dim = (width, int(orig_height * ratio))

    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
    return resized

In [90]:
def get_sift_features(image_path_df):
    """Extract RootSIFT descriptors for a batch of images and pickle them.

    For every image the function reads the file, resizes it to a width of
    320 pixels, converts it to grayscale, detects keypoints and computes
    RootSIFT descriptors. Two files are written next to each other, in the
    directory obtained by replacing ``images`` with ``sift`` in the image's
    directory:

    * ``<stem>.pkl``    -- the descriptor array, serialised with dill.
    * ``<stem>_df.pkl`` -- a DataFrame of the descriptors plus the keypoint
      coordinates in columns ``kp0`` and ``kp1``.

    Parameters
    ----------
    image_path_df : pandas.DataFrame or iterable of str
        Either a DataFrame with an ``image_name`` column or a plain iterable
        of image paths.

    Returns
    -------
    None
        Images that cannot be processed are reported on stdout and skipped.
    """
    import os

    # Iterating a DataFrame yields its column labels, so pull the paths out
    # explicitly when a DataFrame chunk is passed in.
    if isinstance(image_path_df, pd.DataFrame):
        image_paths = image_path_df['image_name'].values
    else:
        image_paths = image_path_df

    sift = RootSIFT()
    feat_detector = cv2.FeatureDetector_create('SURF') # test SIFT, SURF

    for in_path in image_paths:
        try:
            image = cv2.imread(in_path)
            if image is None:
                # cv2.imread signals an unreadable file by returning None.
                raise IOError('image could not be read')
            image = resize_image(image, width=320)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            kps = feat_detector.detect(image)
            (kps, descriptors) = sift.compute(image, kps)
            if descriptors is None or len(kps) == 0:
                # Never write feature files for an image without keypoints.
                raise ValueError('no keypoints detected')
            kp = np.array([x.pt for x in kps])

            # Build the output names from the path components so that only
            # the directory and the extension are rewritten, never the stem.
            base, _ext = os.path.splitext(in_path)
            directory, stem = os.path.split(base)
            out_base = os.path.join(directory.replace('images', 'sift'), stem)

            out_path = out_base + '.pkl'
            # Pickles are binary data; open the file accordingly.
            with open(out_path, 'wb') as fh:
                dill.dump(descriptors, fh)

            out_path_df = out_base + '_df.pkl'
            descriptors_df = pd.DataFrame(descriptors)
            descriptors_df['kp0'] = kp[:,0]
            descriptors_df['kp1'] = kp[:,1]
            descriptors_df.to_pickle(out_path_df)
        except Exception as exc:
            # Best effort: report the failing image with its reason and go on.
            print('%s (%s: %s)' % (in_path, type(exc).__name__, exc))

    return

In [24]:
nthreads = 48
pool = mp.Pool(processes=nthreads)
try:
    # Hand each worker an array of image paths: iterating a DataFrame chunk
    # directly would yield its column labels instead of the file names.
    pool.map(get_sift_features,
             np.array_split(large_images['image_name'].values, nthreads))
finally:
    # Release the worker processes even if a chunk fails.
    pool.close()
    pool.join()

In [82]:
nthreads = 48
pool = mp.Pool(processes=nthreads)
try:
    # Hand each worker an array of image paths: iterating a DataFrame chunk
    # directly would yield its column labels instead of the file names.
    pool.map(get_sift_features,
             np.array_split(all_images['image_name'].values, nthreads))
finally:
    # Release the worker processes even if a chunk fails.
    pool.close()
    pool.join()

Find images whose feature files are still missing and re-run feature extraction for them.

In [83]:
# Descriptor DataFrames already on disk, mapped back to their plain
# '<basename>.pkl' names.
existing_pkl = glob('../sift/*.pkl')
existing_pkl = np.array([x.replace('_df','') for x in existing_pkl if '_df.' in x])
# Feature file expected for every row of all_images.
output_pkl = all_images.basename.apply(lambda x: '../sift/' + x + '.pkl').values

print('%d %d' % (len(output_pkl), len(existing_pkl)))

# True where the expected feature file does not exist yet.
mask = np.invert(np.in1d(output_pkl, existing_pkl))
print(len(mask))

# Turn each missing feature path into a glob pattern for its source image;
# the image extension cannot be derived from the basename alone.
missing = output_pkl[mask]
missing = np.array([x.replace('sift','images').replace('pkl','*') for x in missing])
print(len(missing))

missing_images = []
for pattern in missing:
    matches = glob(pattern)
    if matches:
        missing_images.append(matches[0])
    else:
        # An unmatched pattern used to raise IndexError and abort the cell.
        print('no image found for %s' % pattern)

47247 51019
47247
345


In [91]:
nthreads = 16
pool = mp.Pool(processes=nthreads)
try:
    pool.map(get_sift_features, np.array_split(np.array(missing_images), nthreads))
finally:
    # Release the worker processes even if a chunk fails.
    pool.close()
    pool.join()

../images/snooth_dot_com_21815.jpeg


In [92]:
# Shell escape: echoes the command text `pushover 'sift finished'` and pipes
# it into the shell named after the pipe (presumably a completion
# notification -- `pushover` is not defined in this notebook).
# NOTE(review): this cell targets /usr/bin/zsh3 while the final notification
# cell uses /usr/bin/zsh -- confirm that this interpreter path is intended.
! echo "pushover 'sift finished'" | /usr/bin/zsh3

Create HDF5 data storage of feature data.

In [8]:
# Alternative configuration that builds a store for the large images only:
# st = pd.HDFStore('../data/features_large_images.h5', mode='w')
# combined_images = large_images

combined_images = pd.concat([large_images, all_images], axis=0)

# Running row offset into the 'features' / 'keypoints' tables.
beg = 0

basename_list = list()
index_list = list()
imagepath_list = list()

st = pd.HDFStore('../data/features.h5', mode='w')
try:
    for row,dat in combined_images.iterrows():
        basename = dat.basename

        try:
            df = pd.read_pickle('../sift/' + basename + '_df.pkl')
            end = beg+df.shape[0]

            # Split keypoint coordinates from the descriptor columns.
            kp = df[['kp0','kp1']]
            df = df.drop(['kp0','kp1'], axis=1)

            # Give both tables the same global row numbers [beg, end).
            index = pd.Index(np.arange(beg, end))
            df = df.set_index(index)
            kp = kp.set_index(index)

            st.append('features', df, index=False)
            st.append('keypoints', kp, index=False)

            basename_list.append(basename)
            index_list.append([beg, end])
            imagepath_list.append(dat.image_name)

            # Advance the offset only after both tables were written.
            beg = end
        except Exception as exc:
            # Best effort: report the image with its reason and continue.
            print('%s (%s: %s)' % (basename, type(exc).__name__, exc))

    st.append('basename', pd.Series(basename_list))
    st.append('image_path', pd.Series(imagepath_list))
    # reshape keeps the two-column layout even when no image was stored.
    st.append('index', pd.DataFrame(np.array(index_list).reshape(-1, 2), columns=['beg','end']))
finally:
    # Close the store even if something above raises.
    st.close()

In [7]:
# Shell escape: echoes the command text
# `pushover 'hdf5 feature database creation finished'` and pipes it into
# /usr/bin/zsh (presumably a completion notification -- `pushover` is not
# defined in this notebook).
! echo "pushover 'hdf5 feature database creation finished'" | /usr/bin/zsh