In [2]:
from glob import glob
import os
import re

import cv2
import dill
import multiprocess as mp
import numpy as np
import pandas as pd

from image_features import feature_detect_extract

## Data import

Separate out the large (test) images from the main library for faster development work.

In [3]:
# Load the table describing the large (test) label images and show its
# (rows, columns) shape as a quick sanity check.
large_images = pd.read_pickle(
    '../priv/pkl/20_wine_label_analysis_large_labels.pkl'
)
large_images.shape

(4117, 5)

In [5]:
# Load the full label table, then keep only the rows whose 'basename' does
# not appear in `large_images`, so the two tables do not overlap.
all_images = pd.read_pickle(
    '../priv/pkl/20_wine_label_analysis_all_labels.pkl'
)
mask = ~all_images['basename'].isin(large_images['basename'])
all_images = all_images[mask]
all_images.shape

(47247, 5)

## Get image features

Determine features using SIFT.

In [6]:
def get_sift_features(image_path_df):
    """Compute SIFT keypoints/descriptors for a batch of images and pickle them.

    Parameters
    ----------
    image_path_df : pandas.DataFrame or iterable of str
        Either a DataFrame with an 'image_name' column holding image paths,
        or a plain sequence/array of image paths.

    For each image two files are written. Their location is the image's
    directory with 'images' replaced by 'sift'; their name is the image file
    name without its extension:

    * ``<name>.pkl``    -- dill dump of the descriptor array
    * ``<name>_df.pkl`` -- DataFrame of the descriptors plus the keypoint
      coordinates in columns 'kp0' and 'kp1'

    Processing is best-effort: an image that fails is reported on stdout
    (path and error) and the loop moves on to the next image.

    Returns
    -------
    None
    """
    # Accept both calling conventions used in this notebook: DataFrame chunks
    # and bare arrays of paths.
    if isinstance(image_path_df, pd.DataFrame):
        image_paths = image_path_df['image_name'].values
    else:
        image_paths = image_path_df

    feat_detector = cv2.FeatureDetector_create('SIFT') # test SIFT, SURF
    feat_extractor = cv2.DescriptorExtractor_create("SIFT")

    for in_path in image_paths:
        try:
            kps, descs = feature_detect_extract(in_path, feat_detector, feat_extractor)

            # The column slicing below needs a 2-D keypoint array, i.e. at
            # least one keypoint. Treat an image without detections as a
            # failure up front instead of leaving partial output behind.
            if descs is None or len(kps) == 0:
                raise ValueError('no keypoints detected')
            kp = np.array([x.pt for x in kps])

            # Build the output location from path components so that only the
            # directory is rewritten and only the real extension is dropped.
            image_dir, image_file = os.path.split(in_path)
            out_root = os.path.join(image_dir.replace('images', 'sift'),
                                    os.path.splitext(image_file)[0])

            out_path = out_root + '.pkl'
            # dill writes a binary stream, so the file must be opened in 'wb'.
            with open(out_path, 'wb') as fh:
                dill.dump(descs, fh)

            out_path_df = out_root + '_df.pkl'
            descriptors_df = pd.DataFrame(descs)
            descriptors_df['kp0'] = kp[:,0]
            descriptors_df['kp1'] = kp[:,1]
            descriptors_df.to_pickle(out_path_df)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and report why the image failed, not only which one.
            print('%s: %s' % (in_path, exc))

    return

In [7]:
# Extract SIFT features for the large images in parallel: the table is split
# into one chunk per worker process and every chunk is handed to
# get_sift_features.
nthreads = 48
pool = mp.Pool(processes=nthreads)
try:
    pool.map(get_sift_features, np.array_split(large_images, nthreads))
finally:
    # Always release the worker processes, even if a chunk raised.
    pool.close()
    pool.join()

In [82]:
# Extract SIFT features for the remaining (non-large) images in parallel,
# one chunk of the table per worker process.
nthreads = 48
pool = mp.Pool(processes=nthreads)
try:
    pool.map(get_sift_features, np.array_split(all_images, nthreads))
finally:
    # Always release the worker processes, even if a chunk raised.
    pool.close()
    pool.join()

Find the images whose SIFT output files are still missing and re-run the feature extraction for them.

In [83]:
# Find the images in `all_images` for which no "<basename>_df.pkl" file exists
# in the SIFT output directory.
#
# The comparison is done on basenames rather than on full path strings, so it
# does not depend on which directory prefix is used to build a path.
df_suffix = '_df.pkl'
existing_pkl = glob('../priv/sift/*' + df_suffix)
existing_basenames = set(os.path.basename(x)[:-len(df_suffix)] for x in existing_pkl)

print('%d %d' % (len(all_images), len(existing_basenames)))

# True for every row that still lacks its SIFT DataFrame pickle.
mask = ~all_images['basename'].isin(existing_basenames)
print(len(mask))

# Take the image paths straight from the table: these are the same paths the
# feature extraction consumes, so no filesystem glob is needed to rebuild them.
missing_images = all_images.loc[mask, 'image_name'].tolist()
print(len(missing_images))

47247 51019
47247
345


In [91]:
# Re-run the feature extraction for the images that were found to be missing.
# get_sift_features reads paths from an 'image_name' column, so wrap the list
# of paths in a DataFrame before splitting it into per-worker chunks.
nthreads = 16
missing_df = pd.DataFrame({'image_name': list(missing_images)})
pool = mp.Pool(processes=nthreads)
try:
    pool.map(get_sift_features, np.array_split(missing_df, nthreads))
finally:
    # Always release the worker processes, even if a chunk raised.
    pool.close()
    pool.join()

../images/snooth_dot_com_21815.jpeg


In [92]:
! echo "pushover 'sift finished'" | /usr/bin/zsh3

Create an HDF5 store containing the feature data of all images.

In [8]:
# Build one HDF5 store holding the SIFT data of every image:
#   'features'   - descriptor rows of all images, stacked
#   'keypoints'  - the matching 'kp0'/'kp1' keypoint coordinates
#   'basename', 'image_path', 'index' - per-image lookup tables; 'index'
#       holds the half-open row range [beg, end) that belongs to each image.
#
# For a quicker development run, write to
# '../priv/data/features_large_images.h5' and use only `large_images` as
# `combined_images`.
combined_images = pd.concat([large_images, all_images], axis=0)

basename_list = list()
index_list = list()
imagepath_list = list()

# Running row offset into the stacked 'features'/'keypoints' tables.
beg = 0

st = pd.HDFStore('../priv/data/features.h5', mode='w')
try:
    for basename, image_name in zip(combined_images['basename'],
                                    combined_images['image_name']):
        # Loading is best-effort: an image without a usable pickle is reported
        # and skipped. Nothing has been written for it yet, so the store stays
        # consistent.
        try:
            df = pd.read_pickle('../priv/sift/' + basename + '_df.pkl')
            kp = df[['kp0','kp1']]
            df = df.drop(['kp0','kp1'], axis=1)
        except Exception as exc:
            print('%s: %s' % (basename, exc))
            continue

        end = beg + df.shape[0]

        index = pd.Index(np.arange(beg, end))
        df = df.set_index(index)
        kp = kp.set_index(index)

        # The two appends must succeed together. They are deliberately outside
        # the try block: a half-written image would silently shift every later
        # [beg, end) range, so an append error is allowed to stop the build.
        st.append('features', df, index=False)
        st.append('keypoints', kp, index=False)

        basename_list.append(basename)
        index_list.append([beg, end])
        imagepath_list.append(image_name)

        beg = end

    st.append('basename', pd.Series(basename_list))
    st.append('image_path', pd.Series(imagepath_list))
    # Build the frame from the list directly so that an empty list still
    # yields the two expected columns.
    st.append('index', pd.DataFrame(index_list, columns=['beg','end']))
finally:
    # Close the store even when the build fails part-way through.
    st.close()

In [7]:
# Shell escape: pipe a `pushover` command into zsh to signal that the HDF5
# feature database has been written.
# NOTE(review): `pushover` is not defined in this notebook -- presumably a
# command available to zsh on the author's machine; confirm.
! echo "pushover 'hdf5 feature database creation finished'" | /usr/bin/zsh