# Imagenet Processing in parallel

In [23]:
%matplotlib inline
from bcolz_array_iterator import BcolzArrayIterator
from tqdm import tqdm
import gensim
import glob
import pickle
import numpy as np
from keras.utils.data_utils import get_file
import json
import keras.backend as K
import nltk
import os

Set memory usage for TF (uses all by default)

In [2]:
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

## Word Vectors

We'll try to do better than DeVISE and use Word2Vec trained og Google News.

In [24]:
path = '/home/mark/data/word2vec'
w2v_path = '/home/mark/data/GoogleNews-vectors-negative300'
wn_path = '/home/mark/data/wordnet'

In [None]:
model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path+'.bin', binary=True)
model.save_word2vec_format(w2v_path+'.txt', binary=False)

Create .txt file for easier workflow

In [4]:
lines = open(w2v_path+'.txt').readlines(); len(lines)

3000001

In [5]:
def parse_w2v(line):
    i = line.index(' ')
    return line[:i], np.fromstring(line[i+1:-1], 'float32', sep=' ')

In [None]:
w2v_list = list(map(parse_w2v, lines[1:]))

Save processed file for quick access in the future.

In [None]:
pickle.dump(w2v_list, open(path+'/w2vl.pkl', 'wb'))

In [6]:
w2v_list = pickle.load(open(path+'/w2vl.pkl', 'rb'))

In [7]:
w2v_dict = dict(w2v_list)
words, vectors = zip(*w2v_list)

Test inputs

Should have positive correlation coeff.

In [8]:
np.corrcoef(w2v_dict['mark'], w2v_dict['Mark'])

array([[ 1.        ,  0.11157609],
       [ 0.11157609,  1.        ]])

Shouldn't have a positive correlation coeff.

In [9]:
np.corrcoef(w2v_dict['apple'], w2v_dict['Mark'])

array([[ 1.        , -0.00579598],
       [-0.00579598,  1.        ]])

We're only interested in lowercase part of word2vec

In [10]:
lowc_w2v = {w.lower(): w2v_dict[w] for w in reversed(words)}

We're going to map wordvectors for each of the 1000 categories in Imagenet to the 82,000 nouns in Wordnet.

#### 1000 Imagenet categories

In [15]:
fpath = get_file('imagenet_class_index.json',
                 'http://files.fast.ai/models/imagenet_class_index.json',
                 cache_subdir='models')

Downloading data from http://files.fast.ai/models/imagenet_class_index.json

In [59]:
class_dict = json.load(open(fpath))
n_class = len(class_dict)

In [58]:
classids_1k = dict(class_dict.values())
len(classids_1k)

1000

#### ~82,000 nouns in Wordnet

In [37]:
wordnet = nltk.corpus.wordnet.all_synsets(pos='n');
wn_nouns = list(wordnet)

In [35]:
with open(os.path.join(wn_path, 'classids.txt'), 'w') as f:
    f.writelines(['n{:08d} {}\n'.format(n.offset(), n.name().split('.')[0]) for n in wn_nouns])

In [38]:
classid_lines = open(wn_path+'/classids.txt', 'r').readlines();
classids = dict(line.strip().split(' ') for line in classid_lines)
len(classids)

82115

#### Combined wordvectors

Synset matches

51,640 / 82,115 of categories in wordnet appears in word2vec

In [64]:
# create synset wordvector from wordnet categories
syn_wv = [(k, lowc_w2v[v.lower()]) for k, v in classids.items() if v.lower() in lowc_w2v]
len(syn_wv)

51640

774 / 1000 of categories in imagenet appear in word2vec

In [65]:
# create synset wordvector from imagenet categories
syn_wv_1 = [(k, lowc_w2v[v.lower()]) for k, v in classids_1k.items() if v.lower() in lowc_w2v]
len(syn_wv_1)

774

In [66]:
syn2wb = dict(syn_wv); len(syn2wb)

51640

We'll remove the categories we can't find from the imagenet dataset

In [None]:
# TODO

In [68]:
ndim = len(list(syy.values())[0]); ndim

AttributeError: 'list' object has no attribute 'values'

## Images

In [None]:
img_path = '/home/mark/hhd/data/imagenet/'

Reading jpegs and resizing them can be slow, so we store the result.

In [None]:
fnames = list(glob.iglob(img_path+'train/*/*.JPEG'))
pickle.dump(fnames, open(img_path+'fnames.pkl', 'wb'))

In [None]:
# file = open(fpath, mode)