In [4]:
import numpy as np
import argparse
from path import Path

from keras.models import Model
from keras.layers import Dense, Dropout
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import tensorflow as tf
import glob
import subprocess
import pickle
import PIL as p
from multiprocessing import Pool
import warnings

from utils.nasnet import NASNetMobile, preprocess_input
from utils.score_utils import mean_score, std_score

import logging
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

# Every entry found directly under the per-domain image directory.
doms = glob.glob("/nn-images/images_split_by_domain/domains/*")

In [5]:
# Load the cached image list and image dictionary produced by an earlier step.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# project wrote itself.
# The files are opened via `with` so the handles are closed deterministically
# instead of being left for the garbage collector.
with open('image_order.pickle', "rb") as f:
    # `ims` is sliced by index and fed to the data generator as image paths.
    ims = pickle.load(f)

# NOTE(review): the '.picke' extension looks like a typo, but it is kept
# as-is because it may be the real on-disk name -- confirm before renaming.
with open('image_dct.picke', "rb") as f:
    img_dct = pickle.load(f)

In [6]:
import time
from multiprocessing import Pool

def image_checker(impath):
    """Try to load one image and convert it to an array.

    The image is loaded with ``load_img`` at the size given by the global
    ``RESHAPE_SIZE`` (looked up when the function is called) and converted
    with ``img_to_array``.

    Args:
        impath: path of the image file to load.

    Returns:
        ``(impath, array)`` on success, or ``(impath, None)`` when
        ``load_img`` raises ``OSError`` or ``img_to_array`` raises
        ``TypeError``. Any other exception propagates to the caller.
    """
    rejected = (impath, None)

    # Step 1: decode/resize. Only OSError counts as "bad image" here.
    try:
        picture = load_img(impath, target_size=RESHAPE_SIZE)
    except OSError:
        return rejected

    # Step 2: convert to an array. Only TypeError counts as "bad image" here.
    try:
        pixels = img_to_array(picture)
    except TypeError:
        return rejected

    return impath, pixels

def filtered_data_generator(batch_size, paths):
    """Yield preprocessed image batches together with their source paths.

    Images are loaded in parallel by ``image_checker`` in a pool of 15 worker
    processes. Results are consumed with ``imap_unordered``, so images arrive
    in completion order rather than in the order of ``paths``; the path list
    yielded with each batch records the actual order of that batch.

    Images for which ``image_checker`` returns ``None`` are logged and
    skipped, so fewer than ``len(paths)`` images may be produced.

    Args:
        batch_size: number of images per full batch.
        paths: iterable of image paths to load.

    Yields:
        ``(data_batch, batch_paths)`` where ``data_batch`` is
        ``preprocess_input`` applied to the stacked image arrays and
        ``batch_paths`` is the list of paths in the same order. The last
        batch may hold fewer than ``batch_size`` images; no batch is yielded
        when nothing is left over.
    """
    batch_paths = []
    batch = []
    with Pool(15) as pool:
        for impath, pixels in pool.imap_unordered(image_checker, paths):
            if pixels is None:
                logger.warning(f'Failed validation for {impath}. skipping.')
                continue

            batch.append(pixels)
            batch_paths.append(impath)

            if len(batch) == batch_size:
                yield preprocess_input(np.stack(batch)), batch_paths

                # Start fresh lists so the ones just yielded are not mutated.
                batch_paths = []
                batch = []

        # Flush the partial trailing batch. np.stack raises ValueError on an
        # empty sequence, so skip the flush when nothing is left (no input,
        # every image rejected, or the valid count was an exact multiple of
        # batch_size).
        if batch:
            yield preprocess_input(np.stack(batch)), batch_paths

def nn_data_generator(batch_size):
    """Yield image batches for the model while recording which paths were used.

    Wraps ``filtered_data_generator`` over the global ``INPUT_PATHS``. Before
    each batch is handed out, the cumulative list of paths yielded so far is
    re-written to ``/nn-images/filenames/all_paths.pkl``, so the file always
    reflects every batch produced up to that point even if the consumer
    stops pulling from the generator early.

    Args:
        batch_size: number of images per full batch.

    Yields:
        The preprocessed data batch only; the paths go to the pickle file.
    """
    seen_paths = []
    batches = filtered_data_generator(batch_size, INPUT_PATHS)
    for idx, (data_batch, paths) in enumerate(batches):
        logger.info(f'produced batch {idx+1}')
        seen_paths.extend(paths)

        # Persist after every batch: the file is the only channel through
        # which the caller learns the order of the scored images.
        with open('/nn-images/filenames/all_paths.pkl', 'wb') as sink:
            pickle.dump(seen_paths, sink)

        yield data_batch

In [7]:
# Build the scoring network and load its trained weights, pinned to the first GPU.
# The layer construction sequence is left untouched because the weights file is
# loaded into this exact topology.
with tf.device('/GPU:0'):
    logger.info('building nn')
    # Backbone without its classification head, average-pooled output, and no
    # pretrained weights (they come from the weights file loaded below).
    # The (224, 224, 3) input matches the RESHAPE_SIZE used when loading images.
    base_model = NASNetMobile((224, 224, 3), include_top=False, pooling='avg', weights=None)
    # Head: dropout followed by a 10-way softmax.
    # NOTE(review): presumably the 10 outputs are a score distribution consumed
    # by mean_score/std_score -- confirm against utils.score_utils.
    x = Dropout(0.75)(base_model.output)
    x = Dense(10, activation='softmax')(x)

    model = Model(base_model.input, x)
    # Weights are read from a path relative to the notebook's working directory.
    model.load_weights('weights/nasnet_weights.h5')

2018-08-30 18:24:46,924 - root - INFO - building nn


In [None]:
# Score the image list in slices of `gap` paths and write one pickle of
# (scores, scored_paths) per slice.
gap = 10000
BATCH_SIZE = 1000
# Read as a global by image_checker; loop-invariant, so it is set once here.
RESHAPE_SIZE = (224, 224)

for startidx in range(0, 800000, gap):
    endidx = startidx+gap
    # Read as a global by nn_data_generator.
    INPUT_PATHS = ims[startidx:endidx]

    if not INPUT_PATHS:
        # Ran past the end of `ims`; nothing left to score.
        break

    logger.info(f'running images through nn ({startidx}-{endidx})')

    # Number of batches needed for this slice (ceiling division). A full
    # slice gives gap/BATCH_SIZE steps exactly as before; a short slice now
    # gets the step count its length requires instead of a fixed 11, which
    # would ask the generator for batches it cannot produce.
    # NOTE(review): images rejected by image_checker shrink the real batch
    # count; if a slice loses BATCH_SIZE or more images the generator can
    # still run out before gen_count steps -- confirm this cannot happen.
    gen_count = (len(INPUT_PATHS) + BATCH_SIZE - 1) // BATCH_SIZE

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        scores = model.predict_generator(nn_data_generator(BATCH_SIZE), gen_count, verbose=2, workers=0)

    # nn_data_generator records the paths it yielded, in order, in this file.
    with open('/nn-images/filenames/all_paths.pkl', 'rb') as f:
        scored_paths = pickle.load(f)

    # Each row of `scores` must correspond to exactly one recorded path.
    assert scores.shape[0] == len(scored_paths)

    with open(f'/nn-images/image_scores/scores.{endidx:06}.pkl', 'wb') as f:
        pickle.dump((scores, scored_paths), f)

2018-08-30 18:29:54,382 - root - INFO - running images through nn (0-10000)
2018-08-30 18:30:07,721 - root - INFO - produced batch 1
2018-08-30 18:30:23,914 - root - INFO - produced batch 2
2018-08-30 18:30:28,295 - root - INFO - produced batch 3
2018-08-30 18:30:32,714 - root - INFO - produced batch 4
2018-08-30 18:30:37,199 - root - INFO - produced batch 5
2018-08-30 18:30:46,550 - root - INFO - produced batch 6
2018-08-30 18:30:53,163 - root - INFO - produced batch 7
2018-08-30 18:30:59,912 - root - INFO - produced batch 8
2018-08-30 18:31:05,855 - root - INFO - produced batch 9
2018-08-30 18:31:10,980 - root - INFO - produced batch 10
2018-08-30 18:31:14,943 - root - INFO - running images through nn (10000-20000)
2018-08-30 18:31:21,261 - root - INFO - produced batch 1
2018-08-30 18:31:25,700 - root - INFO - produced batch 2
2018-08-30 18:31:30,078 - root - INFO - produced batch 3
2018-08-30 18:31:42,568 - root - INFO - produced batch 4
2018-08-30 18:31:56,522 - root - INFO - produ

2018-08-30 18:45:49,378 - root - INFO - produced batch 6
2018-08-30 18:45:58,554 - root - INFO - produced batch 7
2018-08-30 18:46:05,538 - root - INFO - produced batch 8
2018-08-30 18:46:13,088 - root - INFO - produced batch 9
2018-08-30 18:46:20,401 - root - INFO - produced batch 10
2018-08-30 18:46:26,377 - root - INFO - running images through nn (130000-140000)
2018-08-30 18:46:53,985 - root - INFO - produced batch 1
2018-08-30 18:47:04,498 - root - INFO - produced batch 2
2018-08-30 18:47:14,451 - root - INFO - produced batch 3
2018-08-30 18:47:20,175 - root - INFO - produced batch 4
2018-08-30 18:47:26,150 - root - INFO - produced batch 5
2018-08-30 18:47:31,958 - root - INFO - produced batch 6
2018-08-30 18:47:37,826 - root - INFO - produced batch 7
2018-08-30 18:47:46,427 - root - INFO - produced batch 8
2018-08-30 18:47:50,918 - root - INFO - produced batch 9
2018-08-30 18:48:01,619 - root - INFO - produced batch 10
2018-08-30 18:48:07,817 - root - INFO - running images throug

2018-08-30 18:57:51,395 - root - INFO - produced batch 7
2018-08-30 18:57:55,780 - root - INFO - produced batch 8
2018-08-30 18:58:06,370 - root - INFO - produced batch 9
2018-08-30 18:58:15,584 - root - INFO - produced batch 10
2018-08-30 18:58:19,791 - root - INFO - running images through nn (220000-230000)
2018-08-30 18:58:25,578 - root - INFO - produced batch 1
2018-08-30 18:58:33,923 - root - INFO - produced batch 2
2018-08-30 18:58:46,514 - root - INFO - produced batch 3
2018-08-30 18:58:53,646 - root - INFO - produced batch 4
2018-08-30 18:59:01,318 - root - INFO - produced batch 5
2018-08-30 18:59:05,877 - root - INFO - produced batch 6
2018-08-30 18:59:22,843 - root - INFO - produced batch 7
2018-08-30 18:59:34,574 - root - INFO - produced batch 8
2018-08-30 18:59:38,935 - root - INFO - produced batch 9
2018-08-30 18:59:45,567 - root - INFO - produced batch 10
2018-08-30 18:59:49,719 - root - INFO - running images through nn (230000-240000)
2018-08-30 18:59:55,679 - root - INF

2018-08-30 19:15:17,257 - root - INFO - produced batch 10
2018-08-30 19:15:21,290 - root - INFO - running images through nn (340000-350000)
2018-08-30 19:15:26,138 - root - INFO - produced batch 1
2018-08-30 19:15:30,751 - root - INFO - produced batch 2
2018-08-30 19:15:40,392 - root - INFO - produced batch 3
2018-08-30 19:15:45,000 - root - INFO - produced batch 4
2018-08-30 19:15:49,430 - root - INFO - produced batch 5
2018-08-30 19:15:53,790 - root - INFO - produced batch 6
2018-08-30 19:15:58,138 - root - INFO - produced batch 7
2018-08-30 19:16:02,662 - root - INFO - produced batch 8
2018-08-30 19:16:07,566 - root - INFO - produced batch 9
2018-08-30 19:16:16,134 - root - INFO - produced batch 10
2018-08-30 19:16:20,264 - root - INFO - running images through nn (350000-360000)
2018-08-30 19:16:33,408 - root - INFO - produced batch 1
2018-08-30 19:16:38,623 - root - INFO - produced batch 2
2018-08-30 19:16:44,521 - root - INFO - produced batch 3
2018-08-30 19:16:54,490 - root - INF

2018-08-30 19:31:00,302 - root - INFO - produced batch 2
2018-08-30 19:31:21,249 - root - INFO - produced batch 3
2018-08-30 19:31:40,346 - root - INFO - produced batch 4
2018-08-30 19:31:47,970 - root - INFO - produced batch 5
2018-08-30 19:31:52,422 - root - INFO - produced batch 6
2018-08-30 19:32:01,614 - root - INFO - produced batch 7
2018-08-30 19:32:13,435 - root - INFO - produced batch 8
2018-08-30 19:32:17,836 - root - INFO - produced batch 9
2018-08-30 19:32:22,149 - root - INFO - produced batch 10
2018-08-30 19:32:26,259 - root - INFO - running images through nn (470000-480000)
2018-08-30 19:32:43,737 - root - INFO - produced batch 1
2018-08-30 19:32:49,512 - root - INFO - produced batch 2
2018-08-30 19:32:58,943 - root - INFO - produced batch 3
2018-08-30 19:33:20,349 - root - INFO - produced batch 4
2018-08-30 19:33:24,912 - root - INFO - produced batch 5
2018-08-30 19:33:30,758 - root - INFO - produced batch 6
2018-08-30 19:33:49,107 - root - INFO - produced batch 7
2018-

2018-08-30 19:48:09,479 - root - INFO - produced batch 2
2018-08-30 19:48:13,854 - root - INFO - produced batch 3
2018-08-30 19:48:30,493 - root - INFO - produced batch 4
2018-08-30 19:48:36,188 - root - INFO - produced batch 5
2018-08-30 19:48:46,524 - root - INFO - produced batch 6
2018-08-30 19:48:55,228 - root - INFO - produced batch 7
2018-08-30 19:49:08,398 - root - INFO - produced batch 8
2018-08-30 19:49:13,415 - root - INFO - produced batch 9
2018-08-30 19:49:19,236 - root - INFO - produced batch 10
2018-08-30 19:49:23,339 - root - INFO - running images through nn (590000-600000)
2018-08-30 19:49:32,253 - root - INFO - produced batch 1
2018-08-30 19:49:38,484 - root - INFO - produced batch 2
2018-08-30 19:49:43,229 - root - INFO - produced batch 3
2018-08-30 19:49:54,185 - root - INFO - produced batch 4
2018-08-30 19:50:10,669 - root - INFO - produced batch 5
2018-08-30 19:50:17,314 - root - INFO - produced batch 6
2018-08-30 19:50:22,184 - root - INFO - produced batch 7
2018-

In [None]:
# validation_paths = glob.glob('data/*')
# with open('/nn-images/filenames/all_paths.pkl', 'rb') as f:
#     t = pickle.load(f)

# mscores = [mean_score(score) for score in scores]
# imnames = [p.split('/')[1][:-4] for p in validation_paths]
# score_dict = {n: s for s, n in zip(mscores, imnames)}

In [None]:
# with open('/nn-images/filenames/all_paths.pkl', 'rb') as f:
#     scored_paths = pickle.load(f)
    
# print(scores.shape, len(scored_paths))
# assert scores.shape[0] == len(scored_paths)

# with open('/nn-images/image_scores/scores.100.pkl', 'wb') as f:
#     pickle.dump((scores, scored_paths), f)

In [14]:
# last batch
#
# gap = 1000
# startidx = 807000
# endidx = len(ims)
# INPUT_PATHS = ims[startidx:endidx]
# RESHAPE_SIZE = (224, 224)

# logger.info(f'running images through nn ({startidx}-{endidx})')

# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     scores = model.predict_generator(nn_data_generator(1000), 1, verbose=2, workers=0)

# with open('/nn-images/filenames/all_paths.pkl', 'rb') as f:
#     scored_paths = pickle.load(f)

# assert scores.shape[0] == len(scored_paths)

# with open(f'/nn-images/image_scores/scores.{endidx:06}.pkl', 'wb') as f:
#     pickle.dump((scores, scored_paths), f)

2018-08-30 21:35:45,153 - root - INFO - running images through nn (807000-807343)
2018-08-30 21:35:48,417 - root - INFO - produced batch 1
