In [None]:
# Machine learning script based on VGG16.
import os
from os.path import join
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.layers.core import Dense
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.data_utils import get_file
import json, copy
from keras.models import Model
from keras.optimizers import Adam
from keras import backend
from random import shuffle
import csv

In [None]:
# Root folder of the image data set, resolved relative to the current working directory.
data_path = join(os.getcwd(), 'data', 'playground')

In [None]:
class BinaryVgg16():
    """Binary image classifier built on an ImageNet-pretrained VGG16.

    Two networks are kept:

    * ``self.model``     - the unmodified VGG16, used only to show its
      ImageNet predictions on sample images.
    * ``self.new_model`` - a second VGG16 whose final layer is replaced by a
      single sigmoid unit; every pretrained layer is frozen, so only the new
      unit is trained.

    ``data_path`` must contain ``train``, ``valid`` and ``test`` folders that
    are readable by ``ImageDataGenerator.flow_from_directory``.

    NOTE(review): images are fed to both networks exactly as produced by
    ``ImageDataGenerator()`` (no ``preprocessing_function`` is configured) -
    confirm whether the VGG16 ``preprocess_input`` step should be applied.
    """

    def __init__(self, data_path, target_size=(224, 224)):
        """Store the data locations and build both networks.

        data_path   -- folder holding the 'train', 'valid' and 'test' folders
        target_size -- (height, width) every image is resized to when loaded
        """
        self.data_path = data_path
        self.train_path = join(data_path, 'train')
        self.valid_path = join(data_path, 'valid')
        self.test_path = join(data_path, 'test')

        self.target_size = target_size

        self.model = VGG16(weights = 'imagenet')
        self.prepare_binary_vgg16()

        self.generator = ImageDataGenerator()

        # Filled lazily by prepare_sample_images / save_interesting_cases.
        # Initialised here so the plotting helpers never hit a missing attribute.
        self.sample_images = None
        self.sample_titles = None
        self.interesting_imgs_map = {}

    @staticmethod
    def _steps(samples, batch_size):
        """Number of batches needed to see every sample once (ceil division)."""
        return -(-samples // batch_size)

    def prepare_binary_vgg16(self):
        """Build and compile ``self.new_model``: frozen VGG16 + one sigmoid unit."""
        # A separate VGG16 instance is used on purpose, so freezing layers or
        # loading weights into new_model never touches self.model.
        model = VGG16(weights='imagenet')
        for layer in model.layers: layer.trainable=False
        # Attach the new head to the output of the second-to-last layer instead
        # of popping the last layer first: this does not depend on
        # ``model.layers`` being a mutable list.
        new_top = Dense(1, activation='sigmoid')(model.layers[-2].output)
        new_model = Model(model.input, new_top)
        # The compile-time learning rate is overwritten by fit() before training.
        new_model.compile(optimizer=Adam(lr=0.1),loss='binary_crossentropy', metrics=['accuracy'])
        self.new_model = new_model

    def generate_images(self, path, shuffle, class_mode, batch_size):
        """Return a directory iterator over ``path`` resized to ``self.target_size``."""
        return self.generator.flow_from_directory(path, shuffle = shuffle, class_mode = class_mode, batch_size = batch_size, target_size = self.target_size)

    def show_validation_images(self, img_nbr=4):
        """Plot ``img_nbr`` validation images titled with their labels."""
        self.prepare_sample_images(img_nbr)
        plots(self.sample_images, titles=self.sample_titles, figtitle='Validation images with original class')

    def prepare_sample_images(self, imgs_num=4):
        """Cache one shuffled validation batch of ``imgs_num`` images.

        The cached batch is reused while it has the requested size, so the
        show_* methods display the same images.
        """
        # Compare by value: ``is`` on ints only works by accident for small numbers.
        if self.sample_images is not None and len(self.sample_images) == imgs_num:
            return
        generator  = self.generate_images(self.valid_path, shuffle=True, class_mode='binary', batch_size=imgs_num)
        self.sample_images, self.sample_titles = next(generator)

    def show_sample_predictions_by_original_model(self, img_nbr=4):
        """Plot the sample images with the 3 most probable ImageNet classes."""
        self.prepare_sample_images(img_nbr)
        predictions = self.model.predict(self.sample_images)
        plot_with_most_probable_classes(self.sample_images, predictions, get_classes(), 3, 'Sample predictions by original model')

    def show_sample_predictions(self, img_nbr=4):
        """Plot the sample images titled with the binary model's output."""
        self.prepare_sample_images(img_nbr)
        predictions = self.new_model.predict(self.sample_images)
        plots(self.sample_images, titles = [str(p[0]) for p in predictions], figtitle='Predictions by new model')

    def fit(self, batch_size = 10, epochs = 1, learning_rate = 0.01):
        """Train the binary model on the 'train' folder, validating on 'valid'.

        batch_size    -- images per batch for both generators
        epochs        -- number of passes over the training data
        learning_rate -- value written into the optimizer before training
        """
        train_batches = self.generate_images(self.train_path, shuffle=True, class_mode='binary', batch_size=batch_size)
        valid_batches = self.generate_images(self.valid_path, shuffle=True, class_mode='binary', batch_size=batch_size)
        # Update the optimizer's backend variable in place; rebinding
        # ``optimizer.lr`` to a float would not reach a training function
        # that has already been built by an earlier fit() call.
        backend.set_value(self.new_model.optimizer.lr, learning_rate)
        self.new_model.fit_generator(train_batches,
                                     steps_per_epoch=self._steps(train_batches.samples, batch_size),
                                     validation_data=valid_batches,
                                     validation_steps=self._steps(valid_batches.samples, batch_size),
                                     epochs=epochs)

    def save(self, filepath):
        """Write the binary model's weights to ``filepath``."""
        self.new_model.save_weights(filepath)

    def load(self, filepath):
        """Load the binary model's weights from ``filepath``."""
        self.new_model.load_weights(filepath)

    def save_test_predictions(self):
        """Predict every image in the 'test' folder and save (id, prediction) rows.

        Predictions are clipped to [0.025, 0.975] and written by ``save_results``.
        """
        batch_size = 1
        generator = self.generate_images(self.test_path, shuffle = False, batch_size = batch_size, class_mode=None)
        batch_results = self.new_model.predict_generator(generator, steps=self._steps(generator.samples, batch_size))
        results = [elem for result in batch_results for elem in result]
        # Image id = file name without folder or extension, whatever the
        # path separator of the platform is.
        image_ids = [os.path.splitext(os.path.basename(f))[0] for f in generator.filenames]
        save_results(list(zip(image_ids, trim_predictions(results, 0.025))))

    def save_interesting_cases(self, imgs_nbr=4, batch_size=4):
        """Collect noteworthy validation predictions per class.

        For every class label, ``self.interesting_imgs_map[label]`` receives up
        to ``imgs_nbr`` (filename, label, prediction) tuples under the keys
        'confusing' (closest to 0.5), 'correct' / 'incorrect' (random picks)
        and 'most_correct' / 'most_incorrect' (smallest / largest error).

        batch_size -- images per prediction batch
        """
        gen = self.generate_images(self.valid_path,  batch_size = batch_size, shuffle = False, class_mode=None)
        # Round the step count up so the last, smaller batch is predicted too;
        # flooring would silently drop the trailing files in the zip below.
        predictions = self.new_model.predict_generator(gen, steps = self._steps(gen.samples, batch_size))
        predictions = [p[0] for p in predictions]
        file_label_pred = list(zip(gen.filenames, gen.classes, predictions))

        self.interesting_imgs_map = {}

        for label in gen.class_indices.values():
            self.interesting_imgs_map[label] = {}
            f_l_p_label = [elem for elem in file_label_pred if label == elem[1]]

            confusing = copy.copy(f_l_p_label)
            confusing.sort(key=lambda elem: np.abs(elem[2] - 0.5))
            self.interesting_imgs_map[label]['confusing'] = confusing[:imgs_nbr]

            # A prediction is correct when it is on the label's side of 0.5.
            correct  = [elem for elem in f_l_p_label if np.abs(elem[1] - elem[2]) < 0.5]
            shuffle(correct)
            self.interesting_imgs_map[label]['correct'] = correct[:imgs_nbr]
            correct.sort(key=lambda tpl: np.abs(tpl[1] - tpl[2]))
            self.interesting_imgs_map[label]['most_correct'] = correct[:imgs_nbr]

            incorrect = [elem for elem in f_l_p_label if np.abs(elem[1] - elem[2]) >= 0.5]
            shuffle(incorrect)
            self.interesting_imgs_map[label]['incorrect'] = incorrect[:imgs_nbr]
            incorrect.sort(key=lambda tpl: -np.abs(tpl[1] - tpl[2]))
            self.interesting_imgs_map[label]['most_incorrect']=incorrect[:imgs_nbr]

    def plot_interesting_cases(self):
        """Plot every non-empty group collected by ``save_interesting_cases``."""
        for label, categories in self.interesting_imgs_map.items():
            for category, cases in categories.items():
                if cases:
                    plots([load_img(join(self.valid_path, elem[0])) for elem in cases],
                         titles = ['orig: ' + str(elem[1]) + '\npred: ' + str(elem[2]) for elem in cases],
                         figtitle = 'class: ' + str(label) + '\ncategory: '+ category)

In [None]:
def plots(ims, figsize=(12,6), rows=1, interp=False, titles=None, figtitle=None):
    """Draw a sequence of images in a grid with ``rows`` rows.

    ims      -- batch array or list of images; numpy images are cast to uint8
                and, when the last axis is not of size 3, transposed from
                (n, c, h, w) to (n, h, w, c)
    figsize  -- figure size handed to ``plt.figure``
    rows     -- number of grid rows; the column count is derived from it
    interp   -- False draws with interpolation 'none', True with the default
    titles   -- optional per-image titles (indexed like ``ims``)
    figtitle -- optional title for the whole figure

    Nothing is drawn for an empty ``ims``.
    """
    if len(ims) == 0:
        return
    if isinstance(ims[0], np.ndarray):
        ims = np.array(ims).astype(np.uint8)
        if (ims.shape[-1] != 3):
            ims = ims.transpose((0,2,3,1))
    f = plt.figure(figsize=figsize)
    if figtitle is not None:
        f.suptitle(figtitle)
    # Ceil division: enough columns for every image, and no spare column
    # when the images fill the rows exactly.
    cols = -(-len(ims) // rows)
    for i in range(len(ims)):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=12)
        # Draw on the subplot itself rather than on pyplot's current axes.
        sp.imshow(ims[i], interpolation=None if interp else 'none')

def get_classes():
    """Return the ImageNet class names ordered by class index.

    The class-index JSON is fetched through keras' ``get_file`` (stored under
    the 'models' cache sub-directory); each entry's second element is used as
    the class name.
    """
    fname = 'imagenet_class_index.json'
    # Build the URL by string concatenation: os.path.join would insert a
    # backslash on Windows.
    fpath = get_file(fname, 'http://files.fast.ai/models/' + fname, cache_subdir='models')
    with open(fpath) as f:
        class_dict = json.load(f)
    return [class_dict[str(i)][1] for i in range(len(class_dict))]
    

def plot_with_most_probable_classes(images, predictions, classes, k, figtitle = None):
    """Plot images, each titled with its k most probable classes.

    Every title holds one 'class/probability' line per class, most probable
    first; the probability is formatted with '{:10.2f}'.
    """
    titles = []
    for probs in predictions:
        top_k = np.argsort(probs)[::-1][:k]
        lines = []
        for idx in top_k:
            lines.append(classes[idx] + '/' + '{:10.2f}'.format(probs[idx]))
        titles.append('\n'.join(lines))
    plots(images, titles=titles, figtitle=figtitle)
    
def trim_predictions(predictions, border):
    """Clip predictions in place to [border, 1 - border] and return them.

    The sequence passed in is modified and is also the return value.
    """
    upper = 1 - border
    for idx, value in enumerate(predictions):
        if value > upper:
            predictions[idx] = upper
        elif value < border:
            predictions[idx] = border
    return predictions

def save_results(id_result_tuples, filename = 'result.csv'):
    """Write the (id, result) rows to ``filename`` as CSV, replacing the file."""
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open(filename, 'w', newline='') as f:
        wr = csv.writer(f)
        wr.writerows(id_result_tuples)


In [None]:
# Build the classifier for the images under data_path, resized to 224x224.
model = BinaryVgg16(data_path, (224, 224))

In [None]:
# Show a shuffled batch of validation images titled with their labels.
model.show_validation_images()

In [None]:
# Show what the unmodified ImageNet VGG16 predicts for the sample images.
model.show_sample_predictions_by_original_model()

In [None]:
# Train with the defaults of BinaryVgg16.fit: batch_size=10, epochs=1, learning_rate=0.01.
model.fit()

In [None]:
# Persist the weights of the binary model.
model.save('model_cats_dogs_binary.h5')

In [None]:
# Predict the test images and write the (id, clipped prediction) rows via save_results.
model.save_test_predictions()

In [None]:
# Collect per-class confusing / correct / incorrect validation predictions, then plot them.
model.save_interesting_cases()
model.plot_interesting_cases()

