In [None]:
import pandas as pd
import numpy as np
from keras import layers
from keras import models
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import os, shutil
import glob
import tqdm
from PIL import Image
import seaborn as sns
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img
import pickle
import random
import zlib
# Seed the stdlib RNG so that random.sample() calls are reproducible.
# NOTE: this must be a *call*; assigning ``random.seed = 0`` would replace the
# seeding function with an int and leave the generator unseeded.
random.seed(0)
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input, decode_predictions
import keras.backend as k
import tensorflow as tf
from keras.regularizers import l2
from clarifai.rest import ClarifaiApp
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import shutil
import Algorithmia
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
class Ditto:
    """Train a local substitute CNN that imitates a black-box oracle API.

    Workflow (see ``Ditto_I_Choose_You``):

    1. ``generate_cnn_dataset`` samples an initial training set and a held-out
       test set from ``folder`` and loads them as 224x224x3 arrays.
    2. ``instantiate_nn`` builds the substitute model.
    3. For each generation: ``Clarifai_query`` labels the newest data with the
       oracle, ``train_nn`` fits the substitute on those labels and
       ``jacobian_augmentation`` doubles the dataset with perturbed copies.

    Class labels for the original pictures are parsed from the file name: the
    second ``_``-separated token must be an integer class index.

    Attributes kept aligned row-by-row: ``cnn_dataset`` <-> ``predictions``,
    and ``test_dataset`` <-> ``test_set_paths`` <-> ``val_target`` <->
    ``test_predictions``.
    """

    # Spatial size every picture is resized to before entering the network.
    IMG_SIZE = (224, 224)

    def __init__(self, folder, num_api_calls, num_classes, api, test_size=.1,  img_or_nlp='img'):
        """Store the configuration; performs no I/O.

        Args:
            folder: Directory holding the source pictures ('/' is appended).
            num_api_calls: Total oracle-query budget.
            num_classes: Number of output classes of the substitute model.
            api: Identifier of the oracle API (stored, not interpreted here).
            test_size: Fraction of ``num_api_calls`` spent on the test set.
            img_or_nlp: Analysis type; only ``'img'`` is implemented.
        """
        self.folder = folder+'/'
        self.api = api
        self.num_api_calls = num_api_calls
        self.num_classes = num_classes
        self.all_generated_filepaths = []
        self.newest_file_paths = []
        self.predictions = []
        # Oracle outputs for the held-out test set (filled by extract_values).
        self.test_predictions = []
        self.analysis_type = str(img_or_nlp)
        self.calls_to_api = 0
        self.test_size=test_size
        # Symbolic d(output[:, c]) / d(input) tensors, one per class (lazy).
        self.gradients =[]
        self.model = None

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _is_default(value):
        """Return True when *value* is the ``'default'`` sentinel string.

        A plain ``value == 'default'`` is unsafe because callers may pass NumPy
        arrays, for which ``==`` is an elementwise comparison.
        """
        return isinstance(value, str) and value == 'default'

    @staticmethod
    def _label_from_name(file_name):
        """Parse the integer class index from ``<prefix>_<label>_...``."""
        return int(file_name.split('_')[1])

    @staticmethod
    def _batch_size_for(n_samples):
        """Pick a batch size for *n_samples*; ``None`` means the Keras default."""
        if n_samples >= 1000:
            return 256
        if n_samples > 100:
            return 50
        return None

    def _load_images(self, paths):
        """Load *paths* into one preprocessed ``(len(paths), 224, 224, 3)`` array."""
        batch = np.empty(shape=(len(paths),) + self.IMG_SIZE + (3,))
        for row, path in enumerate(paths):
            pixels = img_to_array(load_img(path, target_size=self.IMG_SIZE))
            batch[row] = preprocess_input(np.expand_dims(pixels, axis=0))[0]
        return batch

    def _query_oracle(self, oracle, paths):
        """Send every file in *paths* to *oracle*, counting each API call."""
        responses = []
        for path in paths:
            responses.append(oracle.predict_by_filename(path))
            self.calls_to_api += 1
        return responses

    def _class_gradients(self, model):
        """Return symbolic input-gradients of each class output of *model*.

        The tensors are cached in ``self.gradients`` for ``self.model`` so the
        graph does not grow with every augmentation round.
        """
        def build():
            return [k.gradients(model.output[:, c], model.input)[0]
                    for c in range(self.num_classes)]

        if model is not self.model:
            return build()
        if len(self.gradients) != self.num_classes:
            self.gradients = build()
        return self.gradients

    def _add_replacements(self, oracle, replacements):
        """Query the oracle for replacement pictures and append the usable ones.

        Replacements the oracle cannot label are silently dropped (their API
        calls are still counted).
        """
        paths = [os.path.join(self.folder, pic) for pic in replacements]
        responses = self._query_oracle(oracle, paths)
        failed = set(self.extract_values(responses))
        keep = [i for i in range(len(replacements)) if i not in failed]
        if not keep:
            return
        kept_paths = [paths[i] for i in keep]
        kept_names = [replacements[i] for i in keep]
        self.cnn_dataset = np.concatenate(
            [self.cnn_dataset, self._load_images(kept_paths)], axis=0)
        self.init_pics_paths.extend(kept_paths)
        self.init_files.extend(kept_names)
        labels = to_categorical(
            [self._label_from_name(name) for name in kept_names],
            num_classes=self.num_classes)
        self.target = np.concatenate([self.target, labels], axis=0)

    # ------------------------------------------------------------- data set-up

    def generate_cnn_dataset(self, min_generations=3, gens=None):
        """Sample the initial training set and the test set from ``folder``.

        Creates ``<folder>/Ditto`` (wiping any previous one) with a ``Test``
        sub-directory, copies the sampled pictures there and loads both splits
        as preprocessed arrays.

        Args:
            min_generations: Smallest number of augmentation generations offered.
            gens: Number of generations to run. When ``None`` the options are
                printed and the user is prompted on stdin.

        Raises:
            ValueError: If the budget allows no generation count, *gens* is not
                one of the offered options, or the folder holds too few files.
        """
        if self.num_classes < 2:
            raise ValueError('num_classes must be at least 2')

        self.dir_path = self.folder+'Ditto'
        if os.path.exists(self.dir_path):
            shutil.rmtree(self.dir_path)
        os.mkdir(self.dir_path)
        self.test_dir_path = self.dir_path+'/Test'
        os.mkdir(self.test_dir_path)

        num_test = int(self.num_api_calls * self.test_size)
        max_gens = int(np.log(self.num_api_calls - num_test) / np.log(self.num_classes))
        # generations -> initial sample size that keeps the run within budget
        gens_vs_sample_size = {
            g: int(self.num_api_calls // self.num_classes ** g)
            for g in range(min_generations, max_gens)
        }
        if not gens_vs_sample_size:
            raise ValueError(
                'API budget too small for min_generations=%d' % min_generations)
        print(gens_vs_sample_size)
        if gens is None:
            gens = int(input('How many generations? '))
        if gens not in gens_vs_sample_size:
            raise ValueError('generations must be one of %s, got %r'
                             % (sorted(gens_vs_sample_size), gens))
        self.init_gens = gens
        self.gens_to_go = gens
        self.init_sample_size = gens_vs_sample_size[gens]

        # Regular files only: the 'Ditto' working directory created above lives
        # inside ``folder`` and must not be sampled as a picture.
        pics = [name for name in os.listdir(str(self.folder))
                if os.path.isfile(os.path.join(self.folder, name))]
        if self.init_sample_size + num_test > len(pics):
            raise ValueError('need %d pictures but %r only holds %d'
                             % (self.init_sample_size + num_test, self.folder, len(pics)))

        sample = random.sample(pics, k=self.init_sample_size)
        for pic in sample:
            shutil.copy(os.path.join(self.folder, pic), self.dir_path)

        chosen = set(sample)
        remaining = [name for name in pics if name not in chosen]
        test_set = random.sample(remaining, k=num_test)
        held_out = set(test_set)
        unused_pics = [name for name in remaining if name not in held_out]
        self.unused_pics = unused_pics
        self.other_pics = unused_pics

        self.test_set_paths = []
        for pic in test_set:
            filepath = os.path.join(self.folder, pic)
            shutil.copy(filepath, self.test_dir_path)
            self.test_set_paths.append(filepath)

        # Pass num_classes explicitly so the one-hot width does not depend on
        # which classes happened to be sampled.
        self.target = to_categorical(
            [self._label_from_name(name) for name in sample],
            num_classes=self.num_classes)
        self.val_target = to_categorical(
            [self._label_from_name(name) for name in test_set],
            num_classes=self.num_classes)

        self.init_files = sample
        self.init_pics_paths = [os.path.join(self.folder, name) for name in sample]
        self.cnn_dataset = self._load_images(self.init_pics_paths)
        self.test_dataset = self._load_images(self.test_set_paths)

    def instantiate_nn(self,optimizer='default',loss='default',metric='default',output_activation='default'):
        """Build and compile the substitute CNN into ``self.model``.

        Every argument left at ``'default'`` is replaced by: SGD with Nesterov
        momentum, categorical cross-entropy, accuracy and softmax respectively.
        Only the ``'img'`` analysis type is implemented; otherwise 'DIY' is
        printed and no model is created.
        """
        if self.analysis_type != 'img':
            print('DIY')
            return

        if self._is_default(optimizer):
            # Use the configured optimizer object (the string 'sgd' would
            # silently fall back to Keras' plain-SGD defaults).
            optimizer = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        if self._is_default(loss):
            loss = 'categorical_crossentropy'
        if self._is_default(metric):
            metric = 'accuracy'
        if self._is_default(output_activation):
            output_activation = 'softmax'

        model = models.Sequential()
        model.add(layers.InputLayer(input_shape=self.IMG_SIZE + (3,)))
        model.add(layers.Dropout(.3))
        model.add(layers.Conv2D(10, (2, 2), strides=(1, 1), kernel_regularizer=l2(.01),
                                padding='same', activation='relu'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2)))
        model.add(layers.Dropout(.5))
        model.add(layers.Conv2D(20, (3, 3), strides=(1, 1), padding='valid', activation='relu'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2)))
        model.add(layers.Flatten())
        model.add(layers.Dense(units=64, activation='relu'))
        model.add(layers.Dropout(.5))
        model.add(layers.Dense(units=32, activation='relu'))
        model.add(layers.Dropout(.5))
        model.add(layers.Dense(units=self.num_classes, activation=output_activation))
        model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

        self.model = model
        # Cached gradient tensors belong to the previous model, if any.
        self.gradients = []

    # ---------------------------------------------------------------- oracles

    def Algorithmia_query(self, api_key):
        """Label the test set with Algorithmia's gender classifier.

        The raw responses are pickled to ``algorithmia_testing_responses`` in
        the working directory.

        Returns:
            One list of per-gender confidences for each test picture.
        """
        client = Algorithmia.client(api_key)
        algo = client.algo('deeplearning/GenderClassification/2.0.0')
        testing_responses = [algo.pipe(path).result for path in self.test_set_paths]
        with open('algorithmia_testing_responses', 'wb') as handle:
            pickle.dump(testing_responses, handle)
        self.calls_to_api += len(testing_responses)
        return [
            [entry['confidence'] for entry in response['results'][0]['gender']]
            for response in testing_responses
        ]

    def Clarifai_query(self, api_key=None, dataset=None):
        """Label the newest data with Clarifai's ``demographics`` model.

        On the first generation the held-out test set and the initial sample
        are queried; on later generations only the files written by the last
        ``jacobian_augmentation``. Pictures the oracle cannot label are removed
        from the corresponding dataset. Decrements ``gens_to_go``.

        Args:
            api_key: Clarifai API key. When omitted, a module-level variable
                named ``api_key`` is used if one exists.
            dataset: Accepted for backward compatibility; currently unused.

        Raises:
            ValueError: If no API key can be determined.
        """
        print('Querying Oracle')
        if api_key is None:
            # Resolved at call time so that merely defining the class does not
            # require a global ``api_key`` to exist.
            api_key = globals().get('api_key')
        if api_key is None:
            raise ValueError('Clarifai_query requires an api_key')

        app = ClarifaiApp(api_key=api_key)
        clarifai = app.models.get('demographics')
        first_generation = self.gens_to_go == self.init_gens

        # Test-set oracle predictions: queried once only, otherwise every
        # generation would spend budget on duplicates.
        if first_generation:
            test_responses = self._query_oracle(clarifai, self.test_set_paths)
            failed = self.extract_values(test_responses, train_or_test='test')
            self.deal_with_problems(failed, 'test')

        # Train-set oracle predictions.
        filepaths = self.init_pics_paths if first_generation else self.newest_file_paths
        training_responses = self._query_oracle(clarifai, filepaths)
        failed = self.extract_values(training_responses)
        replacements = self.deal_with_problems(failed, 'train')
        if replacements:
            self._add_replacements(clarifai, replacements)
        self.gens_to_go -= 1

    def extract_values(self, responses, train_or_test='train'):
        """Append the gender confidences found in *responses* to the predictions.

        Results go to ``self.predictions`` for ``'train'`` and to
        ``self.test_predictions`` otherwise.

        NOTE(review): values are stored in the order the API returns the
        concepts; presumably that order is fixed per class - confirm.

        Returns:
            Indices (into *responses*) of responses lacking the expected
            structure, e.g. because no face region was reported.
        """
        sink = self.predictions if train_or_test == 'train' else self.test_predictions
        problems = []
        for index, response in enumerate(responses):
            try:
                concepts = (response['outputs'][0]['data']['regions'][0]['data']
                            ['face']['gender_appearance']['concepts'])
                values = [concept['value'] for concept in concepts]
            except (KeyError, IndexError, TypeError):
                problems.append(index)
            else:
                sink.append(values)
        return problems

    def deal_with_problems(self, problems, train_or_test='test'):
        """Drop the samples the oracle could not label and pick replacements.

        Args:
            problems: Indices into the batch that was just queried.
            train_or_test: ``'train'`` for the training batch; any other value
                is treated as the test split.

        Returns:
            File names (relative to ``folder``) of replacement pictures copied
            into the working directory. Non-empty only for the first training
            generation; the caller is expected to query the oracle for them.
        """
        # Descending order so that deleting by index stays valid.
        problems = sorted(set(problems), reverse=True)

        if train_or_test != 'train':
            if problems:
                for index in problems:
                    del self.test_set_paths[index]
                self.test_dataset = np.delete(self.test_dataset, problems, axis=0)
                self.val_target = np.delete(self.val_target, problems, axis=0)
            return []

        first_generation = self.init_gens == self.gens_to_go
        filepaths = self.init_pics_paths if first_generation else self.newest_file_paths
        # The queried batch is the tail of cnn_dataset; earlier rows are
        # already labelled and must be left alone.
        offset = len(self.cnn_dataset) - len(filepaths)
        if problems:
            for index in problems:
                del filepaths[index]
            self.cnn_dataset = np.delete(
                self.cnn_dataset, [offset + index for index in problems], axis=0)
        if not first_generation:
            return []

        if problems:
            self.target = np.delete(self.target, problems, axis=0)
            for index in problems:
                del self.init_files[index]

        # NOTE(review): budget formula kept from the original code; it is often
        # negative (=> no replacements). TODO confirm the intended formula.
        max_replacements = self.num_api_calls - self.calls_to_api*self.num_classes**self.init_gens
        num_replacements = max(0, min(int(max_replacements), len(problems), len(self.unused_pics)))
        replacements = random.sample(self.unused_pics, k=num_replacements)
        taken = set(replacements)
        self.unused_pics = [name for name in self.unused_pics if name not in taken]
        self.other_pics = self.unused_pics
        for pic in replacements:
            shutil.copy(os.path.join(self.folder, pic), self.dir_path)
        return replacements

    # --------------------------------------------------------------- training

    def train_nn(self,model='default', data='default', predictions='default', epochs=10):
        """Fit the substitute model on oracle predictions.

        Args:
            model: Keras model; ``'default'`` uses ``self.model``.
            data: Input array; ``'default'`` uses ``self.cnn_dataset``.
            predictions: Oracle outputs; ``'default'`` uses ``self.predictions``.
            epochs: Number of training epochs.

        On the final generation (``gens_to_go == 0``) the best weights are
        checkpointed to ``weights.best.hdf5`` and the API-call count is printed.

        Returns:
            The ``(data, predictions)`` pair that was trained on.
        """
        if self._is_default(data):
            data = self.cnn_dataset
        if self._is_default(predictions):
            predictions = self.predictions
        if self._is_default(model):
            model = self.model
        labels = np.array(predictions)

        if self.gens_to_go == 0:
            # NOTE(review): 'val_acc' is the metric name used by older Keras
            # releases - confirm against the installed version.
            checkpoint = ModelCheckpoint('weights.best.hdf5',
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='max')
            # Validate on the leading rows, i.e. the original (non-synthetic)
            # pictures.
            model.fit(
                data, labels,
                epochs=epochs,
                batch_size=256,
                validation_data=(data[:self.init_sample_size],
                                 labels[:self.init_sample_size]),
                callbacks=[checkpoint])
            print(self.calls_to_api)
            return data, predictions

        # Train CNN on dataset with oracle predictions = y
        fit_kwargs = {'epochs': epochs}
        batch_size = self._batch_size_for(len(predictions))
        if batch_size is not None:
            fit_kwargs['batch_size'] = batch_size
        model.fit(data, labels, **fit_kwargs)
        return data, predictions

    def jacobian_augmentation(self, dataset='default',predictions='default',model='default', lmbda=0.1):
        """Double the dataset with Jacobian-based synthetic samples.

        For each sample ``x`` with oracle label ``c`` a new point
        ``x + lmbda * sign(d model(x)[c] / dx)`` is created. ``self.cnn_dataset``
        becomes the originals followed by the new points, and each new point is
        saved as ``<dir_path>/<row index>.jpg`` for the next oracle query.

        Args:
            dataset: Input array; ``'default'`` uses ``self.cnn_dataset``.
            predictions: Oracle outputs aligned with *dataset*; ``'default'``
                uses ``self.predictions``.
            model: Keras model; ``'default'`` uses ``self.model``.
            lmbda: Step size of the perturbation.

        Raises:
            ValueError: If there are fewer predictions than samples.
        """
        self.newest_file_paths = []
        if self._is_default(dataset):
            dataset = self.cnn_dataset
        if self._is_default(model):
            model = self.model
        if self._is_default(predictions):
            predictions = self.predictions
        n_preds = len(dataset)
        if len(predictions) < n_preds:
            raise ValueError('got %d predictions for %d samples'
                             % (len(predictions), n_preds))

        # Reuse the Keras session as-is: re-running the variable initialiser
        # here would wipe the weights that were just trained.
        session = k.get_session()
        grads = []
        for gradient in tqdm.tqdm(self._class_gradients(model)):
            grads.append(session.run(tf.sign(gradient), feed_dict={model.input: dataset}))

        # Oracle label = most confident class.
        labels = [int(np.argmax(row)) for row in predictions[:n_preds]]
        synth = np.vstack([dataset, dataset])
        for ind, label in tqdm.tqdm(enumerate(labels)):
            synth[n_preds+ind] += lmbda * grads[label][ind]
        self.cnn_dataset = synth

        print('Saving new data points')
        # NOTE(review): the arrays are saved in their preprocessed form, so the
        # files the oracle sees differ from the original colour space - confirm
        # this is intended.
        for ind, new_point in tqdm.tqdm(enumerate(synth[n_preds:])):
            path = self.dir_path+'/'+str(n_preds+ind)+'.jpg'
            self.all_generated_filepaths.append(path)
            self.newest_file_paths.append(path)
            array_to_img(new_point).save(path)

    def Ditto_I_Choose_You(self):
        """Run the whole pipeline: sample data, build the model, then iterate.

        Iterates query -> train -> augment until either all generations are
        done or the next query would exceed the API budget.
        """
        print('Calls to API: ',self.calls_to_api)
        self.generate_cnn_dataset()
        self.instantiate_nn()
        while (self.gens_to_go > 0
               and self.calls_to_api+0.5*len(self.cnn_dataset) < self.num_api_calls):
            self.Clarifai_query()
            self.train_nn()
            if self.gens_to_go>0:
                self.jacobian_augmentation()

    def YYY(self, new_datapoint_path):
        """Outputs variable importance.

        Loads the picture, takes the gradient of the predicted class with
        respect to the input, multiplies it by the input and sums over the
        colour channels.

        Returns:
            The matplotlib Axes produced by ``seaborn.heatmap`` for the
            resulting 224x224 importance map.
        """
        arr = self._load_images([new_datapoint_path])
        yhat = self.model.predict(arr)
        classification = int(np.argmax(yhat[0]))
        gradient = self._class_gradients(self.model)[classification]
        grad_values = k.get_session().run(gradient, feed_dict={self.model.input: arr})
        matrix = (grad_values * arr).sum(axis=-1)[0]
        return sns.heatmap(matrix)
        