In [13]:
# Cell-width control: widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Imports

In [14]:
#packages
import numpy
import tensorflow as tf
from tensorflow.core.example import example_pb2

#utils
import os
import random
import pickle
import struct
import time
from generators import *

#keras
import keras
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout, Activation, Concatenate, Dot, Embedding, LSTM, Conv1D, MaxPooling1D, Input, Lambda
    #callbacks
from keras.callbacks import TensorBoard, ModelCheckpoint, Callback


In [21]:
# Load the pickled statistics (indexed below as stats[model_name][threshold][data_type])
# and dump them for inspection.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('stats-4.pickle', 'rb') as handle:
    stats = pickle.load(handle)
print(stats)

{'tw_on_pseudorandom': {0.1: {'on_clean': 0.98946726, 'on_pseudo': 0.6665215790271759, 'on_generator': 0.1353875994682312, 'on_uniform': 0.9892061278223991}, 0.3: {'on_clean': 0.96805364, 'on_pseudo': 0.7550487518310547, 'on_generator': 0.2517458200454712, 'on_uniform': 1.0}, 0.5: {'on_clean': 0.94246167, 'on_pseudo': 0.8044916391372681, 'on_generator': 0.3447974920272827, 'on_uniform': 0.9996518105908763}, 0.6: {'on_clean': 0.92583567, 'on_pseudo': 0.827907383441925, 'on_generator': 0.3997032046318054, 'on_uniform': 1.0}, 0.7: {'on_clean': 0.9045961, 'on_pseudo': 0.8512360751628876, 'on_generator': 0.45975905656814575, 'on_uniform': 1.0}, 0.75: {'on_clean': 0.8904944, 'on_pseudo': 0.8634227067232132, 'on_generator': 0.49773043394088745, 'on_uniform': 1.0}, 0.8: {'on_clean': 0.87299794, 'on_pseudo': 0.8783077970147133, 'on_generator': 0.5467004179954529, 'on_uniform': 1.0}, 0.81: {'on_clean': 0.8680362, 'on_pseudo': 0.8821378797292709, 'on_generator': 0.5553421676158905, 'on_uniform': 

In [24]:
def get_f1(clean_data, pseudo_data, generator_data, uniform_data):
    """Compute a true-positive rate and a true-negative rate per entry.

    Despite its name, this function does not compute F1; it returns the TPR
    and TNR lists.

    The four arguments are indexable sequences read in parallel, one position
    per entry of ``clean_data``. For each position the clean value is used as
    the true-positive count (out of 1) and the sum of the pseudo, generator
    and uniform values as the true-negative count (out of 3). A constant of
    1e-10 is added to every count.

    Returns:
        tuple: ``(tpr_data, tnr_data)``, two lists with ``len(clean_data)``
        entries each.

    Raises:
        IndexError: if any of the other sequences is shorter than
        ``clean_data``.
    """
    eps = 1e-10
    tpr_data, tnr_data = [], []
    for idx, clean_value in enumerate(clean_data):
        pseudo_value = pseudo_data[idx]
        generator_value = generator_data[idx]
        uniform_value = uniform_data[idx]

        # Confusion-matrix style counts; the evaluation order of each sum is
        # kept fixed so floating-point results do not change.
        true_pos = clean_value + eps
        true_neg = pseudo_value + generator_value + uniform_value + eps
        false_neg = 1 - clean_value + eps
        false_pos = 3 - pseudo_value - generator_value - uniform_value + eps

        tpr_data.append(true_pos / (true_pos + false_neg))
        tnr_data.append(true_neg / (true_neg + false_pos))
    return tpr_data, tnr_data

# Collect, for one model, the per-threshold values recorded for each data type.
model_name = 'tw_on_pseudorandom'
clean_data = []
pseudo_data = []
generator_data = []
uniform_data = []

thresholds_data = []

# Walk the thresholds in ascending order so that every list stays
# index-aligned with thresholds_data.
for threshold in sorted(stats[model_name].keys()):
    scores = stats[model_name][threshold]
    thresholds_data.append(threshold)
    # Index each key directly: a missing entry raises KeyError here instead of
    # silently leaving one list shorter than thresholds_data.
    clean_data.append(scores['on_clean'])
    pseudo_data.append(scores['on_pseudo'])
    generator_data.append(scores['on_generator'])
    uniform_data.append(scores['on_uniform'])

# Despite its name, get_f1 returns (TPR list, TNR list), one entry per threshold.
tpr_data_twd, tnr_data_twd = get_f1(clean_data, pseudo_data, generator_data, uniform_data)

In [27]:
# One row per threshold; columns are: threshold, 1 - TNR, 1 - TPR.
# column_stack turns each 1-D array into a column before joining them.
results = numpy.column_stack((
    numpy.array(thresholds_data),
    1 - numpy.array(tnr_data_twd),
    1 - numpy.array(tpr_data_twd),
))

In [28]:
# Show the table built above: threshold, 1 - TNR, 1 - TPR (one row per threshold).
print(results)

[[1.00000000e-01 4.02961565e-01 1.05327369e-02]
 [3.00000000e-01 3.31068476e-01 3.19463612e-02]
 [5.00000000e-01 2.83686353e-01 5.75383306e-02]
 [6.00000000e-01 2.57463137e-01 7.41643310e-02]
 [7.00000000e-01 2.29668289e-01 9.54039098e-02]
 [7.50000000e-01 2.12948953e-01 1.09505594e-01]
 [8.00000000e-01 1.91663928e-01 1.27002060e-01]
 [8.10000000e-01 1.87506651e-01 1.31963790e-01]
 [8.20000000e-01 1.82884298e-01 1.38144136e-01]
 [8.30000000e-01 1.78347712e-01 1.43976331e-01]
 [8.40000000e-01 1.73317281e-01 1.51201248e-01]
 [8.50000000e-01 1.68432904e-01 1.58339143e-01]
 [8.60000000e-01 1.62966261e-01 1.66434527e-01]
 [8.70000000e-01 1.57411769e-01 1.74965203e-01]
 [8.80000000e-01 1.50577900e-01 1.86977744e-01]
 [8.90000000e-01 1.43743547e-01 1.99425459e-01]
 [9.00000000e-01 1.36328310e-01 2.12047339e-01]
 [9.10000000e-01 1.27574871e-01 2.25278556e-01]
 [9.20000000e-01 1.19170497e-01 2.44690120e-01]
 [9.30000000e-01 1.09720357e-01 2.65233278e-01]
 [9.40000000e-01 9.83490832e-02 2.983983

# CPU usage

In [15]:
# Hide every CUDA device from this process (this section is the CPU-usage setup).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",          # empty string: no device is listed as visible
})

# Global parameters

In [29]:
# Embedding
# NOTE(review): max_features matches the "400000" directory used for the
# preprocessing artefacts below; presumably the vocabulary size, TODO confirm.
max_features = 400000
# maxlen_text / maxlen_summ / embedding_size define the two input shapes handed
# to the batch generator: (maxlen_text, embedding_size) and (maxlen_summ, embedding_size).
maxlen_text = 400
maxlen_summ = 80
# The trailing 128 is an alternative value left by the author.
embedding_size = 100 #128

# Convolution
# NOTE(review): the convolution and LSTM settings are not referenced by the
# evaluation code visible in this notebook section; presumably they describe
# the architecture of the model being loaded, TODO confirm.
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
# batch_size is forwarded to the batch generator; epochs is not used in the
# evaluation code visible here.
batch_size = 32
epochs = 5

In [30]:
# Settings forwarded to model.predict_generator (use_multiprocessing, workers)
# and to the batch generator (shuffle).
use_multiprocessing = True
workers = 4
shuffle = False

# Machine-specific absolute path prefixes. The commented-out lines are
# alternative locations; exactly one model_path_prefix and one
# data_path_prefix assignment should be active.
# model_path_prefix is prepended to the model file path; data_path_prefix is
# prepended to each output directory that gets evaluated.
model_path_prefix = '/home/donald/documents/MT/implementation-and-experiments/'
#model_path_prefix = '/home/oala/Documents/MT/implementation-experiments/'
#data_path_prefix = '/mnt/disks/500gb/experimental-data-mini/experimental-data-mini/'
#data_path_prefix = '/media/oala/4TB/experimental-data/'
data_path_prefix = '/mnt/disks/500gb/experimental-data-mini/'

In [31]:
# Load the preprocessing artefacts: tokenizer, embedding matrix and the
# mini/maxi arrays from the training statistics.
processing_dir = '/mnt/disks/500gb/stats-and-meta-data/400000/'
#processing_dir = '/media/oala/4TB/experimental-data/stats-and-meta-data/400000/'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(processing_dir+'tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

embedding_matrix = numpy.load(processing_dir+'embedding_matrix.npy')

# Training statistics, bundled with a sampling function for the batch generator.
# NOTE(review): presumably mini/maxi are the lower/upper bounds passed to
# numpy.random.uniform inside the generator; confirm in generators.py.
maxi = numpy.load(processing_dir+'training-stats-all/maxi.npy')
mini = numpy.load(processing_dir+'training-stats-all/mini.npy')
sample_info = (numpy.random.uniform, mini, maxi)

In [32]:
# Directories (relative to data_path_prefix) whose test partitions are scored,
# one result line per directory.
list_of_outputs = ['output-scoring/baseline/', 'output-scoring/pg/', 'output-scoring/pgc/','output-scoring/sumgan/']
# Saved Keras model that produces the predictions.
model_path = model_path_prefix + 'exciting-crazy/experiments/tw-on-pseudorandom/1/best.h5'
# Predictions below this value are mapped to 0, all others to 1.
threshold = 0.97

In [33]:
# For every directory in list_of_outputs: run the loaded model over that
# directory's test partition and print the fraction of predictions that
# reach `threshold`.
model = load_model(model_path)

for output_suffix in list_of_outputs:
    # Each output directory provides its own partition and labels pickles.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    data_dir = data_path_prefix + output_suffix
    with open(data_dir+'partition.pickle', 'rb') as handle:
        partition = pickle.load(handle)
    with open(data_dir+'labels.pickle', 'rb') as handle:
        labels = pickle.load(handle)

    #batch generator parameters
    params = {'dim': [(maxlen_text,embedding_size),(maxlen_summ,embedding_size)],
              'batch_size': batch_size,
              'shuffle': shuffle,
             'tokenizer':tokenizer,
             'embedding_matrix':embedding_matrix,
             'maxlen_text':maxlen_text,
             'maxlen_summ':maxlen_summ,
             'data_dir':data_dir,
             'sample_info':sample_info}
    #generators
    test_generator = ContAllGenerator(partition['test'], labels, **params)

    # Predict (not train) on the test partition of this directory.
    preds = model.predict_generator(generator=test_generator,
                        use_multiprocessing=use_multiprocessing,
                        workers=workers)

    # Binarise in a single comparison: 1 where the prediction reaches the
    # threshold, 0 otherwise. The previous two-step in-place masking left a raw
    # 0.0 at 0 when threshold <= 0 and mapped NaN to 1. Casting back to the
    # original dtype keeps the printed mean in the same precision as before.
    preds = (preds >= threshold).astype(preds.dtype)

    print(output_suffix+': '+str(numpy.mean(preds)))

output-scoring/baseline/: 0.2956128
output-scoring/pg/: 0.18114555
output-scoring/pgc/: 0.108461
output-scoring/sumgan/: 0.1529422
