In [1]:
# Cell-width control: widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
%%javascript
// Replace the output area's _should_scroll check with a stub that always
// returns false (presumably so that long cell outputs are never collapsed into
// a scroll box -- confirm against the notebook front-end in use).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
# Experiment identifiers for this run: noise comes from the random DGP, two-way
# model, clean and noise data not separated, smaller dictionary (50000 words).
# Both strings are interpolated into the file names of the saved/loaded model
# and tokenizer further down in the notebook.
model_class = "model_class_5"
model_num = "model_1"

In [3]:
#packages
import numpy
import tensorflow as tf
from tensorflow.core.example import example_pb2

#utils
import os
import random
import pickle
import struct
import time
from noise import *

#keras
import keras
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense, Dropout, Activation, Concatenate, Dot, Embedding, LSTM, Conv1D, MaxPooling1D, Input


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# CPU usage

In [4]:
# Order CUDA devices by PCI bus id, then expose none of them to this process:
# an empty CUDA_VISIBLE_DEVICES hides every GPU, which matches the "CPU usage"
# heading of this section.
# NOTE(review): the tracker that "issue #152" refers to is not stated here.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Utils

In [5]:
# Seed NumPy's global RNG; get_data and prep_data below draw their samples and
# permutations from numpy.random.
numpy.random.seed(47)

def _read_text_summ_pairs(filename):
    """Read alternating text/summary lines from `filename` into an Nx2 object array.

    The file is consumed two lines at a time (text line, then summary line).
    A pair is kept only if both raw lines are longer than 2 characters; the
    trailing newline is removed from each kept line.
    """
    text_summ_pairs = []
    with open(filename, 'r') as data:
        text = data.readline()
        summ = data.readline()
        while summ:
            # the length filter is applied to the raw lines (newline included)
            if len(text) > 2 and len(summ) > 2:
                # strip only the newline, so a final line without one keeps its last character
                text_summ_pairs.append([text.rstrip('\n'), summ.rstrip('\n')])
            text = data.readline()
            summ = data.readline()
    return numpy.array(text_summ_pairs, dtype=object)


def _strip_sentence_tags(clean_2d):
    """Remove the '<s> ' and ' </s>' markers from the summary column of `clean_2d`, in place."""
    for i in range(clean_2d.shape[0]):
        clean_2d[i, 1] = clean_2d[i, 1].replace('<s> ', '').replace(' </s>', '')


def _bad_candidate_indices(candidate_names, band_width):
    """Return the set of file numbers that do not have exactly `band_width` candidate files.

    The file number is the integer in the first six characters of each file name.
    """
    from collections import Counter
    files_per_number = Counter(name[0:6] for name in candidate_names)
    return {int(prefix) for prefix, count in files_per_number.items() if count != band_width}


def get_data(filename, nc_dist, replace, corr_sample, separate, band_width, noise_candidate_path, dgp):
    """
    Build a clean and a noise set of (text, summ) pairs.

    Args:
        filename (string): path to data file holding clean datapoints, i.e. clean (text, summ) pairs
        nc_dist ((float, float)): (clean_ratio, noise_ratio) tuple describing the desired noise-clean
                                    distribution in the output dataset, sum(nc_dist) = 1
        replace (bool): whether or not to sample the noise candidates with replacement
        corr_sample (bool): whether to print some of the orig summs next to the noise summs to check for
                            correspondence (only used for dgp == "generator")
        separate (bool): whether to generate noise from a set of texts that is disjoint from the clean data
        band_width (int): number of outputs of G per text (only used for dgp == "generator")
        noise_candidate_path (string): directory where the generated noise files are stored
                                       (only read for dgp == "generator")
        dgp (string): which DGP is to be used for noise, "generator" or "random"
    Returns:
        data ((numpy array, numpy array)): (clean, noise) tuple of Nx2 arrays of (text, summ) datapoints
    Raises:
        ValueError: if dgp is neither "generator" nor "random"
    """
    if dgp == "generator":
        # one sorted listing serves both the completeness check and the read below
        candidate_names = sorted(os.listdir(noise_candidate_path))

        #get all the bad indices, i.e. file numbers without exactly band_width candidate files
        print('Get all the bad indices...')
        bad_indices = _bad_candidate_indices(candidate_names, band_width)
        print('...done!')

        #read in clean data
        print('Reading clean data...')
        clean_2d = _read_text_summ_pairs(filename)
        print('...done!')

        #remove bad indices (the file number is used as row index into the clean data)
        print('Remove bad indices from clean data...')
        mask = numpy.ones(clean_2d.shape[0], dtype='bool')
        mask[numpy.array(sorted(bad_indices), dtype=int)] = False
        clean_2d = clean_2d[mask]
        N_clean = clean_2d.shape[0]
        print('...done!')

        #pick indices of noise
        print('Pick noise indices...')
        clean_ratio, noise_ratio = nc_dist
        if separate: #generate noise from texts disjoint from clean data
            N_noise = int((N_clean*noise_ratio)) #calculate N_noise
            #rows reserved for noise; they are deleted from the clean data further down
            noise_separate_indices = numpy.random.choice(N_clean, size = N_noise, replace=False)
            #candidate k belongs to row k // band_width, so row r owns candidates
            #r*band_width ... r*band_width + band_width - 1
            noise_index_pool = numpy.concatenate(
                [noise_separate_indices*band_width + i for i in range(band_width)])
            assert noise_index_pool.shape[0] == N_noise*band_width, "noise index pool smaller than expected"
            assert (abs(((N_clean - N_noise)/N_clean) - clean_ratio) < 0.0001
                    and abs((N_noise/N_clean) - noise_ratio) < 0.0001), \
                "Something is wrong with N_noise" #check that you calculated N_noise correctly
        else:
            N_noise = int((N_clean - N_clean*clean_ratio)/clean_ratio) #calculate N_noise
            noise_index_pool = numpy.arange(N_clean*band_width)
            assert (abs(N_clean/(N_clean + N_noise) - clean_ratio) < 0.0001
                    and abs(N_noise/(N_clean + N_noise) - noise_ratio) < 0.0001), \
                "Something is wrong with N_noise" #check that you calculated N_noise correctly
        #indices into the candidate list, i.e. in the range of N_clean*band_width
        noise_summ_indices = numpy.random.choice(noise_index_pool, size=N_noise, replace=replace)
        assert N_noise == len(noise_summ_indices), "N_noise and len(selected_indices do not match)"
        print('...done!')

        #read in candidate noise points
        print('Read in candidate noise points...')
        candidate_noise = []
        for candidate_name in candidate_names:
            if int(candidate_name[0:6]) not in bad_indices:
                with open(os.path.join(noise_candidate_path, candidate_name), 'r') as file:
                    candidate_noise.append(file.read().replace('\n', ' ')) #read file, trim \n and add to cand. list
        assert len(candidate_noise) == band_width*clean_2d.shape[0], "less candidates than expected"
        print('...done!')

        #preprocess clean data, i.e. remove <s> and </s>
        print('Preprocess clean data, i.e. remove <s> and </s>...')
        _strip_sentence_tags(clean_2d)
        print('...done!')

        if corr_sample: #take some samples to sanity check that refs and generated summs correspond
            print('Sanity check some examples..')
            #never ask for more samples than there are noise points
            idx = numpy.random.choice(N_noise, size=min(10, N_noise), replace=False)
            for i in idx:
                candidate_index = noise_summ_indices[i]
                print('### '+str(i)+' ###')
                print('reference summary '+':\n'+clean_2d[(candidate_index // band_width),1])
                print('generated summary'+':\n'+candidate_noise[candidate_index])
            print('...done!')

        #put data together
        print('Put data together...')
        noise_texts = clean_2d[(noise_summ_indices // band_width),0]
        print('noise_texts.shape[0]', noise_texts.shape[0])
        noise_summs = numpy.array(candidate_noise)[noise_summ_indices]
        print('noise_summs.shape[0]', noise_summs.shape[0])
        noise_2d = numpy.stack((noise_texts,noise_summs), axis=-1)
        assert noise_2d.shape[0] == N_noise and noise_2d.shape[1] == 2, "the noise_2d shape does not check out"
        print('...done!')

        #remove noise source texts from clean data if separate was selected
        if separate: #delete inputs for noise from clean data
            print('prior', clean_2d.shape)
            #noise_separate_indices was drawn without replacement, so each row is removed once
            clean_2d = numpy.delete(clean_2d, noise_separate_indices, axis=0)
            print('after', clean_2d.shape)

        return clean_2d, noise_2d

    elif dgp == 'random':
        #read in clean data
        print('Reading clean data...')
        clean_2d = _read_text_summ_pairs(filename)
        print('...done!')

        #create noise points with the random DGP
        #noise_randomDGP is not defined in this notebook (presumably provided by `from noise import *`)
        print('Read in candidate noise points...')
        fractions = {"switch-pairs":0.25,"sentence-switch-entire-bank":0.25,\
                     "sentence-switch-same-text-bank":0.25,"word-switch-entire-bank":0.25}
        clean_2d, noise_2d = noise_randomDGP(clean_2d, fractions, separate, nc_dist)
        print('...done!')

        #preprocess clean data, i.e. remove <s> and </s>
        print('Preprocess clean data, i.e. remove <s> and </s>...')
        _strip_sentence_tags(clean_2d)
        print('...done!')

        return clean_2d, noise_2d
    else:
        #callers unpack the result, so fail loudly instead of returning None
        raise ValueError('error: no valid DGP was selected')

def prep_data(clean_2d, noise_2d, max_features, val_share, maxlen_text, maxlen_summ,
              load_tok=False, tok_path=None):
    """
    Label, shuffle, split, tokenize and pad the clean and noise datapoints.

    Args:
        clean_2d (numpy array): Nx2 array of text summ tuples with clean points
        noise_2d (numpy array): Nx2 array of text summ tuples with noise points
        max_features (int): max number of words for tokenizer
        val_share (float): share (< 1) of data that is to be used for validation
        maxlen_text (int): max length of text after which to cut text
        maxlen_summ (int): max length of summ after which to cut summ
        load_tok (bool): whether to load tokenizer or train from scratch
        tok_path (string): path to stored tokenizer object, required if load_tok is True
    Returns:
        texts_train_seq (array): (N_clean+N_noise)*(1-val_share)xmaxlen_text array of seq text
        summs_train_seq (array): (N_clean+N_noise)*(1-val_share)xmaxlen_summ array of seq summ
        targets_train (array): labels of the training points, 0 = clean, 1 = noise
        texts_val_seq (array): (N_clean+N_noise)*(val_share)xmaxlen_text array of seq text
        summs_val_seq (array): (N_clean+N_noise)*(val_share)xmaxlen_summ array of seq summ
        targets_val (array): labels of the validation points, 0 = clean, 1 = noise
        tokenizer (tokenizer object): tokenizer object
    Raises:
        ValueError: if load_tok is True but no tok_path was given
    """
    #fail early and clearly instead of hitting open(None) after the data was already shuffled
    if load_tok and tok_path is None:
        raise ValueError("load_tok=True requires tok_path to point to a pickled tokenizer")

    #split texts and summs
    texts = numpy.append(clean_2d[:,0], noise_2d[:,0])
    summs = numpy.append(clean_2d[:,1], noise_2d[:,1])
    
    #get targets: clean points are labelled 0, noise points 1
    N_clean = clean_2d.shape[0]
    N_noise = noise_2d.shape[0]
    targets = numpy.append([0]*N_clean, [1]*N_noise)
    
    #permute targets and data in the same way
    indices = numpy.random.choice(N_clean+N_noise, size=N_clean+N_noise, replace=False)
    assert len(indices) == N_clean+N_noise, "indices are less N_clean + N_noise"
    texts = texts[indices]
    summs = summs[indices]
    targets = targets[indices]
    
    #split data into train and validation: the first `split` shuffled points are the validation set
    split = int((N_clean+N_noise)*val_share)
    texts_train = texts[split:]
    summs_train = summs[split:]
    targets_train = targets[split:]
    texts_val = texts[:split]
    summs_val = summs[:split]
    targets_val = targets[:split]
    print('train dist: ', numpy.mean(targets_train)) #just checking what the dists are after permute
    print('val dist: ', numpy.mean(targets_val)) #just checking what the dists are after permute
    
    #load or train tokenizer
    if load_tok:
        #NOTE: pickle.load can execute arbitrary code; only load tokenizer files you trust
        with open(tok_path, 'rb') as handle:
            tokenizer = pickle.load(handle)
    else:
        tokenizer = Tokenizer(num_words=max_features,
                                   filters='#$%&()*+-/:;<=>@[\\]^_{|}~\t\n',
                                   lower=True,
                                   split=" ",
                                   char_level=False)
        #the vocabulary is fitted on the training split only (texts and summs together)
        tokenizer.fit_on_texts(numpy.append(texts_train, summs_train))
    
    #sequentialize data
    texts_train_seq = tokenizer.texts_to_sequences(texts_train)
    summs_train_seq = tokenizer.texts_to_sequences(summs_train)
    texts_val_seq = tokenizer.texts_to_sequences(texts_val)
    summs_val_seq = tokenizer.texts_to_sequences(summs_val)
    
    #pad data
    texts_train_seq = sequence.pad_sequences(texts_train_seq, maxlen=maxlen_text)
    summs_train_seq = sequence.pad_sequences(summs_train_seq, maxlen=maxlen_summ)
    texts_val_seq = sequence.pad_sequences(texts_val_seq, maxlen=maxlen_text)
    summs_val_seq = sequence.pad_sequences(summs_val_seq, maxlen=maxlen_summ)
    
    return texts_train_seq, summs_train_seq, targets_train, texts_val_seq, summs_val_seq, targets_val, tokenizer

# Global parameters

In [6]:
# Embedding
# max_features is handed to prep_data, which uses it as the tokenizer's num_words
max_features = 50000
# lengths to which the text / summary sequences are padded or cut
maxlen_text = 400
maxlen_summ = 80
# width of the embedding matrix; it is filled with the vectors from
# glove.6B.100d.txt below, so it has to stay at 100 for that file
embedding_size = 100 #128

# Convolution
# arguments of the Conv1D and MaxPooling1D layers of both routes
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
# number of units of the LSTM layer in each route
lstm_output_size = 70

# Training
# passed to model.fit (batch_size is also reused for predict/evaluate)
batch_size = 30
epochs = 50

#Saving?
# when True, the model and the tokenizer are written to disk near the end of the notebook
save = True

# Load data

In [7]:
# Inputs for get_data. The commented-out pair of paths is an alternative
# (different machine / full training set) configuration.
#filename = "/home/oala/Documents/MT/data/datasets/finished_files/train.bin"
#noise_candidates_path = '/home/oala/Documents/MT/noising/4-beam-PGC-noise-on-train/pretrained_model_tf1.2.1/decode_train_400maxenc_4beam_35mindec_120maxdec_ckpt-238410/decoded/'
filename = "/home/donald/documents/MT/data/data-essentials-mini/finished_files/val.bin"
# blank placeholder: with dgp = "random" get_data takes the branch that reads no candidate files
noise_candidates_path = ' '
# (clean_ratio, noise_ratio)
nc_dist = (0.5,0.5)
replace = False
corr_sample = False
separate = False
band_width = 4
dgp = "random"
clean_2d, noise_2d = get_data(filename, nc_dist, replace, corr_sample, separate, band_width,noise_candidates_path, dgp)

Reading clean data...
...done!
Read in candidate noise points...
3342
(3342,)
(13368, 2)
(3342, 2)
...done!
Preprocess clean data, i.e. remove <s> and </s>...
...done!


In [8]:
print(clean_2d.shape)
print(noise_2d.shape)
print(clean_2d[100])
print(noise_2d[100])

(13368, 2)
(13368, 2)
 'marion bartoli asks whether she should make a comeback over twitter . she won wimbledon in 2013 but retired one month later due to injuries . bartoli won eight wta titles during a 13-year professional career .']
 'staffers at the south carolina aquarium are treating a rare , 475-pound leatherback sea turtle . the turtle washed up saturday on a nearby beach and may be returned to the ocean soon .']


In [9]:
val_share = 0.2

texts_train, summs_train, targets_train, texts_val, summs_val, targets_val, tokenizer = \
    prep_data(clean_2d, noise_2d, max_features, val_share, maxlen_text, maxlen_summ,
              load_tok=False, tok_path=None)

train dist:  0.4976857263079153
val dist:  0.509257527585562


# Load embeddings

In [10]:
#GLOVE_DIR = "/home/oala/Documents/MT/data/datasets/glove.6B/"
GLOVE_DIR = "/home/donald/documents/MT/data/data-essentials-mini/glove.6B/"

# Map every word of the GloVe file to its float32 vector. Each line holds the
# word followed by its coefficients, separated by whitespace.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the encoding is given
# explicitly so the result does not depend on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = numpy.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [11]:
word_index = tokenizer.word_index

In [12]:
embedding_matrix = numpy.zeros((len(word_index) + 1, embedding_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [13]:
#define custom embeddings layer
embedding_layer_text = Embedding(len(word_index) + 1,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length= maxlen_text,
                            trainable=False)

embedding_layer_summ = Embedding(len(word_index) + 1,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length= maxlen_summ,
                            trainable=False)

# Build model

In [14]:
# Two-route classifier: the text and the summary each pass through their own
# embedding -> dropout -> conv -> max-pool -> LSTM stack; the two LSTM outputs
# are then compared and mapped to one sigmoid output.

#2way input
# integer token-id sequences, padded to maxlen_text / maxlen_summ
text_input = Input(shape=(maxlen_text,), dtype='int32')
summ_input = Input(shape=(maxlen_summ,), dtype='int32')

#2way embeddings
# both embedding layers are initialised from the same embedding_matrix and are not trainable
text_route = embedding_layer_text(text_input)
summ_route = embedding_layer_summ(summ_input)

#2way dropout
text_route = Dropout(0.25)(text_route)
summ_route = Dropout(0.25)(summ_route)

#2way conv
text_route = Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1)(text_route)
summ_route = Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1)(summ_route)

#2way max pool
text_route = MaxPooling1D(pool_size=pool_size)(text_route)
summ_route = MaxPooling1D(pool_size=pool_size)(summ_route)

#2way lstm
# each route ends in a vector of lstm_output_size values
text_route = LSTM(lstm_output_size)(text_route)
summ_route = LSTM(lstm_output_size)(summ_route)

#merge both routes
# the commented-out lines are earlier variants that concatenated the two vectors;
# the active Dot layer with normalize=True L2-normalises both vectors first, so
# `merged` is their cosine similarity (one value per sample)
#merged = keras.layers.concatenate((text_route, summ_route), axis=-1)
#merged = Concatenate(axis=-1)([text_route, summ_route])
merged = Dot(axes=1,normalize=True)([text_route, summ_route])

#output
# the targets built in prep_data are 0 for clean and 1 for noise, so the sigmoid
# output is the predicted probability of "noise"
output = Dense(1, activation='sigmoid')(merged)

#define model
model = Model(inputs=[text_input, summ_input], outputs=[output])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train model

In [15]:
print('Train...')
model.fit([texts_train, summs_train], targets_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=([texts_val, summs_val], targets_val))

Train...
Train on 21389 samples, validate on 5347 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f3f4347c8d0>

# Interact with model

In [17]:
text_string = "gary d. cohn , president trump’s top economic adviser , said on tuesday that he would resign , becoming the latest in a series of high-profile departures from the trump administration . white house officials insisted that there was no single factor behind the departure of mr. cohn , who heads the national economic council . but his decision to leave came as he seemed poised to lose an internal struggle over mr. trump’s plan to impose large tariffs on steel and aluminum imports. mr. cohn had warned last week that he might resign if mr. trump followed through with the tariffs, which mr. cohn had lobbied against internally .  “ gary has been my chief economic adviser and did a superb job in driving our agenda , helping to deliver historic tax cuts and reforms and unleashing the american economy once again , ” mr. trump said in a statement to the new york times . “ he is a rare talent , and i thank him for his dedicated service to the american people . ”  mr. cohn is expected to leave in the coming weeks. he will join a string of recent departures by senior white house officials, including mr. trump’s communications director and a powerful staff secretary.  yet the departure of mr. cohn , a free-trade-oriented democrat who fended off a number of nationalist-minded policies during his year in the trump administration , could have a ripple effect on the president’s economic decisions and on the financial industry .  it leaves mr. trump surrounded primarily by advisers with strong protectionist views who advocate the types of aggressive trade measures , like tariffs , that mr. trump campaigned on but that mr. cohn fought inside the white house . mr. cohn was viewed by republican lawmakers as the steady hand who could prevent mr. trump from engaging in activities that could trigger a trade war.  even the mere threat , last august , that mr. cohn might leave sent the financial markets tumbling. on tuesday , mr. 
cohn’s announcement rattled markets , and trading in futures pointed to a decline in the united states stock market when it opened on wednesday .  in a statement , mr. cohn said he had been pleased to work on “pro-growth economic policies to benefit the american people , in particular the passage of historic tax reform . ” white house officials said that mr. cohn was leaving on cordial terms with the president and that they planned to discuss policy even after his departure .  mr. cohn’s departure comes as the white house has been buffeted by turnover , uncertainty and internal divisions and as the president lashes out at the special counsel investigation that seems to be bearing down on his team .  a host of top aides have been streaming out the white house door or are considering a departure . rob porter , the white house staff secretary and a member of the inner circle , resigned after spousal abuse allegations . hope hicks , the president’s communications director and confidante , announced that she would leave soon . in recent days , the president has lost a speechwriter , an associate attorney general and the north korea negotiator .  others are perpetually seen as on the way out . john f. kelly , the chief of staff , at one point broached resigning over the handling of mr. porter’s case . lt. gen. h. r. mcmaster , the national security adviser , has been reported to be preparing to leave . and many officials wonder if jared kushner , the president’s son-in-law and senior adviser , will stay now that he has lost his top-secret security clearance; the departure of mr. cohn further shrinks the number of allies mr. kushner and his wife , ivanka trump , have in the white house .  more than one in three top white house officials left by the end of mr. trump’s first year and fewer than half of the 12 positions closest to the president are still occupied by the same people as when he came into office , according to a brookings institution study .  mr. 
cohn’s departure will bring the turnover number to 43 percent , according to updated figures compiled by kathryn dunn tenpas of the brookings institution .  for all the swings of the west wing revolving door over the last year , mr. cohn’s decision to leave struck a different chord for people . he is among the most senior officials to resign to date ."

In [18]:
# Two candidate summaries for text_string. The second assignment overwrites the
# first, so the cells below score the second one; comment it out to score the first.
summ_string = "gary d. cohn , president trump’s top economic adviser , said on tuesday that he would resign . more than one in three top white house officials left by the end of mr. trump’s first year ."
summ_string = "angela merkel visited president trump last wednesday . they talked about the iran deal and trade . president trump will travel on to europe next week ."

In [19]:
predict_text = sequence.pad_sequences(tokenizer.texts_to_sequences([text_string]), maxlen=maxlen_text)
predict_summ = sequence.pad_sequences(tokenizer.texts_to_sequences([summ_string]), maxlen=maxlen_summ)

In [20]:
print(model.predict([predict_text, predict_summ], batch_size=1))

[[0.10397304]]


In [21]:
numpy.mean(targets_val)

0.5006487004223084

# Filter outputs

In [88]:
# Reload a pickled tokenizer from tok_path.
# NOTE(review): in the visible part of the notebook tok_path is only assigned in
# the "Test model on actual testing data" section further down, so this cell
# fails with a NameError on a fresh top-to-bottom run.
# pickle.load can execute arbitrary code; only load tokenizer files you trust.
with open(tok_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [91]:
from os import scandir, listdir
#read in article texts, baseline summs, m1 summs, m2 summs and ref summs

article_dir = "/home/oala/Documents/MT/data/datasets/finished_files/test_output/articles/"
reference_dir = "/home/oala/Documents/MT/data/datasets/finished_files/test_output/reference/"
baseline_dir = "/home/oala/Documents/MT/data/datasets/finished_files/test_output/baseline/"
pointergen_dir = "/home/oala/Documents/MT/data/datasets/finished_files/test_output/pointer-gen/"
pointergencov_dir = "/home/oala/Documents/MT/data/datasets/finished_files/test_output/pointer-gen-cov/"

article_files = listdir(article_dir)
article_files.sort()
reference_files = listdir(reference_dir)
reference_files.sort()
baseline_files = listdir(baseline_dir)
baseline_files.sort()
pointergen_files = listdir(pointergen_dir)
pointergen_files.sort()
pointergencov_files = listdir(pointergencov_dir)
pointergencov_files.sort()

#read in texts
texts = []
for txt_file in article_files:
    with open(article_dir+txt_file,'r',encoding='utf-8', errors='ignore') as txt:
        text = txt.read()
        text = text.replace('(', '-lrb-')
        text = text.replace(')', '-rrb-')
        text = text.replace('[', '-lsb-')
        text = text.replace(']', '-rsb-')
        text = text.replace('{', '-lcb-')
        text = text.replace('}', '-rcb-')
        texts.append(text)
texts = numpy.array(texts)
texts_text = numpy.copy(texts)
texts = tokenizer.texts_to_sequences(texts)
texts = sequence.pad_sequences(texts, maxlen=maxlen_text)

#helper functions for summs
def summ_dir2array(dir_name, file_list):
    """
    Load one summary per file and turn the summaries into padded token sequences.

    Args:
        dir_name (string): directory prefix that is prepended (by plain string
                           concatenation) to every file name
        file_list (list of string): names of the summary files, in the order to read them
    Returns:
        summs (array): summaries as sequences from the global `tokenizer`, padded to `maxlen_summ`
        summs_text (numpy array): the summaries as strings, with newlines replaced by spaces and
                                  brackets replaced by -lrb-/-rrb-/-lsb-/-rsb-/-lcb-/-rcb-
    """
    #replacements applied, in this order, to every line that is read
    substitutions = (('\n', ' '),
                     ('(', '-lrb-'), (')', '-rrb-'),
                     ('[', '-lsb-'), (']', '-rsb-'),
                     ('{', '-lcb-'), ('}', '-rcb-'))

    collected = []
    for file_name in file_list:
        with open(dir_name+file_name,'r',encoding='utf-8', errors='ignore') as handle:
            pieces = []
            for raw_line in handle:
                for old, new in substitutions:
                    raw_line = raw_line.replace(old, new)
                pieces.append(raw_line)
        collected.append("".join(pieces))

    summs_text = numpy.array(collected)
    sequences = tokenizer.texts_to_sequences(summs_text)
    padded = sequence.pad_sequences(sequences, maxlen=maxlen_summ)

    return padded, summs_text

#reference summs
reference_summs, reference_summs_text = summ_dir2array(reference_dir, reference_files)

#baseline summs
baseline_summs, baseline_summs_text = summ_dir2array(baseline_dir, baseline_files)

#pointergen summs
pointergen_summs, pointergen_summs_text = summ_dir2array(pointergen_dir, pointergen_files)

#pointergencov summs
pointergencov_summs, pointergencov_summs_text = summ_dir2array(pointergencov_dir, pointergencov_files)

N = len(texts_text)

In [92]:
#reference
# score every (article, reference summary) pair with the trained model
reference_preds = model.predict([texts, reference_summs], batch_size=batch_size, verbose=1)
# evaluate against all-zero labels: prep_data labels clean pairs 0 and noise
# pairs 1, so this treats every reference summary as clean
print(model.evaluate([texts, reference_summs],[0]*texts.shape[0], batch_size=batch_size))

# threshold the predictions at 0.5 into hard 0/1 decisions
reference_preds_flat = numpy.ndarray.flatten(reference_preds)
reference_preds_onehot = numpy.copy(reference_preds_flat)
reference_preds_onehot[reference_preds_onehot<0.5]=0
reference_preds_onehot[reference_preds_onehot>=0.5]=1
# alternative, much lower threshold (disabled)
#reference_preds_onehot[reference_preds_onehot<0.02]=0
#reference_preds_onehot[reference_preds_onehot>=0.02]=1
# share of reference summaries that the model classifies as noise
print(sum(reference_preds_onehot)/reference_preds_onehot.shape[0])

[0.3157948085323326, 0.8859878072695072]
0.11401218450826806


In [83]:
#baseline
baseline_preds = model.predict([texts, baseline_summs], batch_size=batch_size, verbose=1)

baseline_preds_flat = numpy.ndarray.flatten(baseline_preds)
baseline_preds_onehot = numpy.copy(baseline_preds_flat)
baseline_preds_onehot[baseline_preds_onehot<0.5]=0
baseline_preds_onehot[baseline_preds_onehot>=0.5]=1
#baseline_preds_onehot[baseline_preds_onehot<0.02]=0
#baseline_preds_onehot[baseline_preds_onehot>=0.02]=1
print(sum(baseline_preds_onehot)/baseline_preds_onehot.shape[0])

0.18398607484769364


In [84]:
#pointergen
pointergen_preds = model.predict([texts, pointergen_summs], batch_size=batch_size, verbose=1)

pointergen_preds_flat = numpy.ndarray.flatten(pointergen_preds)
pointergen_preds_onehot = numpy.copy(pointergen_preds_flat)
pointergen_preds_onehot[pointergen_preds_onehot<0.5]=0
pointergen_preds_onehot[pointergen_preds_onehot>=0.5]=1
#pointergen_preds_onehot[pointergen_preds_onehot<0.02]=0
#pointergen_preds_onehot[pointergen_preds_onehot>=0.02]=1
print(sum(pointergen_preds_onehot)/pointergen_preds_onehot.shape[0])

0.2614447345517842


In [85]:
#pointergencov
pointergencov_preds = model.predict([texts, pointergencov_summs], batch_size=batch_size, verbose=1)

pointergencov_preds_flat = numpy.ndarray.flatten(pointergencov_preds)
pointergencov_preds_onehot = numpy.copy(pointergencov_preds_flat)
pointergencov_preds_onehot[pointergencov_preds_onehot<0.5]=0
pointergencov_preds_onehot[pointergencov_preds_onehot>=0.5]=1
#pointergencov_preds_onehot[pointergencov_preds_onehot<0.02]=0
#pointergencov_preds_onehot[pointergencov_preds_onehot>=0.02]=1
print(sum(pointergencov_preds_onehot)/pointergencov_preds_onehot.shape[0])

0.37885117493472587


In [27]:
if save:
    model.save('%s_%s.h5' % (model_class,model_num))
    with open('%s_%s_TOKENIZER.pickle' % (model_class,model_num), 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [93]:
print(reference_summs_text[500])
print(texts_text[500])
print(clean_2d[500,0])
print(clean_2d[500,1])

nepal civil war aftermath inspired maggie doyne to help children . doyne 's blinknow foundation supports a home for 50 children and a school that educates hundreds more . do you know a hero ? nominations are open for 2015 cnn heroes .
surkhet , nepal -lrb- cnn -rrb- ten years ago , with her high school diploma and a backpack , maggie doyne left her new jersey hometown to travel the world before college . she lived in a buddhist monastery , helped rebuild a sea wall in fiji , then went to india and worked with nepalese refugees . there , she met a young girl who wanted to find her family in nepal . doyne went with her . that 's when doyne 's life took an unexpected turn . do you know a hero ? nominations are open for cnn heroes 2015 . a decade-long civil war had just ended in the country , and doyne witnessed its effects firsthand . she met women and children who were suffering , struggling to survive . `` it changed me , '' said doyne , now 28 . `` there were children with mallets that

# Test model on actual testing data

In [41]:
#load model
model = load_model('/home/oala/Documents/MT/data/model-params/exciting-crazy/%s/%s/%s_%s.h5' % (model_class,model_num,model_class, model_num))
tok_path= '/home/oala/Documents/MT/data/model-params/exciting-crazy/%s/%s/%s_%s_TOKENIZER.pickle' % (model_class,model_num,model_class,model_num)

In [42]:
#100/0 (clean/noise)

In [60]:
filename = "/home/oala/Documents/MT/data/datasets/finished_files/test.bin"
noise_candidates_path = '/home/oala/Documents/MT/noising/4-beam-PGC-noise-on-train/pretrained_model_tf1.2.1/decode_train_400maxenc_4beam_35mindec_120maxdec_ckpt-238410/decoded/'
nc_dist = (0.5,0.5)
replace = False
corr_sample = False
separate = False
band_width = 4
dgp = "random"
clean_2d, noise_2d = get_data(filename, nc_dist, replace, corr_sample, separate, band_width,noise_candidates_path, dgp)

Reading clean data...
...done!
Read in candidate noise points...
2872
(2872,)
(11490, 2)
(2872, 2)
...done!
Preprocess clean data, i.e. remove <s> and </s>...
...done!


In [44]:
print(clean_2d.shape)
print(noise_2d.shape)
#clean_2d = clean_2d[0:2] #here you reduce clean to 2 datapoints to get only noise!
# active variant for the 100/0 (clean/noise) case: keep just 2 noise datapoints so
# that the evaluated data is almost entirely clean
noise_2d = noise_2d[0:2]
val_share = 0.2

(11490, 2)
(11488, 2)


In [45]:
texts_train, summs_train, targets_train, texts_val, summs_val, targets_val, tokenizer = \
    prep_data(clean_2d, noise_2d, max_features, val_share, maxlen_text, maxlen_summ,
              load_tok=True, tok_path=tok_path)

train dist:  0.00010876658690450294
val dist:  0.0004351610095735422


In [46]:
#concat the splits to get all data
predict_text = numpy.concatenate((texts_train, texts_val))
predict_summ = numpy.concatenate((summs_train, summs_val))
predict_y = numpy.concatenate((targets_train, targets_val))

In [47]:
# Score the two-input model on the full set; the result is displayed as
# [loss, metric] (metric presumably accuracy -- confirm against model.compile).
model.evaluate(
    x=[predict_text, predict_summ],
    y=predict_y,
    batch_size=batch_size,
)



[0.20658960570462936, 0.9388269976357155]

In [48]:
# Scenario: 50% clean / 50% noise test set (both sets are used in full)

In [49]:
# --- data sources -----------------------------------------------------------
# clean (text, summ) pairs
filename = "/home/oala/Documents/MT/data/datasets/finished_files/test.bin"
# decoded outputs used as noise candidates
noise_candidates_path = '/home/oala/Documents/MT/noising/4-beam-PGC-noise-on-train/pretrained_model_tf1.2.1/decode_train_400maxenc_4beam_35mindec_120maxdec_ckpt-238410/decoded/'

# --- sampling configuration ---------------------------------------------------
# replace: sample clean with replacement; corr_sample: compare orig vs noise summs;
# separate: draw noise from texts disjoint from the clean data
replace = corr_sample = separate = False
band_width = 4          # number of outputs of G per text
dgp = "random"          # noise-generating process passed through to get_data
nc_dist = (0.5, 0.5)    # (clean_ratio, noise_ratio)

clean_2d, noise_2d = get_data(
    filename, nc_dist, replace, corr_sample,
    separate, band_width, noise_candidates_path, dgp,
)

Reading clean data...
...done!
Read in candidate noise points...
2872
(2872,)
(11490, 2)
(2872, 2)
...done!
Preprocess clean data, i.e. remove <s> and </s>...
...done!


In [50]:
# Show the sizes of both sets; for the balanced scenario neither is truncated.
print(clean_2d.shape, noise_2d.shape, sep="\n")
val_share = 0.2

(11490, 2)
(11488, 2)


In [51]:
# Vectorise texts/summaries with the tokenizer stored at tok_path (load_tok=True).
# The train/val split is merged again in the next cell.
(texts_train, summs_train, targets_train,
 texts_val, summs_val, targets_val,
 tokenizer) = prep_data(
    clean_2d, noise_2d, max_features, val_share, maxlen_text, maxlen_summ,
    load_tok=True, tok_path=tok_path,
)

train dist:  0.4993200239351575
val dist:  0.5025027203482045


In [52]:
#concat the splits to get all data
predict_text = numpy.concatenate((texts_train, texts_val))
predict_summ = numpy.concatenate((summs_train, summs_val))
predict_y = numpy.concatenate((targets_train, targets_val))

In [53]:
# Score the two-input model on the balanced set; the result is displayed as
# [loss, metric] (metric presumably accuracy -- confirm against model.compile).
model.evaluate(
    x=[predict_text, predict_summ],
    y=predict_y,
    batch_size=batch_size,
)



[0.3952894529908317, 0.8331882612223164]

In [54]:
# Scenario: ~0% clean / ~100% noise test set (clean is truncated to 2 datapoints further down)

In [55]:
# --- data sources -----------------------------------------------------------
# clean (text, summ) pairs
filename = "/home/oala/Documents/MT/data/datasets/finished_files/test.bin"
# decoded outputs used as noise candidates
noise_candidates_path = '/home/oala/Documents/MT/noising/4-beam-PGC-noise-on-train/pretrained_model_tf1.2.1/decode_train_400maxenc_4beam_35mindec_120maxdec_ckpt-238410/decoded/'

# --- sampling configuration ---------------------------------------------------
# replace: sample clean with replacement; corr_sample: compare orig vs noise summs;
# separate: draw noise from texts disjoint from the clean data
replace = corr_sample = separate = False
band_width = 4          # number of outputs of G per text
dgp = "random"          # noise-generating process passed through to get_data
# (clean_ratio, noise_ratio); left at 50/50 here -- the scenario's skew is produced
# later by truncating one of the two returned arrays.
nc_dist = (0.5, 0.5)

clean_2d, noise_2d = get_data(
    filename, nc_dist, replace, corr_sample,
    separate, band_width, noise_candidates_path, dgp,
)

Reading clean data...
...done!
Read in candidate noise points...
2872
(2872,)
(11490, 2)
(2872, 2)
...done!
Preprocess clean data, i.e. remove <s> and </s>...
...done!


In [56]:
# Show the sizes of both sets before anything is cut away.
print(clean_2d.shape, noise_2d.shape, sep="\n")
# Keep only 2 clean datapoints so the evaluation set is (almost) purely noise.
# presumably two rather than zero so that prep_data still sees both classes -- TODO confirm
clean_2d = clean_2d[:2]
val_share = 0.2

(11490, 2)
(11488, 2)


In [57]:
# Vectorise texts/summaries with the tokenizer stored at tok_path (load_tok=True).
# The train/val split is merged again in the next cell.
(texts_train, summs_train, targets_train,
 texts_val, summs_val, targets_val,
 tokenizer) = prep_data(
    clean_2d, noise_2d, max_features, val_share, maxlen_text, maxlen_summ,
    load_tok=True, tok_path=tok_path,
)

train dist:  0.9997824194952132
val dist:  1.0


In [58]:
#concat the splits to get all data
predict_text = numpy.concatenate((texts_train, texts_val))
predict_summ = numpy.concatenate((summs_train, summs_val))
predict_y = numpy.concatenate((targets_train, targets_val))

In [59]:
# Score the two-input model on the (almost) pure-noise set; the result is displayed
# as [loss, metric] (metric presumably accuracy -- confirm against model.compile).
model.evaluate(
    x=[predict_text, predict_summ],
    y=predict_y,
    batch_size=batch_size,
)



[0.587638956993118, 0.7233246312751471]

# F1 comparisons

In [33]:
def _threshold_to_one_hot(probs, threshold=0.5):
    """Binarise predicted probabilities and one-hot encode the resulting labels.

    Args:
        probs (array-like): predicted probabilities; assumes exactly one value per
            datapoint, i.e. shape (n,) or (n, 1).
        threshold (float): values >= threshold become label 1, everything else
            (including NaN) becomes label 0.

    Returns:
        (labels, one_hot): ``labels`` is an (n, 1) int array of 0/1 predictions and
        ``one_hot`` is an (n, 2) float array with a 1 in the predicted-label column.
    """
    # A comparison always yields a fully defined array. The previous approach filled
    # an uninitialised numpy.ndarray through two masks, so a NaN prediction matched
    # neither mask and left garbage that was then used as a column index.
    labels = (numpy.asarray(probs) >= threshold).astype('int').reshape(-1, 1)
    one_hot = numpy.zeros((labels.shape[0], 2))
    one_hot[numpy.arange(labels.shape[0]), labels[:, 0]] = 1
    return labels, one_hot


# --- pure-clean reference set -------------------------------------------------
# NOTE: reference_preds is not created in this cell; it must already exist in the kernel.
pure_clean_preds, pure_clean_preds_one_hot = _threshold_to_one_hot(reference_preds)
# Every datapoint is treated as clean, i.e. the target is class 0 -> one-hot [1, 0].
pure_clean_targets_one_hot = numpy.zeros((pure_clean_preds.shape[0], 2))
pure_clean_targets_one_hot[:, 0] = 1

# --- pure-noise set -----------------------------------------------------------
# Predictions for whatever predict_text / predict_summ currently hold in the kernel.
one = model.predict([predict_text, predict_summ], batch_size=batch_size)
pure_noise_preds, pure_noise_preds_one_hot = _threshold_to_one_hot(one)
# Every datapoint is treated as noise, i.e. the target is class 1 -> one-hot [0, 1].
# NOTE(review): if predict_* still contains the 2 clean datapoints kept by the 0/100
# scenario, those two rows are mislabelled here -- consider deriving targets from predict_y.
pure_noise_targets_one_hot = numpy.zeros((pure_noise_preds.shape[0], 2))
pure_noise_targets_one_hot[:, 1] = 1

In [34]:
# 2x2 confusion matrix for the pure-clean set: rows = predicted class, columns = target class.
numpy.matmul(pure_clean_preds_one_hot.T, pure_clean_targets_one_hot)

array([[9898.,    0.],
       [1592.,    0.]])

In [35]:
# 2x2 confusion matrix for the pure-noise set: rows = predicted class, columns = target class.
numpy.matmul(pure_noise_preds_one_hot.T, pure_noise_targets_one_hot)

array([[    0.,  1302.],
       [    0., 10188.]])

In [36]:
# Display the raw (un-thresholded) model outputs for the current prediction inputs.
model.predict(
    x=[predict_text, predict_summ],
    batch_size=batch_size,
)

array([[0.999446  ],
       [0.33823976],
       [0.8246137 ],
       ...,
       [0.98373276],
       [0.99958295],
       [0.99338466]], dtype=float32)