# Preprocessing Inputs


In [1]:
import gc
import glob
import itertools
import os
import pickle

import joblib
import keras
import numpy as np
from scipy import sparse
from tqdm import tqdm_notebook

import utils

Using TensorFlow backend.


In [2]:
# Load the training examples from train.pkl with joblib. Later cells index
# each example by the keys 'claim' and 'evidence'.
train = joblib.load("train.pkl")

In [3]:
# Notebook-level encoder from the project `utils` module; later cells call
# its tokenize_claim method on claims and article names.
encoder = utils.ClaimEncoder()

In [4]:
# Parse ../train.jsonl with the project helper and unpack its five return
# values. The "Num ..." lines in this cell's output presumably come from the
# helper itself — TODO confirm.
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")

Num Distinct Claims 109810
Num Data Points 125050


In [63]:
# Scratch: two empty sparse matrices with 10 rows each but different column
# counts (5 and 15). Not referenced by any other cell visible here.
l = [sparse.csr_matrix((10,5)), sparse.csr_matrix((10,15))]

In [48]:
# Scratch: an empty 5x5 sparse matrix. Not referenced by any other cell
# visible here.
nm = sparse.csr_matrix((5, 5))

In [31]:
def stack_uneven(arrays, fill_value=0.):
    '''
    Stacks arrays of differing sizes into a single padded numpy array.

    Each input is copied into the leading corner of its own slot along a
    new first axis; every position not covered by an input holds
    `fill_value`.

    Args:
            arrays: non-empty list of np arrays of various sizes
                (must be same rank, but not necessarily same size)
            fill_value (float, optional): value written to the padding
                positions. Defaults to 0.

    Returns:
            np.ndarray: float64 array of shape
                `(len(arrays),) + per-axis maximum of the input shapes`

    Raises:
            ValueError: if `arrays` is empty or the inputs differ in rank.
    '''
    if len(arrays) == 0:
        raise ValueError("stack_uneven() requires at least one array")
    sizes = [tuple(a.shape) for a in arrays]
    if len(set(map(len, sizes))) > 1:
        raise ValueError(
            "stack_uneven() requires arrays of the same rank, got shapes %s" % (sizes,)
        )
    # Per-axis maximum extent over all inputs.
    max_sizes = tuple(max(extents) for extents in zip(*sizes))
    # The resultant array is stacked on the first dimension and pre-filled
    # with the padding value (previously always zeros, ignoring `fill_value`).
    result = np.full((len(arrays),) + max_sizes, fill_value, dtype=np.float64)
    for i, a in enumerate(arrays):
        # The shape of this array `a`, turned into slices
        slices = tuple(slice(0, s) for s in sizes[i])
        # Overwrite a block slice of slot `i` with this array `a`
        result[(i,) + slices] = a
    return result

In [77]:
class DataGenerator(keras.utils.Sequence):
    """
    Keras ``Sequence`` that serves one training example per step.

    One example is a claim together with all of its candidate evidence
    documents, so the arrays returned for a step have one row per evidence
    document of that example.
    """
    def __init__(self, data, batch_size=32, split=None, encoder=None, claim_to_article=None):
        """
            Sets the initial arguments and creates an indices list that
            selects (and can later reorder) the examples served.

            Args:
                data: indexable collection of examples; each example is
                    read through its 'claim' and 'evidence' keys.
                batch_size: stored on the instance but not used when
                    building items (each item is exactly one example).
                split: optional sequence of indices into `data` restricting
                    which examples are served. `None` serves all of `data`.
                encoder: optional object exposing `tokenize_claim`; a new
                    `utils.ClaimEncoder()` is created when omitted.
                claim_to_article: optional mapping from claim to its
                    articles; when omitted it is parsed from
                    "../train.jsonl".
        """
        # `is not None` so that an explicitly empty split or a numpy index
        # array is honoured instead of falling back to the whole dataset.
        if split is not None:
            self.indicies = list(split)
        else:
            self.indicies = list(range(len(data)))
        self.data = data
        # Keep the encoder on the instance; get_item must not depend on a
        # notebook-global `encoder` happening to exist.
        self.encoder = encoder if encoder is not None else utils.ClaimEncoder()
        self.batch_size = batch_size
        if claim_to_article is None:
            _, _, _, _, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")
        self.claim_to_article = claim_to_article

    def __len__(self):
        """Number of examples served per epoch (the size of the index list)."""
        return len(self.indicies)

    def __getitem__(self, index):
        """Return the item at position `index`; see `get_item`."""
        return self.get_item(index)

    def get_item(self, index):
        """
            Builds the model inputs and labels for one example.

            Args:
                index: position in the index list (not a raw index into
                    `data`).

            Returns:
                A pair `(inputs, labels)`: `inputs` is a dict with the keys
                "claim" and "document", `labels` is an array holding 1 for
                each evidence document listed for the claim in
                `claim_to_article` and 0 otherwise.
        """
        d = self.data[self.indicies[index]]
        claim = sparse.vstack(self.encoder.tokenize_claim(d['claim'])).toarray()
        relevant_articles = self.claim_to_article[d['claim']]
        evidences = []
        ys = []
        for e in d['evidence']:
            # Evidence entries are wiki URLs; keep only the article name.
            processed = utils.preprocess_article_name(e.split("http://wikipedia.org/wiki/")[1])
            tokenized = self.encoder.tokenize_claim(processed)
            dense_arrs = [i.toarray() for i in tokenized]
            evidences.append(np.vstack(dense_arrs))
            ys.append(1 if processed in relevant_articles else 0)

        # Pad the per-document arrays to a common size and stack them.
        evidences = stack_uneven(evidences)
        # Repeat the claim once per evidence document (read-only view, no copy).
        claim = claim[np.newaxis, :, :]
        claim = np.broadcast_to(claim, (len(evidences), claim.shape[1], claim.shape[2]))
        return {"claim": claim, "document": evidences}, np.array(ys)

    def on_epoch_end(self):
        """No-op: shuffling between epochs is currently disabled."""
        # To re-enable shuffling: np.random.shuffle(self.indicies)
        pass

In [78]:
# Build the generator over the full training set (no split, default
# batch_size).
gen = DataGenerator(train)

Num Distinct Claims 109810
Num Data Points 125050


In [12]:
# Enable the %lprun magic used in the profiling cells (needs the
# line_profiler package installed).
%load_ext line_profiler

In [63]:
# Line-profile utils.ClaimEncoder.tokenize_claim while building item 0 of
# the generator.
%lprun -f utils.ClaimEncoder.tokenize_claim gen.get_item(0)

In [79]:
# NOTE(review): create_model is not defined or imported by any cell visible
# in this notebook; presumably it comes from the
# `%run deep_semantic_similarity_keras.py` cell, which must be executed
# first — TODO confirm.
model = create_model()

In [68]:
# Execute the model script; %run makes the script's top-level names
# available in this notebook (presumably including create_model — TODO
# confirm).
%run deep_semantic_similarity_keras.py

In [28]:
# NOTE(review): `c` is not assigned by any cell visible before this one;
# presumably an (inputs, labels) pair as returned by gen.get_item(...) —
# confirm. Shows the shape of its "document" array.
c[0]['document'].shape

(400, 24, 29243)

In [None]:
# Sum of the second element of `c` (presumably the 0/1 label array, i.e. the
# number of positive documents — confirm where `c` comes from).
c[1].sum()

In [None]:
# Look up the claim_to_article entry for the claim of the fifth training
# example.
claim_to_article[train[4]['claim']]

In [75]:
# Force a garbage-collection pass to release memory before training.
# `gc` is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.
gc.collect()

0

In [80]:
# Train from the generator with two worker threads and a queue of one item.
# NOTE: the recorded run of this cell ended in the ResourceExhaustedError
# shown below (GPU out of memory on a [400,29243,1,51] float tensor).
model.fit_generator(gen, workers=2, max_queue_size=1, use_multiprocessing=False)

Epoch 1/1
    12/125051 [..............................] - ETA: 424:06:39 - loss: 15.8826

ResourceExhaustedError: OOM when allocating tensor with shape[400,29243,1,51] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training_1/Adam/gradients/conv1d_4/convolution/Conv2D_grad/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv1d_4/convolution/ExpandDims, PermConstNHWCToNCHW-LayoutOptimizer)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node loss_1/mul/_271}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1021_loss_1/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [None]:
# Same training call with a single worker. `pickle_safe` is the legacy name
# of this flag; use `use_multiprocessing`, matching the other fit_generator
# call in this notebook.
model.fit_generator(gen, workers=1, max_queue_size=1, use_multiprocessing=False)

In [None]:
# NOTE(review): `pickles` is only assigned in the cell that follows this one,
# so this cell fails on a fresh top-to-bottom run.
pickles[0].keys()

In [None]:
# Load every .pkl file in the working directory.
# glob.glob returns paths in a filesystem-dependent order; sort them so that
# positional access such as pickles[0] refers to the same file on every run.
pickles = []
for file in tqdm_notebook(sorted(glob.glob("*.pkl"))):
    pickles.append(joblib.load(file))

In [None]:
# FIXME: unfinished statement — no `as` target, no trailing colon and no
# body, so this cell is a SyntaxError as written.
with open("part_")

In [None]:
# Reload train.pkl with the standard pickle module, rebinding `train`
# (an earlier cell loads the same file with joblib.load).
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("train.pkl", "rb") as f:
    train = pickle.load(f)

In [None]:
# Re-create the notebook-level encoder (same statement as in an earlier
# cell); rebinding replaces the previous instance.
encoder = utils.ClaimEncoder()

In [None]:
# Preprocess the claim text of every training example.
# NOTE: this rebinds `claims`, which an earlier cell obtained from
# utils.extract_fever_jsonl_data.
claims = [utils.preprocess_article_name(i['claim']) for i in train]

In [None]:
# Tokenize every preprocessed claim once, keyed by the claim string
# (duplicate claims overwrite the same key).
# NOTE: the loop variable `c` stays bound to the last claim afterwards.
claims_dict = {}
for c in tqdm_notebook(claims):
    claims_dict[c] = encoder.tokenize_claim(c)

In [None]:
# Load a previously saved claims_dict from disk, replacing any value of
# `claims_dict` currently in the namespace.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("claims_dict.pkl", "rb") as f:
    claims_dict = pickle.load(f)

In [None]:
# NOTE(review): `p0` is not assigned by any cell visible in this notebook —
# confirm where it comes from. Looks up the first key of claims_dict in it.
p0[list(claims_dict.keys())[0]]

In [None]:
# Display all keys of claims_dict (one per distinct claim string, so the
# output can be very long).
claims_dict.keys()

In [None]:
# Collect the distinct preprocessed article names referenced as evidence.
# Evidence entries are wiki URLs; strip the URL prefix before preprocessing,
# as the other cells that handle evidence do, so the names are comparable.
article_names = set()
for t in tqdm_notebook(train):
    for e in t['evidence']:
        name = utils.preprocess_article_name(e.split("http://wikipedia.org/wiki/")[1])
        article_names.add(name)

In [None]:
def process(t):
    """Return the preprocessed article name for each evidence URL of example `t`.

    Each entry of ``t['evidence']`` is split on the wiki URL prefix and the
    part after it is passed to ``utils.preprocess_article_name``.
    """
    article_names = []
    for evidence_url in t['evidence']:
        raw_name = evidence_url.split("http://wikipedia.org/wiki/")[1]
        article_names.append(utils.preprocess_article_name(raw_name))
    return article_names

In [None]:
# Sanity check: article names extracted from the first training example.
process(train[0])

In [None]:
# Apply `process` to every training example through the project's parallel
# helper with 10 jobs. Presumably yields one list of names per example (the
# next cell flattens it) — TODO confirm.
names = utils.parallel_process(train, process, n_jobs=10)

In [None]:
# Flatten the per-example name lists and keep each distinct name once.
# Sorted so that positional access (names[0], names[1:], ...) is reproducible
# across kernel restarts; plain set iteration order is arbitrary.
# NOTE: not idempotent — re-running this cell on the already-flattened list
# would iterate over the characters of each name.
names = sorted(set(itertools.chain.from_iterable(names)))

In [None]:
# Number of entries currently in `names`.
len(names)

In [None]:
# Inspect the first entry of `names`.
names[0]

In [None]:
# Drop the first entry of `names`.
# NOTE: not idempotent — every re-run of this cell removes one more entry.
names = names[1:]

In [None]:
# Enable the %lprun magic (the same extension is also loaded by an earlier
# cell in this notebook).
%load_ext line_profiler

In [None]:
# Inspect the sixth entry of `names`.
names[5]

In [None]:
# Line-profile encoder.tokenize_claim on a single article name (names[8]).
%lprun -f encoder.tokenize_claim encoder.tokenize_claim(names[8])

In [None]:
# Tokenize every article name once, keyed by the name.
# The trailing `return article_dict` was removed: `return` outside a function
# is a SyntaxError, so the cell could not run at all.
article_dict = {}
for name in tqdm_notebook(names):
    article_dict[name] = encoder.tokenize_claim(name)

In [None]:
def transform(n):
    """Tokenize each name in the iterable `n`.

    Returns a dict mapping every name to the result of the notebook-level
    ``encoder.tokenize_claim`` for that name.
    """
    return {name: encoder.tokenize_claim(name) for name in n}

In [None]:
from joblib import Parallel, delayed

In [None]:
def chunks(l, n):
    """Lazily split `l` into consecutive slices of length `n`.

    The final slice is shorter when ``len(l)`` is not a multiple of `n`.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [None]:
# One tenth of the number of names, rounded down.
len(names)//10

In [None]:
# Tokenize the names in parallel: 10 threads, one `transform` call per chunk.
# The chunk size must be at least 1: len(names) // 100000 is 0 whenever there
# are fewer than 100000 names, and chunks() would then fail because range()
# rejects a zero step.
chunk_size = max(1, len(names) // 100000)
processed = Parallel(n_jobs=10, verbose=1, prefer="threads")(
    delayed(transform)(n) for n in chunks(names, chunk_size)
)

In [None]:
# Alternative to the joblib cell: tokenize each name individually through the
# project's parallel helper with 10 jobs.
transformed = utils.parallel_process(names, encoder.tokenize_claim, n_jobs=10)