# Implementing CLSM - Keras

## Purpose
The purpose of this notebook is to implement Microsoft's [Convolutional Latent Semantic Model](http://www.iro.umontreal.ca/~lisa/pointeurs/ir0895-he-2.pdf) in Keras, and evaluate it on our dataset.

## Inputs
- This notebook requires *wiki-pages* from the FEVER dataset as an input.

In [1]:
import gc
import pickle
from multiprocessing import Pool, cpu_count

import joblib
import keras
import nltk
import numpy as np
from joblib import Parallel, delayed
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm_notebook

import utils

Using TensorFlow backend.


In [2]:
from joblib import Memory
memory = Memory(location='/tmp', verbose=0)

# Preprocessing Data

In [3]:
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")

Num Distinct Claims 109810
Num Data Points 125050


In [4]:
with open("train.pkl", "rb") as f:
    train_dict = pickle.load(f)

In [5]:
# Sanity check: report any entry of `train_dict` whose claim text is not
# contained in `claim_set`.
for idx in range(len(train_dict)):
    claim_is_known = train_dict[idx]['claim'] in claim_set
    if not claim_is_known:
        print("error")

In [12]:
# Persist the token encoder to "encoder.pkl".
# NOTE(review): `encoder` is only assigned in a cell that appears further down
# in this notebook, so this cell fails on a fresh top-to-bottom run.
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

In [13]:
# Persist the class -> index lookup table to "feature_encoder.pkl".
# NOTE(review): `feature_encoder` is only assigned in a cell that appears
# further down in this notebook, so this cell fails on a fresh top-to-bottom run.
with open("feature_encoder.pkl", "wb") as f:
    pickle.dump(feature_encoder, f)

In [8]:
# Start the vocabulary with everything `utils.generate_all_tokens` produces
# for the claims.
processed_claims = utils.generate_all_tokens(claims)

# Collect the preprocessed name of every evidence entry of every query.
all_evidence = []
for query in tqdm_notebook(train_dict):
    for evidence_name in query['evidence']:
        all_evidence.append(utils.preprocess_article_name(evidence_name))

# Only distinct evidence names need to be tokenised.
unique_evidence = list(set(all_evidence))
processed_claims.extend(utils.generate_all_tokens(unique_evidence))

possible_tokens = list(set(processed_claims))

# Fit the label encoder on the sorted, de-duplicated vocabulary.
encoder = LabelEncoder()
encoder.fit(np.array(sorted(possible_tokens)))

HBox(children=(IntProgress(value=0, max=125051), HTML(value='')))




HBox(children=(IntProgress(value=0, max=125051), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3592118), HTML(value='')))




LabelEncoder()

In [9]:
# Map every class known to `encoder` to its position in `encoder.classes_`.
# The progress bar wraps the sequence itself rather than the `enumerate`
# iterator: an `enumerate` object has no length, so tqdm could not display
# a total when it was wrapped the other way round.
feature_encoder = {}
for idx, e in enumerate(tqdm_notebook(encoder.classes_)):
    feature_encoder[e] = idx

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
load_processed_claims = False

In [None]:
# Pre-tokenise every claim and every distinct evidence article name.
# NOTE: this rebinds `claim_set` (earlier returned by
# utils.extract_fever_jsonl_data) to a dict: claim text -> tokenised claim.
claim_set = dict()
evidence_set = dict()
# The multiprocessing Pool(processes=6) that used to be created here was never
# used or closed, so it only leaked worker processes; it has been removed.

for t in tqdm_notebook(train_dict):
    # The same claim text can occur in several entries; tokenise it once,
    # mirroring the guard already used for evidence names below.
    if t['claim'] not in claim_set:
        claim_set[t['claim']] = utils.tokenize_claim(t['claim'], encoder, feature_encoder)
    evidences = t['evidence']
    # Strip the URL prefix before normalising the article name.
    evidences = [utils.preprocess_article_name(i.split("http://wikipedia.org/wiki/")[1]) for i in evidences]
    for e in evidences:
        if e not in evidence_set:
            evidence_set[e] = utils.tokenize_claim(e, encoder, feature_encoder)

In [None]:
# `s` counts the entries whose first article in `claim_to_article` is absent
# from the entry's (preprocessed) evidence list; `total` sums the number of
# articles listed in `claim_to_article` over all entries.
s = 0
total = 0
for i in tqdm_notebook(range(len(train_dict))):
    articles = [
        utils.preprocess_article_name(j.split("http://wikipedia.org/wiki/")[1])
        for j in train_dict[i]['evidence']
    ]
    true_articles = claim_to_article[train_dict[i]['claim']]
    total = total + len(true_articles)
    first_is_missing = true_articles[0] not in articles
    if first_is_missing:
        s = s + 1

In [None]:
articles = [utils.preprocess_article_name(i.split("http://wikipedia.org/wiki/")[1]) for i in train_dict[0]['evidence']]

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f process_claim process_claim(5)

In [None]:
train_dict[0]

In [None]:
all_data = []

article_set = set(article_list)

def process_claim(idx, J=399):
    """
    Build one training example from entry `idx` of `train_dict`.

    The entry's evidence list is normalised (URL prefix stripped, then
    `utils.preprocess_article_name`). The first article listed for the claim
    in `claim_to_article` is the positive; the remaining evidence entries,
    in their original order, supply the negatives.

    Args:
        idx: position of the entry in `train_dict`.
        J: number of negative articles to emit (defaults to the previously
            hard-coded 399).

    Returns:
        dict with keys 'claim', 'positive_article' and
        'negative_article_0' .. 'negative_article_{J-1}', each holding the
        output of `utils.tokenize_claim`.

    Raises:
        ValueError: if the positive article is not in the evidence list.
        IndexError: if fewer than `J` negatives are available.
    """
    entry = train_dict[idx]
    articles = [
        utils.preprocess_article_name(i.split("http://wikipedia.org/wiki/")[1])
        for i in entry['evidence']
    ]
    true_article = claim_to_article[entry['claim']][0]
    # list.index raises ValueError when the positive article was not retrieved.
    true_article_idx = articles.index(true_article)
    negative_articles = articles[:true_article_idx] + articles[true_article_idx+1:]
    # Fail before doing any tokenisation work; the exception type matches the
    # IndexError the per-negative indexing would otherwise raise.
    if len(negative_articles) < J:
        raise IndexError(
            "entry {} has only {} negative articles, {} required".format(
                idx, len(negative_articles), J))

    data = {}
    data['claim'] = utils.tokenize_claim(entry['claim'], encoder)
    data['positive_article'] = utils.tokenize_claim(true_article, encoder)
    # Only the first J negatives are used, so only those are tokenised.
    for i, name in enumerate(negative_articles[:J]):
        data['negative_article_{}'.format(i)] = utils.tokenize_claim(name, encoder)
    return data

# all_data = utils.parallel_process(range(len(train_dict)), process_claim, n_jobs=12)

# with open("all_data.pkl_lucene", "wb") as f:
#     pickle.dump(all_data, f)

In [None]:
if not load_processed_claims:
    all_data = []

    article_set = set(article_list)

    def process_claim(idx):
        """
        Build one training example for `claims[idx]`: the tokenised claim,
        the tokenised `article_list[idx]` as the positive, and J articles
        drawn with `np.random.choice` from the articles not listed for this
        claim in `claim_to_article` as negatives.
        """
        J = 4
        claim_text = claims[idx]
        data = {
            'claim': utils.tokenize_claim(claim_text, encoder),
            'positive_article': utils.tokenize_claim(article_list[idx], encoder),
        }
        candidate_pool = list(article_set - set(claim_to_article[claim_text]))
        sampled = np.random.choice(candidate_pool, J)
        tokenised_negatives = [utils.tokenize_claim(name, encoder) for name in sampled]
        for position in range(J):
            data['negative_article_{}'.format(position)] = tokenised_negatives[position]
        return data

    all_data = utils.parallel_process(range(len(claims)), process_claim, n_jobs=6)

    # Cache the result so later runs can take the loading branch instead.
    with open("all_data.pkl", "wb") as f:
        pickle.dump(all_data, f)
else:
    # Reuse the cached examples. Only unpickle files you trust.
    with open("all_data.pkl", "rb") as f:
        all_data = pickle.load(f)

## Beginning the Model

In [6]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5845569699656758029
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11981596263
locality {
  bus_id: 1
  links {
    link {
      device_id: 1
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 3010196439425632895
physical_device_desc: "device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:02:00.0, compute capability: 5.2"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 10056728576
locality {
  bus_id: 1
  links {
    link {
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 1431079868980050082
physical_device_desc: "device: 1, name: GeForce GTX TITAN X, pci bus id: 0000:03:00.0, compute capability: 5.2"
]


In [9]:
%run deep_semantic_similarity_keras.py

In [11]:
model = create_model()

In [10]:
from scipy import sparse
from matplotlib import pyplot as plt
import numpy as np
from keras.utils import multi_gpu_model
import keras
import pickle

In [13]:
load_processed_claims = True

In [14]:
if load_processed_claims:
    # Reuse the cached column-wise data. Only unpickle files you trust.
    with open("saved_data.pkl", "rb") as f:
        data = pickle.load(f)
else:
    # Re-arrange the per-example dicts in `all_data` into one list per key,
    # stacking each example's rows into a single sparse matrix.
    data_keys = ["claim", "positive_article"] + \
        ["negative_article_{}".format(n) for n in range(4)]
    data = {k: [] for k in data_keys}

    for d in tqdm_notebook(all_data):
        for k in data_keys:
            # `sparse` is the name this notebook imports (`from scipy import
            # sparse`); the bare name `scipy` is never bound.
            data[k].append(sparse.vstack(d[k]))

    with open("saved_data.pkl", "wb") as f:
        pickle.dump(data, f)

Next, we train the model in batches.

In [None]:
y = np.zeros((1, J+1))
y[:,0] = 1
y

In [15]:
def stack_uneven(arrays, fill_value=0.):
    '''
    Fits arrays into a single numpy array, even if they are
    different sizes. `fill_value` is the default value.

    Args:
        arrays: list of np arrays of various sizes
            (must be same rank, but not necessarily same size)
        fill_value (float, optional): value written to every position
            that is not covered by the corresponding input array

    Returns:
        np.ndarray of shape (len(arrays), max_dim_0, max_dim_1, ...).
        An empty `arrays` list yields an empty array of shape (0,).
    '''
    shapes = [a.shape for a in arrays]
    # Largest extent along each axis. With no input arrays there are no axes,
    # which gives an empty tuple instead of failing on a zero-size reduction.
    max_sizes = tuple(max(axis_sizes) for axis_sizes in zip(*shapes))
    # The resultant array has stacked on the first dimension
    result = np.full((len(arrays),) + max_sizes, fill_value)
    for i, a in enumerate(arrays):
        # The shape of this array `a`, turned into slices
        block = tuple(slice(0, s) for s in shapes[i])
        # Overwrite a block slice of `result` with this array `a`
        result[(i,) + block] = a
    return result

In [10]:
for k, v in all_data[0].items():
    to_stack = []
    for i in [0]:
        to_stack.append(np.vstack([j.todense() for j in all_data[i][k]]))

NameError: name 'all_data' is not defined

In [16]:
# joblib.load takes a filename (or file object) and an optional `mmap_mode`;
# it is not open(), so the former second argument "rb" was being passed as
# `mmap_mode`, for which it is not a valid value. Only load files you trust:
# joblib uses pickle underneath.
all_data = joblib.load("all_data_lucene_pt_12.pkl")

In [17]:
class DataGenerator(keras.utils.Sequence):
    """
    Keras ``Sequence`` that serves training batches built from `data`.

    `data` is indexed by sample; every sample is a dict whose values are
    passed to ``scipy.sparse.vstack`` to form one 2-D matrix per key. Each
    batch is a dict with the same keys, holding dense arrays of shape
    (samples_in_batch, rows, columns), plus a label array in which column 0
    is set to 1 for every sample.

    With ``batch_size=1`` no padding takes place. For larger batches, samples
    with fewer rows are zero-padded up to the longest sample in the batch.
    NOTE(review): zero-padding assumes the model tolerates all-zero rows --
    confirm before training with ``batch_size > 1``.
    """
    def __init__(self, data, J, batch_size=32, split=None):
        """
        Args:
            data: indexable collection of per-sample dicts.
            J: number of negative articles per sample; labels have J+1 columns.
            batch_size: number of samples per batch.
            split: optional iterable of sample indices to serve; when omitted
                every sample of `data` is used.
        """
        # Compare against None so an explicitly empty split is respected
        # instead of silently falling back to the whole dataset.
        if split is not None:
            self.indicies = list(split)
        else:
            self.indicies = list(range(len(data)))
        self.data = data
        self.J = J
        self.batch_size = batch_size

    def __len__(self):
        # Number of full batches; a trailing partial batch is not counted.
        return int(np.floor(len(self.indicies) / self.batch_size))

    def __getitem__(self, index):
        return self.get_item(index)

    def get_item(self, index):
        """
        Build batch number `index`.

        Returns:
            (inputs, y): `inputs` maps every key of a sample to a dense array
            of shape (n, rows, columns); `y` has shape (n, J+1) with
            column 0 set to 1, where n is the number of samples in the batch.

        Raises:
            IndexError: if `index` does not address any sample.
        """
        if index < 0:
            index += len(self)
        start = index * self.batch_size
        batch_indices = self.indicies[start:start + self.batch_size]
        if index < 0 or not batch_indices:
            raise IndexError("batch index out of range")

        final = {}
        for k in self.data[0].keys():
            samples = [sparse.vstack(self.data[i][k]).toarray() for i in batch_indices]
            final[k] = self._pad_and_stack(samples)

        # One label row per sample actually present in the batch, so inputs
        # and targets always agree on the batch dimension.
        y = np.zeros((len(batch_indices), self.J+1))
        y[:,0] = 1

        return final, y

    @staticmethod
    def _pad_and_stack(samples):
        """Stack 2-D arrays on a new leading axis, zero-padding short ones along axis 0."""
        max_rows = max(s.shape[0] for s in samples)
        out = np.zeros((len(samples), max_rows, samples[0].shape[1]), dtype=samples[0].dtype)
        for row, s in enumerate(samples):
            out[row, :s.shape[0], :] = s
        return out

    def on_epoch_end(self):
        # Sample order is deliberately left unchanged between epochs.
        pass

In [18]:
generator = DataGenerator(all_data, 399, 1)

In [25]:
%timeit d = generator.get_item(6)

367 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
d[0]

In [29]:
d[0].keys()

dict_keys(['negative_article_10', 'negative_article_173', 'negative_article_263', 'negative_article_297', 'negative_article_367', 'negative_article_92', 'negative_article_384', 'negative_article_272', 'negative_article_323', 'negative_article_204', 'negative_article_386', 'negative_article_220', 'negative_article_139', 'negative_article_225', 'negative_article_82', 'negative_article_394', 'negative_article_368', 'negative_article_61', 'negative_article_142', 'negative_article_363', 'negative_article_86', 'negative_article_266', 'negative_article_308', 'negative_article_31', 'negative_article_53', 'negative_article_154', 'negative_article_213', 'negative_article_314', 'negative_article_325', 'negative_article_131', 'negative_article_205', 'negative_article_212', 'negative_article_125', 'negative_article_378', 'negative_article_98', 'negative_article_340', 'negative_article_298', 'negative_article_289', 'negative_article_79', 'negative_article_81', 'negative_article_152', 'negative_artic

In [28]:
d[0]['documents'].shape

KeyError: 'documents'

In [20]:
import gc
gc.collect()

0

In [None]:
idxs_to_remove

In [None]:
# Positions of entries in `all_data` that are not dicts.
# NOTE(review): presumably these are items that failed during processing --
# confirm against utils.parallel_process.
idxs_to_remove = [idx for idx, e in enumerate(all_data) if not isinstance(e, dict)]

# Filter in a single pass. Slice assignment mutates the existing list object,
# so anything already holding a reference to `all_data` sees the cleaned
# list, exactly as with the repeated (quadratic) `pop` calls used before.
all_data[:] = [e for e in all_data if isinstance(e, dict)]

In [None]:
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer='adadelta')

In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adadelta", metrics=['accuracy'])

In [None]:
def reset_weights(model):
    """
    Re-run the kernel initializer of every layer of `model` that exposes a
    `kernel_initializer`, using the current backend session.

    NOTE(review): only `layer.kernel` is re-initialised here; other weights
    (e.g. biases) are left untouched -- confirm this is intended.

    Args:
        model: object with a `layers` attribute (a Keras model).
    """
    # `backend` was never imported in this notebook; reach it through the
    # already-imported `keras` package instead.
    session = keras.backend.get_session()
    for layer in model.layers:
        if hasattr(layer, 'kernel_initializer'):
            layer.kernel.initializer.run(session=session)

In [None]:
reset_weights(model)

In [None]:
model.fit_generator(generator=generator, epochs=20, use_multiprocessing=False)

In [None]:
validation = DataGenerator(data, J, split=range(90000, 125000))

In [None]:
model.evaluate_generator(generator=validation)

In [None]:
# Fit the model on one example at a time: every entry of `data` is densified
# and given a leading axis of length 1 before being passed to `model.fit`.
batch_keys = ("claim", "positive_article", "negative_article_0", "negative_article_1",
              "negative_article_2", "negative_article_3")
for i in tqdm_notebook(range(len(data['claim']))):
    batch = {}
    for key in batch_keys:
        batch[key] = np.expand_dims(data[key][i].todense(), 0)
    model.fit(batch, y)

In [None]:
model.fit(data, y)

In [None]:
article_set = set(article_list)

def process_claim(idx):
    """
    Build one example for `claims[idx]`.

    Returns a dict with the tokenised claim ('claim'), the tokenised
    `article_list[idx]` ('positive_article') and, under 'negative_article',
    a list of J tokenised articles drawn with `np.random.choice` from the
    articles not listed for this claim in `claim_to_article`.

    NOTE(review): `J` is read from the enclosing (notebook) scope -- it must
    be defined before this function is called.
    """
    claim_text = claims[idx]
    data = {
        'claim': utils.tokenize_claim(claim_text, encoder),
        'positive_article': utils.tokenize_claim(article_list[idx], encoder),
    }
    candidate_pool = list(article_set - set(claim_to_article[claim_text]))
    sampled = np.random.choice(candidate_pool, J)
    data['negative_article'] = [utils.tokenize_claim(name, encoder) for name in sampled]
    return data

In [None]:
process_claim(0)

In [None]:
np.argwhere(all_data[0]['claim'][0]==0)

In [None]:
model.inputs

In [None]:
model.fit()