# Implementing CLSM - Keras

## Purpose
The purpose of this notebook is to implement Microsoft's [Convolutional Latent Semantic Model](http://www.iro.umontreal.ca/~lisa/pointeurs/ir0895-he-2.pdf) in Keras, and evaluate it on our dataset.

## Inputs
- This notebook requires the FEVER claims file (`../train.jsonl`) and the *wiki-pages* from the FEVER dataset as inputs.

## Preprocessing Data

In [None]:
import numpy as np
import nltk
import utils
import pickle

from scipy import sparse
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tqdm import tqdm_notebook

In [None]:
# Read ../train.jsonl through the project helper. The structure of the returned
# objects is defined in utils (not shown here); later cells index `claims` and
# `article_list` by the same position and look up `claim_to_article[claim]`.
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")

In [3]:
def generate_all_tokens(arr):
    """
    Collect every letter-trigram that occurs in a collection of strings.

    Each string is normalised with utils.preprocess_article_name, wrapped in
    "!" boundary markers and split into words with nltk.word_tokenize. These
    are the same steps tokenize_claim performs, so the letter-grams produced
    here for a string are exactly the ones tokenize_claim will look up for it.

    Input: an iterable of raw strings (claims or article names)
    Output: a flat list of letter-trigram strings, duplicates included
    """
    all_tokens = []
    for unprocessed_claim in tqdm_notebook(arr):
        c = utils.preprocess_article_name(unprocessed_claim)
        c = "! {} !".format(c)
        # Word splitting must match tokenize_claim. A plain str.split() keeps
        # punctuation and contractions attached to words, which yields different
        # letter-grams than word_tokenize and leaves holes in the vocabulary.
        for word in nltk.word_tokenize(c):
            # "#" marks the start and end of the word inside its letter-grams.
            all_tokens.extend("".join(l) for l in nltk.ngrams("#" + word + "#", 3))
    return all_tokens

In [4]:
# Letter-grams of every claim, followed by those of every article name.
processed_claims = generate_all_tokens(claims) + generate_all_tokens(article_list)

# Drop duplicates to obtain the vocabulary.
possible_tokens = list(set(processed_claims))

# Fit the label encoder on the sorted vocabulary.
encoder = LabelEncoder()
encoder.fit(np.asarray(sorted(possible_tokens)))

In [5]:
# Letter-gram -> column index lookup, following the order of encoder.classes_.
feature_encoder = {
    letter_gram: position
    for position, letter_gram in tqdm_notebook(enumerate(encoder.classes_))
}

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
def tokenize_claim(c, enc):
    """
    Encode a string as a sequence of word-trigram windows of letter-gram indicators.

    Input:
        c: a string that represents a single claim (or article name)
        enc: the fitted encoder; only len(enc.classes_) is used, to size each
             matrix. Column positions are read from the module-level
             feature_encoder dict, which must have been built from the same
             encoder.
    Output: a list with one sparse 3x|vocabulary| matrix per window of three
        consecutive word tokens. Row i of a matrix has a 1 in the column of
        every letter-trigram of the i-th word of that window.
    Raises: KeyError if a letter-gram is missing from feature_encoder.
    """
    # The vocabulary size is the same for every window, so compute it once.
    vocabulary_size = len(enc.classes_)

    c = utils.preprocess_article_name(c)
    c = "! {} !".format(c)

    encoded_vector = []
    for ngram in nltk.ngrams(nltk.word_tokenize(c), 3):
        arr = sparse.lil_matrix((3, vocabulary_size))
        for idx, word in enumerate(ngram):
            # "#" marks the start and end of the word inside its letter-grams.
            for letter_gram in nltk.ngrams("#" + word + "#", 3):
                arr[idx, feature_encoder["".join(letter_gram)]] = 1
        encoded_vector.append(arr)
    return encoded_vector

In [7]:
# Cache switch read by the "if load_processed_claims:" cell further down.
# NOTE: a later cell reassigns this flag to False before it is read, so on a
# top-to-bottom run this value has no effect.
load_processed_claims = True

In [None]:
# Print the compute devices TensorFlow reports on this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [8]:
# False -> the processing cell below rebuilds the encoded dataset instead of
# loading a previously saved copy from disk.
load_processed_claims = False

In [10]:
import joblib

In [None]:
# Single cache file used by both branches, so that what the rebuild branch
# writes is exactly what the load branch reads back.
ALL_DATA_PATH = "all_data_2.pkl"

if load_processed_claims:
    # joblib.load unpickles arbitrary objects: only point this at a file that
    # was produced by this notebook.
    all_data = joblib.load(ALL_DATA_PATH)
else:
    article_set = set(article_list)

    def process_claim(idx):
        """
        Encode one training example: a claim, its positive article and J
        randomly drawn negative articles.

        Input: idx, a position into `claims`.
        Output: dict with keys 'claim', 'positive_article' and
            'negative_article_0' .. 'negative_article_{J-1}', each holding the
            list returned by tokenize_claim.

        NOTE(review): article_list[idx] is used as the positive article, which
        assumes article_list is aligned with claims — confirm against utils.
        """
        J = 50
        data = {}
        data['claim'] = tokenize_claim(claims[idx], encoder)
        data['positive_article'] = tokenize_claim(article_list[idx], encoder)
        # Negatives are drawn (with replacement) from every article that is not
        # linked to this claim.
        candidates = list(article_set - set(claim_to_article[claims[idx]]))
        negative_articles = np.random.choice(candidates, J)
        for i, name in enumerate(negative_articles):
            data['negative_article_{}'.format(i)] = tokenize_claim(name, encoder)
        return data

    # Encode at most the first 90000 claims, without indexing past the end of
    # the claim list when fewer are available.
    n_claims = min(90000, len(claims))
    all_data = utils.parallel_process(range(n_claims), process_claim, n_jobs=6)

    joblib.dump(all_data, ALL_DATA_PATH)

HBox(children=(IntProgress(value=0, max=89997), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Beginning the Model

In [None]:
# Execute the model-definition script.
# NOTE(review): later cells use `model`, `J` and `backend`, none of which are
# defined anywhere in this notebook — presumably they are created by this
# script; confirm.
%run deep_semantic_similarity_keras.py

In [None]:
from scipy import sparse
from matplotlib import pyplot as plt
import numpy as np
from keras.utils import multi_gpu_model
import keras
import pickle

In [None]:
# True -> the next cell loads the stacked dataset from saved_data.pkl instead
# of rebuilding it from all_data.
load_processed_claims = True

In [None]:
# Inputs kept for training, in the order they are inserted into `data`.
dataset_keys = ["claim", "positive_article"] + ["negative_article_{}".format(n) for n in range(4)]

if not load_processed_claims:
    # Stack each example's list of window matrices into one sparse matrix.
    data = {key: [] for key in dataset_keys}
    for d in tqdm_notebook(all_data):
        for key in dataset_keys:
            data[key].append(sparse.vstack(d[key]))

    with open("saved_data.pkl", "wb") as f:
        pickle.dump(data, f)
else:
    with open("saved_data.pkl", "rb") as f:
        data = pickle.load(f)

Next, we train the model batch by batch, feeding it from a data generator.

In [None]:
# Number of examples available (one stacked matrix per claim).
len(data['claim'])

In [None]:
# Target row for a single example: J+1 columns with the 1 in column 0.
y = np.eye(1, J + 1)
y

In [None]:
# Names of the inputs stored in the dataset dict.
data.keys()

In [None]:
class DataGenerator(keras.utils.Sequence):
    """
    Feeds the model one example at a time.

    Every item is an (inputs, y) pair. `inputs` maps each key of `data` to that
    example's matrix, densified and given a leading batch axis of size 1; `y`
    is a (1, J+1) row with a 1 in column 0.

    Examples are served individually because each sparse matrix is densified
    on its own and nothing here pads or stacks them into a common shape.
    """
    def __init__(self, data, J, batch_size=32, split=None):
        """
            data: dict mapping input names to lists of scipy sparse matrices,
                  one entry per example (all lists are assumed to have the
                  same length)
            J: number of negative articles per example; sets the width of y
            batch_size: accepted and stored for backward compatibility only.
                  Every item holds exactly one example, so this value does not
                  influence the length of the sequence or the shape of y.
            split: optional iterable of example indices to serve; when None,
                  every example is served
        """
        if split is not None:
            # Materialise ranges/iterators so the indices can be indexed and,
            # if desired later, shuffled in place.
            self.indicies = list(split)
        else:
            # len(data) would count the dict's keys, not its examples, so the
            # example count is taken from one of the per-input lists.
            first_input = next(iter(data.values()), [])
            self.indicies = list(range(len(first_input)))
        self.data = data
        self.J = J
        self.batch_size = batch_size

    def __len__(self):
        # One item per example (see class docstring).
        return len(self.indicies)

    def __getitem__(self, index):
        return self.get_item(index)

    def get_item(self, index):
        """
        Return the (inputs, y) pair for the example at position `index` of the
        index list.
        """
        idx = self.indicies[index]

        # toarray() yields a plain ndarray; the new leading axis is the batch
        # axis of size 1.
        final = {k: self.data[k][idx].toarray()[np.newaxis, ...] for k in self.data}

        # The row count must match the single example in `final`.
        y = np.zeros((1, self.J + 1))
        y[:, 0] = 1

        return final, y

    def on_epoch_end(self):
        # Shuffling between epochs is intentionally disabled.
        pass

In [None]:
# Training generator over example indices 0..89999, with 4 negatives per
# example and one example per step.
generator = DataGenerator(data, 4, batch_size=1, split=range(0, 90000))

In [None]:
# Configure training: categorical cross-entropy loss, Adadelta optimizer,
# accuracy reported as a metric.
model.compile(loss="categorical_crossentropy", optimizer="adadelta", metrics=['accuracy'])

In [None]:
# Force a garbage-collection pass before training starts.
import gc
gc.collect()

In [None]:
# Train for 20 epochs, pulling items from the generator with a single worker.
model.fit_generator(generator=generator, epochs=20, workers=1)

In [None]:
# NOTE(review): scratch cell with no effect on the analysis — presumably a
# kernel responsiveness check; confirm it can be removed.
1+1

In [None]:
# Wrap the model for training on 2 GPUs and compile the wrapper with the same
# loss and optimizer as the single-device model (no metrics are requested here).
parallel_model = multi_gpu_model(model, gpus=2)
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer='adadelta')

In [None]:
def reset_weights(model):
    """
    Re-run the weight initializers of a model's layers, discarding what the
    layers have learned.

    For every layer in model.layers that exposes a kernel_initializer, the
    kernel variable's initializer is run in the current backend session. The
    same is done for the layer's bias when it has one, so that no trained
    parameters of such a layer survive the reset. Layers without these
    attributes are left untouched.

    NOTE(review): `backend` is not defined in this notebook — presumably it is
    keras.backend provided by the %run script; confirm.
    """
    session = backend.get_session()
    for layer in model.layers:
        if hasattr(layer, 'kernel_initializer'):
            layer.kernel.initializer.run(session=session)
        # A layer may declare a bias initializer yet have no bias variable,
        # hence the additional None check.
        if hasattr(layer, 'bias_initializer') and getattr(layer, 'bias', None) is not None:
            layer.bias.initializer.run(session=session)

In [None]:
# Re-initialise the model before the next experiment.
reset_weights(model)

In [None]:
# Validation generator over example indices 90000..124999; batch_size is left
# at the class default.
# NOTE(review): the preprocessing cell above encodes only range(90000)
# examples, so these indices exist only if saved_data.pkl holds more — confirm.
validation = DataGenerator(data, J, split=range(90000, 125000))

In [None]:
# Evaluate the model on the validation generator.
model.evaluate_generator(generator=validation)

In [None]:
# Fit on one example at a time: every input is densified and given a leading
# batch axis of size 1, then passed to model.fit together with the target row y.
input_names = ("claim", "positive_article", "negative_article_0", "negative_article_1",
               "negative_article_2", "negative_article_3")
for i in tqdm_notebook(range(len(data['claim']))):
    batch = {name: np.expand_dims(data[name][i].todense(), 0) for name in input_names}
    model.fit(batch, y)

In [None]:
# Attempt to fit on the whole dataset in a single call.
# NOTE(review): `data` holds lists of sparse matrices and `y` is a single
# (1, J+1) row, so this looks like an abandoned experiment — confirm.
model.fit(data, y)

In [None]:
article_set = set(article_list)

def process_claim(idx):
    """
    Encode the claim at position idx together with its positive article and J
    randomly chosen negative articles.

    Returns a dict with the keys 'claim', 'positive_article' and
    'negative_article'; the last one holds a list with one encoding per
    sampled negative article.
    """
    claim_text = claims[idx]
    return {
        'claim': tokenize_claim(claim_text, encoder),
        'positive_article': tokenize_claim(article_list[idx], encoder),
        # Negatives are sampled from the articles not linked to this claim.
        'negative_article': [
            tokenize_claim(name, encoder)
            for name in np.random.choice(list(article_set - set(claim_to_article[claim_text])), J)
        ],
    }

In [None]:
# Smoke-test the encoder on the first claim.
process_claim(0)

In [None]:
# Inspect which positions of the first encoded window of the first claim
# compare equal to 0.
np.argwhere(all_data[0]['claim'][0]==0)

In [None]:
# Show the model's input tensors.
model.inputs

In [None]:
# NOTE(review): fit() is called without any training data; this looks like an
# unfinished scratch cell — confirm whether it can be removed.
model.fit()