# Implementing CLSM

## Purpose
The purpose of this notebook is to implement Microsoft's [Convolutional Latent Semantic Model](http://www.iro.umontreal.ca/~lisa/pointeurs/ir0895-he-2.pdf) on our dataset.

## Inputs
- This notebook requires *wiki-pages* from the FEVER dataset as an input.

## Preprocessing Data

In [None]:
import cdssm
import numpy as np
import nltk
import utils
import pickle

from scipy import sparse
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tqdm import tqdm_notebook

In [None]:
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")

Num Distinct Claims 109810
Num Data Points 125051


In [3]:
claim_to_article

{'Robert F. Kennedy defeated Senator Eugene McCarthy in the Ohio primaries.': ['robert f  kennedy'],
 'The Odyssey has been translated into ancient dialects.': ['odyssey'],
 'Big Bang won 7 awards at the 9th MTV Video Music Awards Japan.': ['big bang  south korean band '],
 'Priyanka Chopra is an Indian singer.': ['priyanka chopra',
  'priyanka chopra'],
 'Oz the Great and Powerful was released in conventional theaters 2016.': ['oz the great and powerful'],
 'Cosmopolitan contains content as of 2011 which includes articles on celebrities.': ['cosmopolitan  magazine '],
 'One Teen Choice Award was won by Anne Hathaway.': ['anne hathaway'],
 'Star Wars: Episode II – Attack of the Clones came out on Blu-ray disc.': ['star wars colon  episode ii – attack of the clones'],
 '24 premiered in November.': ['24  tv series '],
 'Jane Eyre was originally published as Jane Eyre: An Autobiography.': ['jane eyre'],
 'Emily Browning was passed over for a role in Sucker Punch.': ['emily browning'],
 'A

In [4]:
claims

['Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'Roman Atwood is a content creator.',
 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.',
 'Adrienne Bailon is an accountant.',
 'Homeland is an American television spy thriller based on the Israeli television series Prisoners of War.',
 'Homeland is an American television spy thriller based on the Israeli television series Prisoners of War.',
 'The Boston Celtics play their home games at TD Garden.',
 'The Ten Commandments is an epic film.',
 'Tetris has sold millions of physical copies.',
 'Cyndi Lauper won the Best New Artist award at the 27th Grammy Awards in 1985.',
 'There is a movie called The Hunger Games.',
 'Ryan Gosling has been to a country in Africa.',
 'Ryan Gosling has been to a country in Africa.',
 'Stranger Things is set in Bloomington

In [5]:
def generate_all_tokens(arr):
    all_tokens = []
    for unprocessed_claim in tqdm_notebook(arr):
        c = utils.preprocess_article_name(unprocessed_claim)
        c = "! {} !".format(c)
        for word in c.split():
            letter_tuples = list(nltk.ngrams("#" + word + "#", 3))
            letter_grams = []
            for l in letter_tuples:
                letter_grams.append("".join(l))
            all_tokens.extend(letter_grams)
    return all_tokens

In [6]:
processed_claims = generate_all_tokens(claims)
processed_claims.extend(generate_all_tokens(article_list))

possible_tokens = list(set(processed_claims))

encoder = LabelEncoder()
encoder.fit(np.array(sorted(possible_tokens)))

HBox(children=(IntProgress(value=0, max=125051), HTML(value='')))




HBox(children=(IntProgress(value=0, max=125051), HTML(value='')))




LabelEncoder()

In [7]:
feature_encoder = {}
for idx, e in tqdm_notebook(enumerate(encoder.classes_)):
    feature_encoder[e] = idx

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
def tokenize_claim(c, enc):
    """
    Input: a string that represents a single claim
    Output: a list of 3x|vocabulary| arrays that has a 1 where the letter-gram exists.
    """
    encoded_vector = []
    c = utils.preprocess_article_name(c)
    c = "! {} !".format(c)
    for ngram in nltk.ngrams(nltk.word_tokenize(c), 3):
        arr = sparse.lil_matrix((3, len(enc.__dict__['classes_'])))
        for idx, word in enumerate(ngram):
            for letter_gram in nltk.ngrams("#" + word + "#", 3):
                s = "".join(letter_gram)
                letter_idx = feature_encoder[s]
                arr[idx, letter_idx] = 1
        encoded_vector.append(arr)
    return encoded_vector

In [9]:
load_processed_claims = True

In [10]:
if load_processed_claims:
    with open("all_data.pkl", "rb") as f:
        all_data = pickle.load(f)
else:
    all_data = []

    article_set = set(article_list)

    def process_claim(idx):
        data = {}
        data['claim'] = tokenize_claim(claims[idx], encoder)
        data['positive_article'] = tokenize_claim(article_list[idx], encoder)
        negative_articles = np.random.choice(list(article_set - set(claim_to_article[claims[idx]])), cdssm.J)
        negative_articles = [tokenize_claim(i, encoder) for i in negative_articles]
        data['negative_article'] = negative_articles
        return data

    all_data = [process_claim(i) for i in tqdm_notebook(range(len(claims)))]
    
    with open("all_data.pkl", "wb") as f:
        pickle.dump(all_data, f)
    #all_data = Parallel(n_jobs=cpu_count(), verbose=1, prefer="threads")(delayed(process_claim)(i) for i in range(len(claims)))

In [11]:
import torch 
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
import cdssm

In [12]:
model = cdssm.CDSSM()

In [13]:
def to_torch_sparse_tensor(M):
    M = M.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((M.row, M.col))).long().cuda()
    values = torch.from_numpy(M.data).cuda()
    shape = torch.Size(M.shape)
    T = torch.sparse.FloatTensor(indices, values, shape)
    return T

In [None]:
all_gpu_data = []
def process_gpu_data(i):
    data = {}
    data['claim'] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['claim'])))
    data['positive_article'] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['positive_article'])))
    data['negative_article'] = [_ for j in range(cdssm.J)]
    for j in range(cdssm.J):
        item = all_data[i]['negative_article'][j]
        if len(item)>0:
            data['negative_article'][j] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['negative_article'][j])))
    return data

all_gpu_data = Parallel(n_jobs=cpu_count(), verbose=1, prefer="threads")(delayed(process_gpu_data)(i) for i in range(10000))

In [None]:
all_gpu_data[0]

In [14]:
all_gpu_data = []
for i in tqdm_notebook(range(1, len(all_data))):
    data = {}
    data['claim'] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['claim'])))
    data['positive_article'] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['positive_article'])))
    data['negative_article'] = [_ for j in range(cdssm.J)]
    for j in range(cdssm.J):
        item = all_data[i]['negative_article'][j]
        if len(item)>0:
            data['negative_article'][j] = Variable(to_torch_sparse_tensor(sparse.vstack(all_data[i]['negative_article'][j])))
    all_gpu_data.append(data)

HBox(children=(IntProgress(value=0, max=125049), HTML(value='')))




In [None]:
model

In [None]:
LETTER_GRAM_SIZE = 3 # See section 3.2.
WINDOW_SIZE = 3 # See section 3.2.
TOTAL_LETTER_GRAMS = len(encoder.__dict__['classes_']) # Determined from data. See section 3.2.
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS # See equation (1).
# Uncomment it, if testing
# WORD_DEPTH = 1000
K = 300 # Dimensionality of the max-pooling layer. See section 3.4.
L = 128 # Dimensionality of latent semantic space. See section 3.5.
J = 4 # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1 # We only consider one time step for convolutions.

In [None]:
# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

In [None]:
# Build a random data set.
train_set_size = int(len(all_gpu_data) * 0.8)

for i in tqdm_notebook(range(train_set_size)):
    claim = all_gpu_data[i]['claim'].to_dense()
    claim = claim.view(1, claim.shape[0] // 3, 3, -1)
    
    positive_article = all_gpu_data[i]['positive_article'].to_dense()
    positive_article = positive_article.view(1, positive_article.shape[0] // 3, 3, -1)
    
    negative_articles = []
    for j in range(cdssm.J):
        negative_article = all_gpu_data[i]['negative_article'][j].to_dense()
        negative_article = negative_article.view(1, negative_article.shape[0]//3, 3, -1)
        negative_articles.append(negative_article)
    
    y_pred = model(claim, positive_article, negative_articles)
    
    y = Variable(torch.from_numpy(np.array(labels[i])).long())
    loss = criterion(y_pred.resize(1,cdssm.J+1), y)
    print (i, loss.data[0])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()