# Load metadata
This assumes that the file train.zip has been unzipped in the current directory. If needed, insert code here to load the data from your computer.

In [27]:
import json
import numpy as np
from google.colab import drive

# Mount Google Drive so the archives referenced below are reachable from Colab.
drive.mount('/content/drive/')
# One-off extraction of the datasets; uncomment on a fresh runtime.
# !unzip "/content/drive/My Drive/Colab Notebooks/train.zip"
# !unzip "/content/drive/My Drive/Colab Notebooks/fr-en.zip"

# load metadata
# (this heading used to be written as `!# load metadata`, which handed the
# comment to the shell instead of leaving it as a Python comment)
with open("train.json", 'r', encoding='utf-8') as f:
    metadata = json.load(f)
# Number of claims available in the metadata file.
n_claims = len(metadata)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Preprocess articles

This code preprocesses the articles to extract the top 5 sentences with the greatest similarity to the claim according to tf-idf.

In [28]:
def preprocess_articles(n_claims=1, top_k=5, metadata_path="train.json",
                        articles_dir="train_articles"):
    """Pick, for each claim, the article sentences most similar to it under tf-idf.

    For every processed claim the related articles are concatenated, split
    into sentences, and vectorised together with the claim. The sentences are
    then ranked by the dot product of their tf-idf vector with the claim's.

    Args:
        n_claims: number of claims (from the start of the metadata list) to
            process. Defaults to 1, which is what the notebook used so far;
            pass ``None`` to process every claim.
        top_k: maximum number of sentences kept per claim.
        metadata_path: path of the JSON metadata file.
        articles_dir: directory holding the ``<article_id>.txt`` files.

    Returns:
        ``(metadata, relevant_sentences)`` where ``relevant_sentences[i]`` is
        the list (most similar first, at most ``top_k`` long) of sentences
        selected for ``metadata[i]``.
    """
    # Kept local so that the heavy NLP dependencies are only needed when the
    # preprocessing is actually run.
    from nltk.tokenize import sent_tokenize
    import nltk
    nltk.download('punkt')
    from sklearn.feature_extraction.text import TfidfVectorizer

    # load metadata
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    if n_claims is None:
        n_claims = len(metadata)
    n_claims = min(n_claims, len(metadata))

    relevant_sentences = []
    for claim_idx in range(n_claims):

        if claim_idx % 500 == 0:
            print("Claims preprocessed: ", claim_idx)

        # concatenate the text of all related articles
        articles = ""
        for article_id in metadata[claim_idx]['related_articles']:
            filename = articles_dir + "/" + str(article_id) + ".txt"
            with open(filename, 'r', encoding='utf-8') as text_file:
                articles = articles + "\n" + text_file.read()

        # split articles into sentences; the claim goes last so that it shares
        # the tf-idf vocabulary and always sits in the final row of X
        sentences = sent_tokenize(articles)
        sentences.append(metadata[claim_idx]['claim'])

        # vectorize sentences based on tf-idf
        X = TfidfVectorizer().fit_transform(sentences)

        # Similarity between the claim (last row) and every article sentence.
        # The slice must be [:-1]: the previous [:-2] also dropped the last
        # article sentence, so it could never be selected.
        similarity = (X[-1, :] @ X[:-1, :].T).toarray().ravel()

        # indices of the top_k most similar sentences, best first
        best_first = np.argsort(similarity)[::-1][:top_k]
        relevant_sentences.append([sentences[i] for i in best_first])

    return metadata, relevant_sentences

# Run the preprocessing and show what was extracted for the first claim.
metadata, relevant_sentences = preprocess_articles()
first_claim = metadata[0]
for heading, value in (
    ("Metadata of claim 0:", first_claim['claim']),
    ("Metadata of claimant 0:", first_claim['claimant']),
    ("Relevant sentences of claim 0:", relevant_sentences[0]),
):
    print(heading)
    print(value)
# print(relevant_sentences[0][0])
# print(relevant_sentences[0][1])
# print(relevant_sentences[0][2])
# print(relevant_sentences[0][3])
# print(relevant_sentences[0][4])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Claims preprocessed:  0
Metadata of claim 0:
A line from George Orwell's novel 1984 predicts the power of smartphones.
Metadata of claimant 0:

Relevant sentences of claim 0:
['1984 by George Orwell\n1984 is a dystopian novel by English author George Orwell published in 1949.', 'Theater Review: \'1984\'\nEarly this year, sales of George Orwell\'s novel "1984" spiked after the words "alternative facts" entered the lexicon.', 'It is truly frightening to see the parallels between George Orwell\'s dystopian novel "1984" and the state of our union today.', '\n1984: George Orwell predicted 2017 almost 70 years ago\nApril, 1984.', 'The line is from one of the characters that works for the Government, otherwise known as Big Brother.']


# Sample a claim
Sample a claim, tokenize it and embed it.

In [29]:
# Install the BPEmb package into the running environment (notebook shell command).
!pip install bpemb
from bpemb import BPEmb
# Embedding dimensionality requested from BPEmb.
n_embedding_dims = 50
# English BPEmb model; sampleClaim() below calls bpemb_en.embed(...) on raw text.
bpemb_en = BPEmb(lang="en", dim=n_embedding_dims)

def sampleClaim(metadata):
    """Draw a random claim and return it embedded in three input variants.

    Relies on the module-level ``relevant_sentences`` (aligned by index with
    ``metadata``) and on the module-level ``bpemb_en`` embedder.

    Args:
        metadata: list of claim records with ``claim``, ``claimant`` and
            ``label`` keys.

    Returns:
        A 10-tuple
        ``(claim_tensor, claim_claimant_tensor, claim_claimant_sentences_tensor,
        label_tensor, claim, claimant, sentence_0, [], label, id)``.
        Each tensor has shape ``(n_tokens, 1, embedding_dim)``; the three input
        tensors are concatenated along the token axis. The eighth element is an
        empty-list placeholder kept for compatibility with existing callers.
        When the sampled claim does not have exactly 5 relevant sentences,
        every element of the tuple is the string ``"SKIP"``.
    """
    # Local imports: `random` is not imported anywhere in the notebook and
    # `torch` is only imported in a later cell.
    import random
    import torch

    def _embed(text):
        # bpemb yields a (n_tokens, dim) array; insert a batch axis of size 1.
        return torch.tensor(bpemb_en.embed(text), dtype=torch.float).unsqueeze(1)

    # Only indices valid for both containers can be sampled.
    n_available = min(len(metadata), len(relevant_sentences))
    claim_idx = random.randint(0, n_available - 1)

    record = metadata[claim_idx]
    claim = record["claim"]
    claimant = record["claimant"]
    label = record["label"]

    sentences = relevant_sentences[claim_idx]
    if len(sentences) != 5:
        return ("SKIP",) * 10

    claim_tensor = _embed(claim)
    claimant_tensor = _embed(claimant)
    sentence_tensors = [_embed(sentence) for sentence in sentences]

    label_tensor = torch.tensor([label], dtype=torch.long)

    claim_claimant_tensor = torch.cat((claim_tensor, claimant_tensor), 0)
    claim_claimant_sentences_tensor = torch.cat(
        [claim_tensor, claimant_tensor] + sentence_tensors, 0)

    return claim_tensor, claim_claimant_tensor, claim_claimant_sentences_tensor, label_tensor, claim, claimant, sentences[0],\
            [], label, claim_idx

Collecting bpemb
  Downloading https://files.pythonhosted.org/packages/bc/70/468a9652095b370f797ed37ff77e742b11565c6fd79eaeca5f2e50b164a7/bpemb-0.3.0-py3-none-any.whl
Collecting sentencepiece (from bpemb)
[?25l  Downloading https://files.pythonhosted.org/packages/00/95/7f357995d5eb1131aa2092096dca14a6fc1b1d2860bd99c22a612e1d1019/sentencepiece-0.1.82-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 7.9MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.0 sentencepiece-0.1.82
downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 553702.89B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1924908/1924908 [00:01<00:00, 1915305.23B/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    """Position-wise feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: size of the input and output feature dimension.
        d_ff: size of the hidden layer (defaults to 2048).
        dropout: dropout probability applied after the activation.
    """
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        """Apply the sub-layer to the last dimension of ``x``; the shape is preserved."""
        # torch.nn has no `relu` attribute (only the nn.ReLU module), so the
        # functional form from the top-level torch namespace is used.
        x = self.dropout(torch.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [0]:
class Norm(nn.Module):
    """Normalisation over the last dimension with a learnable gain and offset.

    Args:
        d_model: size of the last dimension of the inputs.
        eps: constant added to the standard deviation before dividing.
    """
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model
        self.eps = eps
        # learnable calibration: gain starts at one, offset at zero
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

# Embedder

In [0]:
class Embedder(nn.Module):
    """Lookup table mapping token ids to ``d_model``-dimensional vectors."""
    def __init__(self, vocab_size, d_model):
        super().__init__()
        # The attribute name `embed` fixes the state_dict key ("embed.weight").
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        vectors = self.embed(x)
        return vectors

In [0]:
class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model) and add fixed sinusoidal position codes.

    For position ``pos`` and even feature index ``i`` the table holds
    ``sin(pos / 10000 ** (i / d_model))`` at column ``i`` and the matching
    cosine at column ``i + 1``.

    Args:
        d_model: feature dimension of the embeddings (odd sizes are supported;
            the last column then only receives the sine term).
        max_seq_len: longest sequence the table is precomputed for.
    """
    def __init__(self, d_model, max_seq_len = 80):
        super().__init__()
        self.d_model = d_model

        # Build the constant table with tensor ops, in float64 for accuracy,
        # instead of a Python double loop that needed `math` (never imported).
        position = torch.arange(max_seq_len, dtype=torch.double).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.double)
        angles = position / (10000.0 ** (even_dims / d_model))

        pe = torch.zeros(max_seq_len, d_model, dtype=torch.double)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]

        # A buffer is saved with the module and follows it across devices.
        self.register_buffer('pe', pe.float().unsqueeze(0))


    def forward(self, x):
        """Add position codes to ``x`` of shape ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed ``max_seq_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError("sequence length %d exceeds max_seq_len %d"
                             % (seq_len, self.pe.size(1)))
        # make embeddings relatively larger
        x = x * (self.d_model ** 0.5)
        # The buffer already lives on the module's device, so no hard-coded
        # .cuda() (which broke CPU runs) and no deprecated Variable wrapper.
        return x + self.pe[:, :seq_len]

# Multi-Headed Attention

In [0]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        heads: number of attention heads; must divide ``d_model``.
        d_model: feature dimension of queries, keys, values and the output.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``heads``.
    """
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        if d_model % heads != 0:
            raise ValueError("d_model (%d) must be divisible by heads (%d)"
                             % (d_model, heads))

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def _attend(self, q, k, v, mask=None):
        """Scaled dot-product attention on ``(bs, h, seq_len, d_k)`` tensors.

        The notebook called a free function ``attention`` here that is not
        defined in any visible cell, so the computation lives on the class.
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            # Add a head axis so one mask is shared by all heads; positions
            # where the mask is 0 receive (numerically) zero weight.
            scores = scores.masked_fill(mask.unsqueeze(1) == 0, -1e9)
        weights = self.dropout(torch.softmax(scores, dim=-1))
        return torch.matmul(weights, v)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors of shape ``(bs, seq_len, d_model)``.
            mask: optional tensor broadcastable to ``(bs, seq_len_q, seq_len_k)``
                with zeros at positions that must not be attended to.

        Returns:
            Tensor of shape ``(bs, seq_len_q, d_model)``.
        """
        bs = q.size(0)

        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * h * sl * d_k
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        scores = self._attend(q, k, v, mask)

        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)

        output = self.out(concat)

        return output

In [0]:
# build an encoder layer with one multi-head attention layer and one feed-forward layer
class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, each with a residual.

    Args:
        d_model: feature dimension.
        heads: number of attention heads.
        dropout: dropout probability, used for both residual branches and
            forwarded to the attention and feed-forward sub-layers (they
            previously always used their own default regardless of this value).
    """
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``; ``mask`` is handed to the attention sub-layer as is."""
        # normalise first, then add the sub-layer output back onto the input
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
    
# We can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    # Local import: `copy` is not imported anywhere in the visible notebook.
    import copy
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [0]:
class Encoder(nn.Module):
    """Embedding, positional encoding, ``N`` encoder layers and a final norm.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: feature dimension.
        N: number of stacked encoder layers.
        heads: number of attention heads per layer.
    """
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, src, mask=None):
        """Encode token ids ``src``; ``mask`` (optional) is passed to every layer.

        The layer loop used to be commented out, so the stack built in
        ``__init__`` never ran. ``mask`` is accepted because ``Transformer``
        calls ``self.encoder(src, src_mask)``; it defaults to ``None`` so
        single-argument calls keep working.
        """
        x = self.embed(src)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
    

In [0]:
class Transformer(nn.Module):
    """Encoder/decoder pair followed by a linear projection to ``trg_vocab`` scores.

    NOTE(review): ``Decoder`` is not defined in the visible cells of this
    notebook - confirm where it comes from before instantiating this class.
    """
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against it and return the projected output."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)
# we don't perform softmax on the output as this will be handled 
# automatically by our loss function

# Training procedure

In [0]:


def train(category_tensor, line_tensor, update=True):
    """Run the module-level ``model`` on one example and optionally take a step.

    Uses the module-level ``model``, ``criterion``, ``rnnOptimizer`` and
    ``classifierOptimizer``.

    Args:
        category_tensor: target passed to ``criterion``.
        line_tensor: input passed to ``model``.
        update: when True, back-propagate and step both optimizers; when
            False, only evaluate (no autograd graph is built).

    Returns:
        ``(output, loss)`` with ``loss`` as a Python float.
    """
    rnnOptimizer.zero_grad()
    classifierOptimizer.zero_grad()

    # The former `hidden = rnn.initHidden()` was dropped: its result was never
    # used and `rnn` is not defined anywhere in the visible notebook.
    if update:
        output = model(line_tensor)
        loss = criterion(output, category_tensor)
        loss.backward()
        rnnOptimizer.step()
        classifierOptimizer.step()
    else:
        # Evaluation only: skip gradient tracking.
        with torch.no_grad():
            output = model(line_tensor)
            loss = criterion(output, category_tensor)

    return output, loss.item()

## Training

In [39]:
!pip install vocab
import vocab
import spacy
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split

import pandas as pd
# Read the English side of the Europarl corpus, one sentence per line.
# Remember to upload the file each time the runtime is reset.
with open('/europarl-v7.fr-en.en', encoding='utf-8') as corpus_file:
    europarl_en = corpus_file.read().split('\n')
# europarl_fr = open('/europarl-v7.fr-en.fr', encoding='utf-8').read().split('\n')
raw_data = {'English' : list(europarl_en)}
df = pd.DataFrame(raw_data, columns=["English"])
# Sentence length (number of spaces). The filters that would drop very long
# sentences and badly aligned translation pairs are currently disabled.
df['eng_len'] = df['English'].str.count(' ')
# df['fr_len'] = df['French'].str.count(' ')
# df = df.query('fr_len < 80 & eng_len < 80')
# df = df.query('fr_len < eng_len * 1.5 & fr_len * 1.5 > eng_len')

en = spacy.load('en')
def tokenize_en(sentence):
    """Split ``sentence`` into a list of token strings with the spaCy English tokenizer."""
    return [tok.text for tok in en.tokenizer(sentence)]

# The field has to exist before it is referenced in `data_fields` below
# (it used to be created only after build_vocab had already been called on it).
EN_TEXT = Field(tokenize=tokenize_en)

# The splits get their own names: binding them to `train` would shadow the
# train() function that train_model() calls.
train_df, val_df = train_test_split(df, test_size=0.1)
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
# associate the text in the 'English' column with the EN_TEXT field
data_fields = [('English', EN_TEXT)]
# skip_header=True keeps the CSV header row from being read as a sentence.
train_dataset, val_dataset = TabularDataset.splits(path='./', train='train.csv', validation='val.csv', format='csv', skip_header=True, fields=data_fields)
EN_TEXT.build_vocab(train_dataset, val_dataset)

# Model hyper-parameters.
d_model = 512
heads = 8
N = 6
# Number of sampled training steps per epoch, read by train_model().
train_iter = 1000

src_vocab = len(EN_TEXT.vocab)
model = Encoder(src_vocab, d_model, N, heads)


for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
# this code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.

optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


# Metric histories read by the plotting cell; train_model() appends one value
# per reporting window to each list.
all_train_losses, all_train_accuracies = [], []
all_test_losses, all_test_accuracies = [], []
all_train_losses_claim_claimant, all_train_accuracies_claim_claimant = [], []
all_test_losses_claim_claimant, all_test_accuracies_claim_claimant = [], []
all_train_losses_claim_claimant_sentences, all_train_accuracies_claim_claimant_sentences = [], []
all_test_losses_claim_claimant_sentences, all_test_accuracies_claim_claimant_sentences = [], []

# (losses, accuracies) pairs in the order of the three input tensors returned
# by sampleClaim(): claim only, claim + claimant, claim + claimant + sentences.
_TRAIN_HISTORIES = (
    (all_train_losses, all_train_accuracies),
    (all_train_losses_claim_claimant, all_train_accuracies_claim_claimant),
    (all_train_losses_claim_claimant_sentences, all_train_accuracies_claim_claimant_sentences),
)
_TEST_HISTORIES = (
    (all_test_losses, all_test_accuracies),
    (all_test_losses_claim_claimant, all_test_accuracies_claim_claimant),
    (all_test_losses_claim_claimant_sentences, all_test_accuracies_claim_claimant_sentences),
)


def _new_window():
    """Fresh accumulators for one reporting window (one slot per input variant)."""
    return {"loss": [0.0, 0.0, 0.0], "correct": [0, 0, 0], "steps": 0}


def _run_sample(window, sample, update):
    """Run train() on the three input variants of ``sample`` and record the results.

    Returns ``(claim text, claim-only guess, label)`` for progress reporting.
    """
    claim_tensor, claim_claimant_tensor, claim_claimant_sentences_tensor, \
        label_tensor, claim, _, _, _, label, _ = sample
    variants = (claim_tensor, claim_claimant_tensor, claim_claimant_sentences_tensor)
    guesses = []
    for slot, input_tensor in enumerate(variants):
        output, loss = train(label_tensor, input_tensor, update=update)
        _, top_index = output.topk(1)
        guess = top_index[0].item()
        window["loss"][slot] += loss
        # Each variant is scored on its own prediction (the third variant used
        # to be scored with the second variant's result).
        window["correct"][slot] += int(guess == label)
        guesses.append(guess)
    window["steps"] += 1
    return claim, guesses[0], label


def _elapsed(since):
    """Format the time elapsed since ``since`` (a time.time() value) as 'Xm Ys'."""
    import time
    return '%dm %ds' % divmod(int(time.time() - since), 60)


def _report(tag, done, total, start, window, last):
    """Print one progress line for the claim-only variant; silent if nothing ran yet."""
    if last is None:
        return
    claim, guess, label = last
    steps = max(window["steps"], 1)
    verdict = '✓' if guess == label else '✗ (%s)' % label
    print('%s: %d  %d%% (%s) average_accuracy=%.4f average_loss=%.4f %s / %s %s' % (
        tag, done, done / total * 100, _elapsed(start),
        window["correct"][0] / steps, window["loss"][0] / steps,
        claim, guess, verdict))


def _flush(window, histories):
    """Append the window's mean loss and accuracy of each variant to its history lists."""
    steps = max(window["steps"], 1)
    for slot, (losses, accuracies) in enumerate(histories):
        losses.append(window["loss"][slot] / steps)
        accuracies.append(window["correct"][slot] / steps)


def train_model(epochs, print_every=100):
    """Train on randomly sampled claims and track train/test metrics.

    Every iteration samples one training claim (with parameter updates) and
    one test claim (evaluation only), each run through the three input
    variants. Every ``print_every`` iterations the averages of the window are
    printed and appended to the module-level ``all_*`` history lists.

    Uses the module-level ``model``, ``train``, ``sampleClaim``, ``train_iter``,
    ``train_data`` and ``test_data``. NOTE(review): ``train_data`` and
    ``test_data`` are not defined in the visible cells - confirm they exist
    before calling this function.

    Args:
        epochs: number of passes of ``train_iter`` iterations each.
        print_every: reporting interval, in iterations.
    """
    # Local import: `time` is not imported anywhere in the visible notebook.
    import time

    model.train()

    start = time.time()
    total_steps = epochs * train_iter

    # Accumulators are created here; they used to be read before any
    # assignment, which raised UnboundLocalError on the first iteration.
    train_window, test_window = _new_window(), _new_window()
    last_train = last_test = None

    for epoch in range(epochs):

        for step in range(1, train_iter + 1):
            train_sample = sampleClaim(train_data)
            if train_sample[-1] != "SKIP":
                last_train = _run_sample(train_window, train_sample, update=True)

                # separate train and test line
                test_sample = sampleClaim(test_data)
                if test_sample[-1] != "SKIP":
                    last_test = _run_sample(test_window, test_sample, update=False)

            # Reporting no longer depends on whether this iteration's sample
            # was skipped, and averages divide by the steps that actually ran.
            if step % print_every == 0:
                done = epoch * train_iter + step
                _report('Train', done, total_steps, start, train_window, last_train)
                _report('Test', done, total_steps, start, test_window, last_test)
                _flush(train_window, _TRAIN_HISTORIES)
                _flush(test_window, _TEST_HISTORIES)
                train_window, test_window = _new_window(), _new_window()




AttributeError: ignored

In [0]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def _plot_accuracy_curves(title, curves, out_path):
    """Draw the three accuracy curves on a new figure, save it and return ``(fig, ax)``."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title(title)
    for curve in curves:  # i, ii, iii accuracies
        ax.plot(curve)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epoch')
    ax.legend(['Claim only', 'Claim and claimant', 'Claim, the claimant and the 5 sentences'], loc='upper left')
    fig.savefig(out_path)
    return fig, ax


f1, ax1 = _plot_accuracy_curves(
    'Training Accuracy',
    (all_train_accuracies,
     all_train_accuracies_claim_claimant,
     all_train_accuracies_claim_claimant_sentences),
    "q2b_train.png",
)

f2, ax2 = _plot_accuracy_curves(
    'Test Accuracy',
    (all_test_accuracies,
     all_test_accuracies_claim_claimant,
     all_test_accuracies_claim_claimant_sentences),
    "q2b_test.png",
)