In [None]:
import nltk
from glob import glob
import os
import json
import re
import numpy as np
import pickle
import string
from tqdm import tqdm 
from collections import defaultdict
from skopt import gp_minimize
from tensorflow.keras.models import model_from_json
import h5py
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.utils import plot_model

In [None]:
from utils import flatten, Sentence, Document, Claim, get_doc, load_claims, load_docs, CoxedFour, load_test_claims, preprocess

In [None]:
# FEVER claim files (.jsonl), one per split. Only train_path and dev_path are
# read by the cells visible in this notebook.
train_path = '../data/fever-data/train.jsonl'
dev_path = '../data/fever-data/dev.jsonl'
test_path = '../data/fever-data/test.jsonl'
# Every .jsonl file under wiki-pages; handed to get_doc() (via get_sentence)
# when evidence sentences are looked up.
doc_collections = glob('../data/wiki-pages/*.jsonl')
# Directory containing the GloVe vector files (glove.6B.<dim>d.txt).
glove_path = '../data/glove/'

In [None]:
# Ids of the ten given claims used for the tf-idf task.
# NOTE(review): no other cell visible in this notebook references this list --
# confirm it is still needed here.
claim_ids_ten = [75397,150448,214861,156709,129629,33087,6744,226034,40190,76253]

In [None]:
def get_sentence(doc_id, sentence_id, doc_collections, doc_lookup):
    """Return sentence ``sentence_id`` of the document ``doc_id``.

    The document is fetched with ``get_doc(doc_id, doc_collections, doc_lookup)``.
    If that lookup raises ``KeyError`` an empty string is returned instead, so
    callers can keep going when a document is missing. Errors raised while
    indexing ``sentences`` are not caught and propagate to the caller.
    """
    try:
        document = get_doc(doc_id, doc_collections, doc_lookup)
    except KeyError:
        # Unknown document: hand back an empty placeholder.
        return ""
    return document.sentences[sentence_id]

In [None]:
# Load the pickled doc_lookup object; it is passed through get_sentence() to
# get_doc() whenever an evidence sentence has to be resolved.
# NOTE: pickle.load can execute arbitrary code -- only load pickles created locally.
with open('../pickle_jar/doc_lookup.pckl','rb') as f: 
    doc_lookup = pickle.load(f)

In [None]:
def get_preprocessed_features(claims_path, doc_lookup, doc_collections, max_claims=3000):
    """Build token, POS-tag and label features for the claims in ``claims_path``.

    Parameters
    ----------
    claims_path : str
        Path handed to ``load_claims``.
    doc_lookup, doc_collections
        Passed unchanged to ``get_sentence`` to resolve evidence sentences.
    max_claims : int or None, optional
        Only the first ``max_claims`` loaded claims are processed. The default
        (3000) keeps the previously hard-coded limit; ``None`` processes all.

    Returns
    -------
    tuple
        ``(preprocessed_claims, preprocessed_evidence, labels, claims_tags,
        evidences_tags)``. The four feature lists hold one entry per claim;
        ``labels`` is an ``(n_claims, 1)`` integer array that is 1 where
        ``claim.truthfulness == "true"`` and 0 otherwise.

    Notes
    -----
    ``claim.sentence`` and the flattened evidence sentences are fed to
    ``nltk.pos_tag`` as-is, i.e. they are assumed to be token sequences
    already -- TODO confirm against ``utils.load_claims``.
    """
    claims = load_claims(claims_path, None)[:max_claims]

    # One entry per claim in each of these lists.
    preprocessed_claims = []
    claims_tags = []
    labels = []
    preprocessed_evidence = []
    evidences_tags = []

    for claim in tqdm(claims):
        tokenized_claim = claim.sentence
        # Merge all evidence sentences that have a source into one sequence.
        tokenized_evidence = flatten([
            get_sentence(evidence.source, evidence.string_id, doc_collections, doc_lookup)
            for evidence in claim.evidence
            if evidence.source is not None
        ])

        # nltk.pos_tag yields (token, tag) pairs; only the tags are kept.
        claims_tags.append([tag for _, tag in nltk.pos_tag(tokenized_claim)])
        evidences_tags.append([tag for _, tag in nltk.pos_tag(tokenized_evidence)])

        preprocessed_claims.append(tokenized_claim)
        preprocessed_evidence.append(tokenized_evidence)
        labels.append(claim.truthfulness)

    # Binary column vector: 1 for "true", 0 for everything else.
    labels = np.where(np.array(labels) == "true", 1, 0).reshape((-1, 1))

    return preprocessed_claims, preprocessed_evidence, labels, claims_tags, evidences_tags

In [None]:
def get_vocab(word_lists):
    """Build an index vocabulary from several collections of token sequences.

    Parameters
    ----------
    word_lists : iterable
        Iterable of collections; each collection is an iterable of token
        sequences (e.g. ``[claims, evidence, dev_claims, dev_evidence]``).

    Returns
    -------
    vocab_lookup : dict
        Maps each distinct token to an integer id starting at 1. Id 0 is left
        unused so it can serve as the padding value.
    vocab : list
        The distinct tokens in sorted order; ``vocab[i - 1]`` has id ``i``.
    """
    unique_tokens = set()
    for word_list in word_lists:
        for words in word_list:
            unique_tokens.update(words)

    # Sort before numbering: set iteration order for strings varies between
    # interpreter runs, which would make ids (and everything pickled or
    # trained on top of them) irreproducible.
    vocab = sorted(unique_tokens)
    vocab_lookup = {word: index for index, word in enumerate(vocab, start=1)}

    return vocab_lookup, vocab

In [None]:
def get_maxlen(feature_list):
    """Return the length of the longest entry across all given features.

    ``feature_list`` is a sequence of features, each of which is a sequence of
    sized entries. The result is the numpy maximum over every feature's
    longest entry.
    """
    longest_per_feature = [max(map(len, feature)) for feature in feature_list]
    return np.max(longest_per_feature)

In [None]:
def convert_feature(maxlen, vocab_lookup, feature):
    """Encode token sequences as zero-padded rows of integer ids.

    Every token of every entry in ``feature`` is replaced by
    ``vocab_lookup[token]`` (unknown tokens raise ``KeyError``), each row is
    right-padded with zeros up to ``maxlen``, and the rows are returned as a
    numpy array.
    """
    padded_rows = []
    for entry in feature:
        token_ids = [vocab_lookup[token] for token in entry]
        # Right-pad with the reserved id 0 up to maxlen.
        padded_rows.append(token_ids + [0] * (maxlen - len(token_ids)))

    return np.array(padded_rows)

In [None]:
# True: the next cell restores the preprocessed artifacts from ../pickle_jar.
# False: the next cell recomputes them from the claim files and re-pickles them.
load_files = False

In [None]:
# Single source of truth for the cached artifacts: every entry is both the file
# name under ../pickle_jar and the notebook variable the artifact is bound to.
feature_list_str = ['preprocessed_claims', 'claims_tags',
                    'dev_preprocessed_claims', 'dev_claims_tags',
                    'preprocessed_evidence', 'evidences_tags',
                    'dev_preprocessed_evidence', 'dev_evidences_tags',
                    'tag_vocab_lookup', 'tag_vocab',
                    'vocab', 'vocab_lookup',
                    'maxlen_claims', 'maxlen_evidence',
                    'labels', 'dev_labels']

if load_files:
  # Restore every artifact from its pickle instead of recomputing it.
  # NOTE: pickle.load can execute arbitrary code -- only load files this notebook wrote.
  for feature_str in feature_list_str:
    with open('../pickle_jar/{}'.format(feature_str),'rb') as f:
      # Bind the loaded object to the notebook-level variable of the same name.
      globals()[feature_str] = pickle.load(f)

  print('Files loaded.')

else:
  # Raw token / POS-tag sequences and labels for the train and dev claims.
  preprocessed_claims, preprocessed_evidence, labels, claims_tags, evidences_tags = get_preprocessed_features(train_path,doc_lookup, doc_collections)
  dev_preprocessed_claims, dev_preprocessed_evidence, dev_labels, dev_claims_tags, dev_evidences_tags = get_preprocessed_features(dev_path,doc_lookup, doc_collections)

  # Word and tag vocabularies are built over train and dev together so every
  # token seen in either split has an id.
  vocab_lookup, vocab = get_vocab([preprocessed_claims, preprocessed_evidence,dev_preprocessed_claims, dev_preprocessed_evidence])
  tag_vocab_lookup, tag_vocab = get_vocab([claims_tags, evidences_tags, dev_claims_tags, dev_evidences_tags])

  # One padding length for claims and one for evidence, shared by words and tags.
  maxlen_claims = get_maxlen([preprocessed_claims,dev_preprocessed_claims,claims_tags,dev_claims_tags])
  maxlen_evidence = get_maxlen([preprocessed_evidence,dev_preprocessed_evidence,evidences_tags,dev_evidences_tags])

  # Replace the token sequences with zero-padded integer arrays.
  preprocessed_claims = convert_feature(maxlen_claims, vocab_lookup, preprocessed_claims)
  claims_tags = convert_feature(maxlen_claims, tag_vocab_lookup, claims_tags)
  dev_preprocessed_claims = convert_feature(maxlen_claims, vocab_lookup, dev_preprocessed_claims)
  dev_claims_tags = convert_feature(maxlen_claims, tag_vocab_lookup, dev_claims_tags)

  preprocessed_evidence = convert_feature(maxlen_evidence, vocab_lookup, preprocessed_evidence)
  evidences_tags = convert_feature(maxlen_evidence, tag_vocab_lookup, evidences_tags)
  dev_preprocessed_evidence = convert_feature(maxlen_evidence, vocab_lookup, dev_preprocessed_evidence)
  dev_evidences_tags = convert_feature(maxlen_evidence, tag_vocab_lookup, dev_evidences_tags)

  # Collect the values in the same order as feature_list_str by looking each
  # name up, so names and values can never drift out of alignment.
  feature_list = [globals()[feature_str] for feature_str in feature_list_str]

  # Persist every artifact so later runs can set load_files = True.
  for feature_str, feature in zip(feature_list_str, feature_list):
      with open('../pickle_jar/{}'.format(feature_str),'wb') as f:
          pickle.dump(feature,f)
 

In [None]:
# Write dev_preprocessed_evidence to ../pickle_jar.
# NOTE(review): the preprocessing cell above already pickles this same variable
# to this same path when load_files is False (and when load_files is True this
# just rewrites what was loaded), so this cell looks like a leftover -- confirm
# before removing.
with open('../pickle_jar/dev_preprocessed_evidence','wb') as f:
  pickle.dump(dev_preprocessed_evidence,f)

# Import GloVe embeddings

In [None]:
# Dimensionality of the GloVe vectors; selects the file glove.6B.<dimensions>d.txt.
dimensions = 50

# Maps each token in the GloVe file to its float32 coefficient vector.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open(os.path.join(glove_path, 'glove.6B.{}d.txt'.format(dimensions)), encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <coef_1> ... <coef_n>", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Row i holds the GloVe vector of the word whose id is i. Row 0 belongs to the
# padding id and, like the row of any word missing from GloVe, stays all-zeros.
embedding_matrix = np.zeros((len(vocab_lookup) + 1, dimensions))
for word, i in vocab_lookup.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: keep the zero row.
        continue
    embedding_matrix[i] = embedding_vector

# Text classification with an RNN

## Create the model

Build the model with the Keras functional API, using `tf.keras.Sequential` sub-blocks. We have two input channels, one for the claims and one for the evidence (relevant sentences). Each input channel is designed to have two inputs: the sentences themselves and their POS tags. Each of these inputs first goes through an embedding layer: the POS tags go through a trainable embedding layer, whereas the sentences go through the frozen GloVe embedding layer. These are then concatenated and put through a BiLSTM (so there are two BiLSTMs running in parallel). The outputs of the two BiLSTMs are then concatenated and put through a dense layer before the final layer for two-way classification. Note: `construct_model` below currently wires up only the word (GloVe) inputs; the POS-tag embeddings described here are not part of the model it builds.

In [None]:
def construct_model(params, maxlen_claims, maxlen_evidence, dimensions,embedding_matrix):
    """Build and compile the two-channel BiLSTM claim/evidence classifier.

    Each channel (claims, evidence) takes a padded sequence of word ids, embeds
    it with the frozen pretrained ``embedding_matrix`` and encodes it with a
    bidirectional LSTM of 64 units. The two encodings are concatenated and
    passed through ``Dense(64, relu)`` and ``Dense(1, sigmoid)``.

    Parameters
    ----------
    params : list
        Accepted for interface compatibility; this function does not read it.
        NOTE(review): the tag-embedding, hidden-size, dense-size and dropout
        entries of ``params`` are not applied here -- layer sizes are fixed.
    maxlen_claims, maxlen_evidence : int
        Padded sequence lengths of the claims and evidence inputs.
    dimensions : int
        Width of the word embedding (number of columns of ``embedding_matrix``).
    embedding_matrix : array-like
        Pretrained embedding weights, one row per word id (row 0 = padding).

    Returns
    -------
    tf.keras.Model
        Model with inputs ``[claims, evidence]``, compiled with binary
        cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    # Take the vocabulary size from the weights themselves so the layer cannot
    # disagree with the matrix it is initialised with (and no global is needed).
    vocab_size = len(embedding_matrix)

    claims_input = tf.keras.Input(shape=[maxlen_claims], dtype=tf.int32)
    claims_output = tf.keras.Sequential([
        # input_length must describe the claims sequence, not the evidence one.
        tf.keras.layers.Embedding(vocab_size, dimensions, weights=[embedding_matrix], input_length=maxlen_claims, trainable=False),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    ])(claims_input)

    evidence_input = tf.keras.Input(shape=[maxlen_evidence], dtype=tf.int32)
    evidence_output = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, dimensions, weights=[embedding_matrix], input_length=maxlen_evidence, trainable=False),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
    ])(evidence_input)

    # Merge the two sentence encodings before classification.
    common_output = tf.keras.layers.Concatenate()([claims_output, evidence_output])

    final_output = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])(common_output)

    model = tf.keras.Model(inputs=[claims_input, evidence_input], outputs=final_output)

    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

    return model

In [None]:
def run_model(params, 
              maxlen_claims = maxlen_claims, maxlen_evidence = maxlen_evidence, 
              dimensions = dimensions, embedding_matrix = embedding_matrix, 
              return_model = False):
    """Build the model with ``construct_model`` and train it on the notebook data.

    Training uses the notebook-level arrays ``preprocessed_claims`` /
    ``preprocessed_evidence`` with ``labels``; validation uses the matching
    ``dev_*`` arrays with ``dev_labels``.

    Parameters
    ----------
    params : list
        Hyper-parameter list; ``params[6]`` is the number of epochs.
    maxlen_claims, maxlen_evidence, dimensions, embedding_matrix
        Forwarded to ``construct_model``. The defaults are the notebook-level
        values at the time this cell is executed.
    return_model : bool
        If True return ``(model, history_dict)``; otherwise return only the
        validation loss of the last epoch.

    Notes
    -----
    ``construct_model`` builds a model with exactly two inputs (claim word ids
    and evidence word ids), so exactly those two arrays are fed to ``fit``.
    The POS-tag arrays are not passed: the model has no inputs for them.
    """
    model = construct_model(params, maxlen_claims, maxlen_evidence, dimensions,embedding_matrix)

    # Inputs must match the model's [claims, evidence] inputs one-to-one, and
    # validation must use the dev split for every input.
    history = model.fit([preprocessed_claims, preprocessed_evidence],
                    labels,
                    validation_data=([dev_preprocessed_claims, dev_preprocessed_evidence], dev_labels),
                    epochs=params[6])

    if return_model:
        return model, history.history

    # Scalar objective: validation loss after the final epoch.
    return history.history['val_loss'][-1]


In [None]:
# Hyper-parameter list, addressed by position.
# NOTE: in the cells of this notebook only index 6 (epochs) is read, by
# run_model(); construct_model() receives the list but does not read it.
params = [
    # [0] Dimensions for embedding claims tags
    35,
    # [1] Hidden size of the claims BiLSTM (labelled "hidden layers" by the
    #     author; presumably LSTM units -- TODO confirm)
    31,
    # [2] Dimensions for embedding evidence tags
    34,
    # [3] Hidden size of the evidence BiLSTM (presumably LSTM units -- TODO confirm)
    23,
    # [4] Dense layer size
    97, 
    # [5] Dropout rate
    0.34,
    # [6] Epochs
    7
    
]

## Train the model

In [None]:
# Set only one of the below to be true.
# load_model: rebuild the network and restore its weights from ../models/model_q8.h5.
# run_final_model: train the network, save its weights, and draw the performance plots.
# load_model is tested first, so it wins if both are True.
load_model = True
run_final_model = False

In [None]:
# Either restore a previously trained model or train a new one, depending on
# the two flags set above.
# NOTE: if both flags are False, `model` is never defined and the later
# plot_model cell will raise NameError.
if load_model == True: 
  # Rebuild the architecture, then fill it with the saved weights.
  model = construct_model(params, maxlen_claims, maxlen_evidence, dimensions,embedding_matrix)
  model.load_weights("../models/model_q8.h5")
  print('Model loaded.')
  
elif run_final_model == True: 
  # return_model=True makes run_model return (model, history dict).
  model, history = run_model(params,
                    maxlen_claims = maxlen_claims, maxlen_evidence = maxlen_evidence, 
                    dimensions = dimensions, embedding_matrix = embedding_matrix,
                    return_model = True)
  
  # serialize weights to HDF5
  model.save_weights("../models/model_q8.h5")
  print("Saved weights to disk")
  

## Performance plots

In [None]:
# Loss and accuracy curves; only drawn when the model was trained in this session.
if run_final_model:
  # Per-epoch metrics recorded by the most recent model.fit() call.
  training_log = model.history.history
  # Older Keras releases record the accuracy metric as 'acc', newer ones as
  # 'accuracy'; use whichever key this run produced instead of hard-coding one.
  acc_key = 'acc' if 'acc' in training_log else 'accuracy'

  plt.figure(figsize=(15, 4))

  # Left panel: training vs. validation loss.
  plt.subplot(1,2,1)
  plt.plot(training_log['loss'])
  plt.plot(training_log['val_loss'])
  plt.xlabel("Epochs")
  plt.ylabel('loss')
  plt.legend(['loss', 'val_loss'])

  # Right panel: training vs. validation accuracy.
  plt.subplot(1, 2, 2)
  plt.plot(training_log[acc_key])
  plt.plot(training_log['val_' + acc_key])
  plt.xlabel("Epochs")
  plt.ylabel('acc')
  plt.legend(['acc', 'val_acc'])

  plt.tight_layout()
  plt.savefig('plots_q8.png')
  plt.show()

In [None]:
# Write a diagram of the model architecture, including tensor shapes, to
# ../output_files/model_arch_q8.png.
# NOTE(review): plot_model is imported from standalone `keras`, while `model`
# is built with `tf.keras` -- confirm the two are compatible in this environment.
plot_model(model, to_file='../output_files/model_arch_q8.png', show_shapes=True)