# Dataset pre-processing

## Imports

In [None]:
import os
import requests
import zipfile

import re
from functools import reduce
import pandas as pd
import numpy as np

import gensim
import gensim.downloader as gloader

import scipy
import gc
import time

from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
from keras.layers import Masking
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.optimizers import SGD
from keras.layers import Input
from keras.layers import Dropout
from keras.layers import Reshape
from keras.layers import Masking
from keras.layers import GlobalAveragePooling1D
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras.layers import concatenate
from keras.layers import Average
from keras.layers import Add
from keras.layers import Lambda
from keras import Model
from keras.utils import plot_model
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping
from scipy.spatial.distance import cosine 

## Preprocessing

In [None]:
def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    The body is pulled through ``response.iter_content`` in 32768-byte
    pieces and written in binary mode; empty pieces are skipped.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Falsy pieces (keep-alive chunks) carry no payload, so drop them.
        payload = (piece for piece in response.iter_content(chunk_size) if piece)
        for piece in payload:
            out_file.write(piece)

def download_data(data_path):
    """Download the FEVER data archive into ``data_path`` and extract it.

    Nothing is done when ``<data_path>/fever_data.zip`` already exists.

    Args:
        data_path: directory that receives the archive and the extracted
            files; it is created when missing.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(toy_data_path):
        return

    print("Downloading FEVER data splits...")
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated 'fever_data.zip' that later runs would treat as done.
    partial_path = toy_data_path + '.part'
    with requests.Session() as current_session:
        response = current_session.get(toy_url,
                                       params={'id': toy_data_url_id},
                                       stream=True)
        response.raise_for_status()
        # The body is streamed, so it has to be consumed while the session
        # (and its connection) is still open.
        save_response_content(response, partial_path)
    os.replace(partial_path, toy_data_path)
    print("Download completed!")

    print("Extracting dataset...")
    with zipfile.ZipFile(toy_data_path) as loaded_zip:
        loaded_zip.extractall(data_path)
    print("Extraction completed!")

download_data('dataset')

Downloading FEVER data splits...
Download completed!
Extracting dataset...
Extraction completed!


In [None]:
cwd = os.getcwd()

# Read the train / validation / test pair files from the extracted dataset.
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (cwd + "/dataset/train_pairs.csv",
                     cwd + "/dataset/val_pairs.csv",
                     cwd + "/dataset/test_pairs.csv")
]

# The first column of every split is discarded.
train_df, val_df, test_df = [
    frame.drop(frame.columns[0], axis=1)
    for frame in (train_df, val_df, test_df)
]

#Text cleaning: removing tags from evidence

# Patterns are written as raw strings so the regex escapes (\[ \| \- ...) are
# not parsed as (invalid) Python string escapes; the compiled patterns are
# unchanged.

# Punctuation and bracket characters that get replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'''[/(){}\[\]\|@,;.:`\-'"]''')
# Any character outside the accepted set (alphanumerics, space and a few symbols).
GOOD_SYMBOLS_RE = re.compile(r'''[^0-9a-zA-Z #+_\|@,;.:`\-'"\\/]''')
# A "-LSB- ... -RSB-" span, tags included (non-greedy).
REMOVE_SB_TAGS = re.compile(r'-LSB-(.*?)-RSB-')
# A single "-LRB-" or "-RRB-" tag.
REMOVE_RB_TAGS = re.compile(r'-LRB-|-RRB-')
# A "-LRB- ... -RRB-" span; group 1 is the text between the tags.
TEXT_IN_PARS = re.compile(r'-LRB-(.*?)-RRB-')

def replace_double_apix(text):
  """Turn every pair of consecutive single quotes ('') into one double quote."""
  pieces = text.split("''")
  return '"'.join(pieces)

def replace_special_characters(text):
  """Replace each character matched by REPLACE_BY_SPACE_RE with a space."""
  spaced = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
  return spaced

def remove_SB_text(text):
  """Delete every '-LSB- ... -RSB-' span (tags and content) from ``text``."""
  without_brackets = re.sub(REMOVE_SB_TAGS, '', text)
  return without_brackets

def filter_out_uncommon_symbols(text):
  """Drop every character that GOOD_SYMBOLS_RE matches (i.e. is not accepted)."""
  filtered = re.sub(GOOD_SYMBOLS_RE, '', text)
  return filtered

def handle_parentheses(text):
  """Resolve '-LRB- ... -RRB-' spans one at a time.

  The spans are collected once, up front. For each collected span:
  if its content holds a character outside the accepted symbol set, the
  first span still present in ``text`` is removed entirely; otherwise only
  the first two remaining '-LRB-' / '-RRB-' tags are removed and the
  content is kept.
  """
  for inner in TEXT_IN_PARS.findall(text):
    if GOOD_SYMBOLS_RE.search(inner) is None:
      # Clean content: strip just the two tags.
      text = REMOVE_RB_TAGS.sub('', text, count=2)
      continue
    # Content with unexpected symbols: drop the whole span.
    text = TEXT_IN_PARS.sub('', text, count=1)
  return text
    
def strip_text(text):
  """Collapse whitespace runs to single spaces and trim both ends."""
  tokens = text.split()
  return " ".join(tokens)

def lower_text(text):
  """Return ``text`` converted to lowercase."""
  lowered = text.lower()
  return lowered

def replace_genitivo(text):
  """Insert a space before every "'s" so the genitive becomes its own token."""
  fragments = text.split("'s")
  return " 's".join(fragments)


# Ordered cleaning steps; text_prepare applies them first to last.
# The order matters: the tag-based steps (remove_SB_text, handle_parentheses)
# must run before replace_special_characters, which turns '-' into a space and
# would break the '-LSB-' / '-LRB-' markers, and before lower_text, because the
# tag patterns are upper-case.
# NOTE(review): replace_genitivo runs after replace_special_characters has
# already replaced every apostrophe with a space, so it looks like a no-op
# here - confirm whether the genitive should be preserved.
PREPROCESSING_PIPELINE = [
                          replace_double_apix,
                          remove_SB_text,
                          handle_parentheses,
                          replace_special_characters,
                          replace_genitivo,
                          strip_text,
                          lower_text
                          ]

def text_prepare(text, filter_methods=PREPROCESSING_PIPELINE):
    """Run ``text`` through each callable of ``filter_methods`` in order.

    Every step receives the output of the previous one; the output of the
    last step is returned (``text`` itself when there are no steps).
    """
    for step in filter_methods:
        text = step(text)
    return text

def clean_evidence_texts(df):
  """Replace, in place, each 'Evidence' entry of ``df`` with its cleaned tokens.

  Only the second tab-separated field of the raw evidence is kept; it is
  cleaned with text_prepare and split on whitespace.
  """
  def _to_tokens(raw_evidence):
    sentence = raw_evidence.split('\t')[1]
    return text_prepare(sentence).split()

  df['Evidence'] = df['Evidence'].apply(_to_tokens)

def clean_claim_texts(df):
  """Replace, in place, each 'Claim' entry of ``df`` with its cleaned tokens."""
  def _to_tokens(raw_claim):
    return text_prepare(raw_claim).split()

  df['Claim'] = df['Claim'].apply(_to_tokens)

# Tokenise the evidence of every split first, then the claims.
for frame in (train_df, val_df, test_df):
  clean_evidence_texts(frame)
for frame in (train_df, val_df, test_df):
  clean_claim_texts(frame)

print('Training set shape:', train_df.shape)
print('Validation set shape:', val_df.shape)
print('Test set shape:', test_df.shape)

Training set shape: (121740, 4)
Validation set shape: (7165, 4)
Test set shape: (7189, 4)


# Dataset conversion

In [None]:
# Size of the word vectors; it also selects which GloVe model is requested.
embedding_dimension = 300
download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
try:
  # Load the pre-trained vectors through gensim's downloader.
  embedding_model = gloader.load(download_path)
except ValueError as e:
  # A ValueError is reported as a wrong model name, then re-raised.
  print("Invalid embedding model name! Check the embedding dimension:")
  print("Glove: 50, 100, 200, 300")
  raise e



In [None]:
def build_vocabulary(corpus):
  """Build index <-> word lookup tables for the distinct tokens of ``corpus``.

  Args:
    corpus: iterable of token sequences (e.g. a Series of token lists).

  Returns:
    A tuple ``(word_vocab, inverse_word_vocab, words)`` where ``word_vocab``
    maps index -> word, ``inverse_word_vocab`` maps word -> index and
    ``words`` is the set of distinct tokens.
  """
  words = set()
  for tokens in corpus:
    words.update(tokens)

  # Indices are assigned in sorted order: iterating the set directly would
  # give a different numbering on every interpreter run (hash randomisation),
  # which makes saved matrices and results impossible to reproduce.
  word_vocab = dict(enumerate(sorted(words)))
  inverse_word_vocab = {word: index for index, word in word_vocab.items()}

  return word_vocab, inverse_word_vocab, words


# One Series holding every claim and every evidence of the three splits,
# in the order: train claim/evidence, val claim/evidence, test claim/evidence.
corpus = pd.concat(
    [frame[column]
     for frame in (train_df, val_df, test_df)
     for column in ('Claim', 'Evidence')],
    ignore_index=True,
)
idx_to_word, word_to_idx, word_listing = build_vocabulary(corpus)
print(len(word_listing))

33939


In [None]:
def co_occurrence_count(corpus, idx_to_word, word_to_idx, window_size=1):
    """Count how often two different words appear within ``window_size`` tokens.

    Args:
        corpus: iterable of token sequences (a pandas Series of token lists
            or a plain list of lists).
        idx_to_word: index -> word mapping; not used by the computation and
            kept only for interface compatibility.
        word_to_idx: word -> index mapping covering every token of ``corpus``.
        window_size: number of tokens considered on each side of a word.

    Returns:
        A square ``scipy.sparse.csr_matrix`` whose entry ``[i, j]`` is the
        number of times word ``j`` occurred in the window of word ``i``.
        Occurrences of the same word inside the window are not counted.
    """
    data = []
    index_i = []
    index_j = []

    # Plain iteration works for Series and lists alike (Series.iteritems was
    # removed in pandas 2.0).
    for words in corpus:
        for j, word in enumerate(words):
            start = max(0, j - window_size)
            end = min(len(words), j + window_size + 1)
            row = word_to_idx[word]
            for w in words[start:end]:
                if word != w:
                    data.append(1.)
                    index_i.append(row)
                    index_j.append(word_to_idx[w])

    # Fix the shape explicitly: when it is inferred from the largest index
    # seen, words that never co-occur can fall outside the matrix, and an
    # empty corpus cannot be converted at all.
    vocab_size = max(word_to_idx.values(), default=-1) + 1

    # Duplicate (i, j) entries are summed during the conversion to CSR.
    co_occurrence = scipy.sparse.csr_matrix(
        (np.asarray(data, dtype=np.float64),
         (np.asarray(index_i, dtype=np.intp), np.asarray(index_j, dtype=np.intp))),
        shape=(vocab_size, vocab_size),
    )

    return co_occurrence

# Number of tokens taken into account on each side of a word.
window_size = 1

# Clean RAM before re-running this code snippet to avoid session crash
# (a matrix left over from a previous execution of the cell is deleted and
# garbage-collected, then the cell waits 10 seconds before rebuilding it).
if 'co_occurrence_matrix' in globals():
    del co_occurrence_matrix
    gc.collect()
    time.sleep(10.)


print("Building co-occurrence count matrix... (it may take a while...)")
co_occurrence_matrix = co_occurrence_count(corpus, idx_to_word, word_to_idx, window_size)

print("Building completed!")

Building co-occurrence count matrix... (it may take a while...)
Building completed!


In [None]:
def check_OOV_terms(embedding_model, word_listing):
  """Return the words of ``word_listing`` that have no vector in the model.

  Args:
    embedding_model: container of known words supporting ``word in model``
      (gensim ``KeyedVectors`` does, in both the 3.x and 4.x series).
    word_listing: iterable of words to check.

  Returns:
    List of out-of-vocabulary words, in iteration order of ``word_listing``.
  """
  # Membership is tested on the model itself: the ``.vocab`` attribute was
  # removed from KeyedVectors in gensim 4.
  return [word for word in word_listing if word not in embedding_model]

# Collect the vocabulary words missing from the embedding model and report
# their count and their share of the whole vocabulary.
oov_terms = check_OOV_terms(embedding_model, word_listing)
print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_terms), float(len(oov_terms)) / len(word_listing)*100))

Total OOV terms: 2342 (6.90%)


In [None]:
def build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, idx_to_word, co_occurrence_matrix):
    """Build the (vocabulary size x embedding_dimension) embedding matrix.

    Words known to ``embedding_model`` get their pre-trained vector. Every
    other word gets the average of the vectors of its co-occurring known
    words, weighted by the co-occurrence counts; when it has no known
    neighbour it gets a random vector drawn uniformly from [0, 1).

    Args:
        embedding_model: object supporting ``word in model`` and
            ``model.get_vector(word)`` (e.g. gensim ``KeyedVectors``).
        embedding_dimension: length of each word vector.
        word_to_idx: word -> row index mapping.
        idx_to_word: row index -> word mapping.
        co_occurrence_matrix: CSR matrix of co-occurrence counts indexed by
            the same word indices.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word_to_idx), embedding_dimension)``.
    """
    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension))
    n_rows = co_occurrence_matrix.shape[0]

    for w, row_index in word_to_idx.items():
        # ``in`` on the model works across gensim versions, unlike ``.vocab``.
        if w in embedding_model:
            embedding_matrix[row_index, :] = embedding_model.get_vector(w)
            continue

        neighbour_vectors = []
        neighbour_counts = []
        # A word beyond the matrix bounds simply has no recorded neighbours.
        if row_index < n_rows:
            occurrences = co_occurrence_matrix[row_index]
            for i, d in zip(occurrences.indices, occurrences.data):
                neighbour = idx_to_word[i]
                if d > 0 and neighbour in embedding_model:
                    neighbour_vectors.append(embedding_model.get_vector(neighbour))
                    neighbour_counts.append(d)

        if neighbour_vectors:
            # axis=0 averages the vectors component-wise; without it the
            # whole stack collapses to one scalar that fills the entire row.
            embedding_matrix[row_index, :] = np.average(
                neighbour_vectors, axis=0, weights=neighbour_counts)
        else:
            embedding_matrix[row_index, :] = np.random.rand(embedding_dimension)

    return embedding_matrix
  
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, idx_to_word, co_occurrence_matrix)
print("Embedding matrix shape: {}".format(embedding_matrix.shape))

Embedding matrix shape: (33939, 300)


In [None]:
# Token count of the longest claim or evidence in the corpus.
MAX_SENTENCE_LENGTH = max(map(len, corpus))
print("Longest sentence: {} words".format(MAX_SENTENCE_LENGTH))

Longest sentence: 121 words


In [None]:
def embed_sentence(embedding_matrix, sentence, word_to_idx):
  """Return the list of embedding rows for the words of ``sentence``, in order."""
  return [embedding_matrix[word_to_idx[token]] for token in sentence]

def embed_sentences(df, embedding_matrix, word_to_idx):
  """Add the embedded claim, embedded evidence and encoded label columns to ``df``.

  'Embedded claim' and 'Embedded evidence' hold, for each row, the list of
  word vectors of the tokenised 'Claim' / 'Evidence'. 'Embedded label' holds
  the integer code a LabelEncoder fitted on this frame's 'Label' column
  assigns to each label. The frame is modified in place.
  """
  column_pairs = (('Claim', 'Embedded claim'), ('Evidence', 'Embedded evidence'))
  for source, target in column_pairs:
    df[target] = [embed_sentence(embedding_matrix, sentence, word_to_idx)
                  for sentence in df[source]]

  # NOTE(review): the encoder is fitted per frame, so the label codes agree
  # across splits only if every split contains the same set of labels.
  df['Embedded label'] = LabelEncoder().fit_transform(df['Label'])


def dataframe_generator(df):
  """Return a one-pass generator over the rows of ``df``, one sample at a time.

  Each item is ``([claim, evidence], label)``: the embedded claim and
  evidence post-padded to MAX_SENTENCE_LENGTH as float32 arrays, and the
  encoded label as a float32 array reshaped to ``(-1, 1)``.
  Rows are looked up with ``df[column][i]`` for ``i`` in ``range(len(df))``.
  """
  def _pad(sequence):
    return np.array(pad_sequences([sequence], maxlen=MAX_SENTENCE_LENGTH,
                                  padding='post', dtype='float32'))

  def _samples(row_ids):
    for i in row_ids:
      claim = _pad(df['Embedded claim'][i])
      evidence = _pad(df['Embedded evidence'][i])
      label = np.array(df['Embedded label'][i]).astype('float32').reshape((-1,1))
      yield ([claim, evidence], label)

  return _samples(range(len(df)))

embed_sentences(train_df, embedding_matrix, word_to_idx)
embed_sentences(val_df, embedding_matrix, word_to_idx)
embed_sentences(test_df, embedding_matrix, word_to_idx)

train_df.head()

Unnamed: 0,Claim,Evidence,ID,Label,Embedded claim,Embedded evidence,Embedded label
0,"[chris, hemsworth, appeared, in, a, perfect, g...","[hemsworth, has, also, appeared, in, the, scie...",3,SUPPORTS,"[[0.43830999732017517, -0.22437000274658203, -...","[[0.2014700025320053, -0.5208799839019775, 0.2...",1
1,"[roald, dahl, is, a, writer]","[roald, dahl, 13, september, 1916, 23, novembe...",7,SUPPORTS,"[[0.4275299906730652, 0.20163999497890472, -0....","[[0.4275299906730652, 0.20163999497890472, -0....",1
2,"[roald, dahl, is, a, governor]","[roald, dahl, 13, september, 1916, 23, novembe...",8,REFUTES,"[[0.4275299906730652, 0.20163999497890472, -0....","[[0.4275299906730652, 0.20163999497890472, -0....",0
3,"[ireland, has, relatively, low, lying, mountains]","[the, island, s, geography, comprises, relativ...",9,SUPPORTS,"[[0.5416100025177002, 0.2409999966621399, 0.01...","[[0.046560000628232956, 0.21318000555038452, -...",1
4,"[ireland, does, not, have, relatively, low, ly...","[the, island, s, geography, comprises, relativ...",10,REFUTES,"[[0.5416100025177002, 0.2409999966621399, 0.01...","[[0.046560000628232956, 0.21318000555038452, -...",0


# Model definition

In [None]:
from keras import backend as K

def cosine_distance(vests):
    """Return the negated mean of the element-wise product of two L2-normalised tensors.

    ``vests`` is a pair of tensors; both are normalised along the last axis,
    multiplied element-wise, averaged over the last axis (kept as a size-1
    dimension) and negated.
    """
    first, second = vests
    first_unit = K.l2_normalize(first, axis=-1)
    second_unit = K.l2_normalize(second, axis=-1)
    similarity = K.mean(first_unit * second_unit, axis=-1, keepdims=True)
    return -similarity

def cos_dist_output_shape(shapes):
    """Return ``(batch, 1)``, taking ``batch`` from the first of the two input shapes."""
    first_shape, _ = shapes
    batch_size = first_shape[0]
    return (batch_size, 1)

## simpleLSTM

In [None]:
def simple_LSTM_encoder(input_shape, merging_technique, add_similarity):
  """Build and compile the two-input classifier based on a bidirectional LSTM.

  Claim and evidence go through separate Masking -> Dropout -> BiLSTM
  branches (``return_sequences=False``), the two encodings are merged and a
  dense classifier with a sigmoid output is applied.

  Args:
    input_shape: shape of one (padded) embedded sentence.
    merging_technique: 'concatenation', 'sum' or 'average'.
    add_similarity: when true, the output of ``cosine_distance`` on the two
      encodings is concatenated to the merged vector.

  Returns:
    The compiled Keras model (binary cross-entropy, Adam, accuracy metric).

  Raises:
    ValueError: if ``merging_technique`` is not one of the supported values.
  """
  # Fail early with a clear message instead of hitting an unbound ``merge``.
  if merging_technique not in ('concatenation', 'sum', 'average'):
    raise ValueError("Unknown merging technique: {!r}".format(merging_technique))

  input_claim = Input(shape=input_shape)
  mask_claim = Masking(input_shape=input_shape, mask_value=0.0)(input_claim)
  drop_claim = Dropout(0.2)(mask_claim)
  lstm_claim = Bidirectional(LSTM(64, return_sequences=False, dropout=0.4))(drop_claim)

  input_evidence = Input(shape=input_shape)
  mask_evidence = Masking(input_shape=input_shape, mask_value=0.0)(input_evidence)
  drop_evidence = Dropout(0.2)(mask_evidence)
  lstm_evidence = Bidirectional(LSTM(64, return_sequences=False, dropout=0.4))(drop_evidence)

  if merging_technique == 'concatenation':
    merge = concatenate([lstm_claim, lstm_evidence])
  elif merging_technique == 'sum':
    merge = Add()([lstm_claim, lstm_evidence])
  else:  # 'average'
    merge = Average()([lstm_claim, lstm_evidence])

  if add_similarity:
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([lstm_claim, lstm_evidence])
    merge = concatenate([merge, distance])

  classificator = Dense(512, activation='relu')(merge)
  classificator = Dropout(0.2)(classificator)
  classificator = Dense(1, activation='sigmoid')(classificator)

  opt = Adam(learning_rate=0.001)

  model = Model(inputs=[input_claim, input_evidence], outputs=classificator)
  model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model

## averageLSTM

In [None]:
def average_LSTM_encoder(input_shape, merging_technique, add_similarity):
  """Build and compile the two-input classifier that averages the BiLSTM outputs.

  Claim and evidence go through separate Masking -> Dropout -> BiLSTM
  branches (``return_sequences=True``) followed by GlobalAveragePooling1D;
  the two encodings are merged and a dense classifier with a sigmoid output
  is applied.

  Args:
    input_shape: shape of one (padded) embedded sentence.
    merging_technique: 'concatenation', 'sum' or 'average'.
    add_similarity: when true, the output of ``cosine_distance`` on the two
      encodings is concatenated to the merged vector.

  Returns:
    The compiled Keras model (binary cross-entropy, Adam, accuracy metric).

  Raises:
    ValueError: if ``merging_technique`` is not one of the supported values.
  """
  # Fail early with a clear message instead of hitting an unbound ``merge``.
  if merging_technique not in ('concatenation', 'sum', 'average'):
    raise ValueError("Unknown merging technique: {!r}".format(merging_technique))

  input_claim = Input(shape=input_shape)
  mask_claim = Masking(input_shape=input_shape, mask_value=0.0)(input_claim)
  drop_claim = Dropout(0.2)(mask_claim)
  lstm_claim = Bidirectional(LSTM(64, return_sequences=True, dropout=0.4))(drop_claim)
  average_claim = GlobalAveragePooling1D()(lstm_claim)

  input_evidence = Input(shape=input_shape)
  mask_evidence = Masking(input_shape=input_shape, mask_value=0.0)(input_evidence)
  drop_evidence = Dropout(0.2)(mask_evidence)
  lstm_evidence = Bidirectional(LSTM(64, return_sequences=True, dropout=0.4))(drop_evidence)
  average_evidence = GlobalAveragePooling1D()(lstm_evidence)

  if merging_technique == 'concatenation':
    merge = concatenate([average_claim, average_evidence])
  elif merging_technique == 'sum':
    merge = Add()([average_claim, average_evidence])
  else:  # 'average'
    merge = Average()([average_claim, average_evidence])

  if add_similarity:
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([average_claim, average_evidence])
    merge = concatenate([merge, distance])

  classificator = Dense(512, activation='relu')(merge)
  classificator = Dropout(0.2)(classificator)
  classificator = Dense(1, activation='sigmoid')(classificator)

  opt = Adam(learning_rate=0.001)

  model = Model(inputs=[input_claim, input_evidence], outputs=classificator)
  model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model

## MLP

In [None]:
def MLP_encoder(input_shape, merging_technique, add_similarity):
  """Build and compile the two-input classifier based on a dense sentence encoder.

  Claim and evidence go through separate Masking -> Dropout -> Reshape
  (flattening the sentence) -> Dense(512) branches; the two encodings are
  merged and a dense classifier with a sigmoid output is applied.

  Args:
    input_shape: ``(sentence length, embedding size)`` of one padded sentence.
    merging_technique: 'concatenation', 'sum' or 'average'.
    add_similarity: when true, the output of ``cosine_distance`` on the two
      encodings is concatenated to the merged vector.

  Returns:
    The compiled Keras model (binary cross-entropy, Adam, accuracy metric).

  Raises:
    ValueError: if ``merging_technique`` is not one of the supported values.
  """
  # Fail early with a clear message instead of hitting an unbound ``merge``.
  if merging_technique not in ('concatenation', 'sum', 'average'):
    raise ValueError("Unknown merging technique: {!r}".format(merging_technique))

  input_claim = Input(shape=input_shape)
  mask_claim = Masking(input_shape=input_shape, mask_value=0.0)(input_claim)
  drop_claim = Dropout(0.2)(mask_claim)
  reshape_claim = Reshape((input_shape[0] * input_shape[1],))(drop_claim)
  mlp_claim = Dense(512)(reshape_claim)

  input_evidence = Input(shape=input_shape)
  mask_evidence = Masking(input_shape=input_shape, mask_value=0.0)(input_evidence)
  drop_evidence = Dropout(0.2)(mask_evidence)
  reshape_evidence = Reshape((input_shape[0] * input_shape[1],))(drop_evidence)
  mlp_evidence = Dense(512)(reshape_evidence)

  if merging_technique == 'concatenation':
    merge = concatenate([mlp_claim, mlp_evidence])
  elif merging_technique == 'sum':
    merge = Add()([mlp_claim, mlp_evidence])
  else:  # 'average'
    merge = Average()([mlp_claim, mlp_evidence])

  if add_similarity:
    # The similarity is computed on this model's own encodings (the previous
    # code referenced the LSTM tensors, which do not exist in this function).
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([mlp_claim, mlp_evidence])
    merge = concatenate([merge, distance])

  classificator = Dense(512, activation='relu')(merge)
  classificator = Dropout(0.2)(classificator)
  classificator = Dense(1, activation='sigmoid')(classificator)

  opt = Adam(learning_rate=0.001)

  model = Model(inputs=[input_claim, input_evidence], outputs=classificator)
  model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model

## BOV

In [None]:
def BOV(input_shape, merging_technique, add_similarity):
  """Build and compile the two-input bag-of-vectors classifier.

  Claim and evidence go through separate Dropout -> GlobalAveragePooling1D
  branches; the two encodings are merged and a dense classifier with a
  sigmoid output is applied.

  Args:
    input_shape: shape of one (padded) embedded sentence.
    merging_technique: 'concatenation', 'sum' or 'average'.
    add_similarity: when true, the output of ``cosine_distance`` on the two
      encodings is concatenated to the merged vector.

  Returns:
    The compiled Keras model (binary cross-entropy, Adam, accuracy metric).

  Raises:
    ValueError: if ``merging_technique`` is not one of the supported values.
  """
  # Fail early with a clear message instead of hitting an unbound ``merge``.
  if merging_technique not in ('concatenation', 'sum', 'average'):
    raise ValueError("Unknown merging technique: {!r}".format(merging_technique))

  input_claim = Input(shape=input_shape)
  drop_claim = Dropout(0.2)(input_claim)
  average_claim = GlobalAveragePooling1D()(drop_claim)

  input_evidence = Input(shape=input_shape)
  drop_evidence = Dropout(0.2)(input_evidence)
  average_evidence = GlobalAveragePooling1D()(drop_evidence)

  if merging_technique == 'concatenation':
    merge = concatenate([average_claim, average_evidence])
  elif merging_technique == 'sum':
    merge = Add()([average_claim, average_evidence])
  else:  # 'average'
    merge = Average()([average_claim, average_evidence])

  if add_similarity:
    # The similarity is computed on this model's own encodings (the previous
    # code referenced the LSTM tensors, which do not exist in this function).
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([average_claim, average_evidence])
    merge = concatenate([merge, distance])

  classificator = Dense(512, activation='relu')(merge)
  classificator = Dropout(0.2)(classificator)
  classificator = Dense(1, activation='sigmoid')(classificator)

  opt = Adam(learning_rate=0.001)

  model = Model(inputs=[input_claim, input_evidence], outputs=classificator)
  model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

  return model

# Training

In [None]:
def train(encoder, merging_technique, add_similarity):
  """Build a model with ``encoder`` and fit it on the training split.

  Args:
    encoder: model-building function called as
      ``encoder(input_shape, merging_technique, add_similarity)``.
    merging_technique: forwarded to ``encoder``.
    add_similarity: forwarded to ``encoder``.

  Returns:
    The fitted model.
  """
  model = encoder((MAX_SENTENCE_LENGTH, embedding_dimension), merging_technique, add_similarity)

  n_epochs = 30
  # The generators are single-pass, so each split is divided into n_epochs
  # slices. Step counts must be integers: floor division keeps the total
  # number of requested samples within the size of each split.
  steps_per_epoch = max(1, len(train_df) // n_epochs)
  validation_steps = max(1, len(val_df) // n_epochs)
  # NOTE(review): patience equals the number of epochs, so training is never
  # cut short by this callback - confirm that this is intended.
  cb = EarlyStopping(monitor='val_accuracy', patience=n_epochs, restore_best_weights=True)

  train_generator = dataframe_generator(train_df)
  validation_generator = dataframe_generator(val_df)

  model.fit(train_generator, epochs=n_epochs, steps_per_epoch=steps_per_epoch,
            validation_data=validation_generator, validation_steps=validation_steps,
            callbacks=[cb])

  return model

In [None]:
# Display name -> model-building function for every sentence encoder.
encoders = {
            'simpleLSTM': simple_LSTM_encoder,
            'averageLSTM': average_LSTM_encoder,
            'MLP': MLP_encoder,
            'BOV': BOV
}

# Ways of combining the claim encoding with the evidence encoding.
merging_techniques = ['concatenation', 
                      'sum',
                      'average'
                      ]


# Ground-truth labels of the test split.
y_test = test_df['Embedded label']

# One [encoder name, merging technique, classification report] entry per run.
results = []

# Train each encoder with the 'average' merging technique and no similarity
# feature, then evaluate it on the test split.
for encoder in encoders:
  print("=====================================================================================================================================")
  print("Embedding with", encoder)

  
  print("Merging using", 'average')
  classificator = train(encoders[encoder], 'average', add_similarity=False)
    
  print("--------------------------------------------------------------------------------------------------------------------------------------")
    
  # A fresh generator is created for every evaluation.
  test_generator = dataframe_generator(test_df)
  # Round the model outputs to obtain the predicted class.
  y_pred = np.around(classificator.predict(test_generator))
    
  results.append([encoder, 'average', classification_report(y_test, y_pred, zero_division=True)])

In [None]:
# Compare the encoders (all trained with the 'average' merging strategy)

for entry in results:
  encoder_name, technique, report_text = entry[0], entry[1], entry[2]
  print("Encoder:", encoder_name)
  print("Merging technique:", technique)
  print(report_text)
  print("------------------------------------------------------")

In [None]:
# Train the averageLSTM encoder once per merging technique (no similarity
# feature) and store the test classification report of each run.
print("=====================================================================================================================================")
print("Embedding with averageLSTM")

# One ['averageLSTM', merging technique, classification report] entry per run.
results_averageLSTM = []

for technique in merging_techniques:
    print("Merging using", technique)
    classificator = train(average_LSTM_encoder, technique, add_similarity=False)
    
    print("--------------------------------------------------------------------------------------------------------------------------------------")

    # A fresh generator is created for every evaluation.
    test_generator = dataframe_generator(test_df)
    
    # Round the model outputs to obtain the predicted class.
    y_pred = np.around(classificator.predict(test_generator))

    results_averageLSTM.append(['averageLSTM', technique, classification_report(y_test, y_pred, zero_division=True)])

In [None]:
# Compare the merging techniques for the best encoder

for entry in results_averageLSTM:
  encoder_name, technique, report_text = entry[0], entry[1], entry[2]
  print("Encoder:", encoder_name)
  print("Merging technique:", technique)
  print(report_text)
  print("------------------------------------------------------")

In [None]:
# Final configuration: the encoder and merging technique selected from the
# previous runs, this time with the similarity feature enabled.
BEST_ENCODER = average_LSTM_encoder
BEST_TECHNIQUE = 'average'

print("Embedding with average LSTM")
print("Merging using", BEST_TECHNIQUE)
print("Adding cosine similarity")
classificator = train(BEST_ENCODER, BEST_TECHNIQUE, add_similarity=True)
test_generator = dataframe_generator(test_df)
# Round the model outputs to obtain the predicted class of each test pair.
y_pred = np.around(classificator.predict(test_generator))

Embedding with average LSTM
Merging using average
Adding cosine similarity
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Evaluation

## Multi input classification evaluation

In [None]:
# Pair-level evaluation: every claim/evidence pair counts as one sample.
y_test = test_df['Embedded label']
print(classification_report(y_test, y_pred, zero_division=True))

              precision    recall  f1-score   support

           0       0.86      0.48      0.62      3583
           1       0.64      0.92      0.76      3606

    accuracy                           0.70      7189
   macro avg       0.75      0.70      0.69      7189
weighted avg       0.75      0.70      0.69      7189



## Claim verification evaluation

In [None]:
from scipy import stats

def _majority_vote(predictions):
  """Return the most frequent value of ``predictions`` (smallest value on ties)."""
  # np.unique returns the values sorted, and argmax picks the first maximum,
  # so ties resolve to the smallest label. This replaces
  # ``stats.mode(x)[0][0]``, which fails on recent SciPy releases where
  # ``mode`` returns scalars for 1-D input.
  values, counts = np.unique(predictions, return_counts=True)
  return values[np.argmax(counts)]

# Add prediction column to df (flattened: the model output has one column)
test_df['Prediction'] = np.asarray(y_pred).astype(int).ravel()
# collect predictions for each claim in a list
voting_df = test_df.groupby(['ID', 'Embedded label'])['Prediction'].apply(list).reset_index()
# find the mode in each prediction list
voting_df['Majority label'] = voting_df['Prediction'].apply(_majority_vote)

y_grouped_test = voting_df['Embedded label']
y_majority_pred = voting_df['Majority label']

print(classification_report(y_grouped_test, y_majority_pred, zero_division=True))

              precision    recall  f1-score   support

           0       0.86      0.49      0.62      3304
           1       0.64      0.92      0.76      3309

    accuracy                           0.70      6613
   macro avg       0.75      0.70      0.69      6613
weighted avg       0.75      0.70      0.69      6613



# Comments/summary

### Dataset preprocessing
We lightly cleaned the texts by removing parentheses, punctuation, and unusual characters.

### Dataset conversion
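We embedded each word of the claims and evidence using GloVe embeddings, and we handled OOV terms with the neighbour strategy (an OOV word is represented through the embeddings of the in-vocabulary words it co-occurs with).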

### Model definition
We defined 4 different models:


*   A bidirectional LSTM which takes a sentence as input and returns the last hidden state as the sentence encoding
*   A bidirectional LSTM which takes a sentence as input and returns the average of all the hidden states as the sentence encoding
*   A simple multilayer perceptron which takes a sentence as input and returns the output of the fully connected layer as the sentence encoding
*   A BOV (bag-of-vectors) model which simply takes a sentence as input and returns the average of the GloVe embeddings of all the words in the sentence

### Training
During training, all the models are first tested using the same merging strategy: average.

Then the best model is chosen and is tested with all the merging strategies.

In the end the best model is tested with the best strategy and with the cosine similarity concatenated to the input of the classifier.

The notebook we uploaded only shows the results from the last run, since running everything together took too long.

### Evaluation
First, we evaluated the results of the classifier by looking only at the number of claim/evidence pairs correctly classified.

Then we aggregated the pairs referring to the same claim and, through majority voting, evaluated the quality of the classification with respect to each claim.
