In [None]:
#setup all imports required

import torch
import json
import csv
import torch.nn as nn
from transformers import RobertaForSequenceClassification, BertForSequenceClassification, RobertaConfig, BertConfig, RobertaTokenizer, BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from collections import Counter
import gensim.downloader as api
from tqdm.auto import tqdm
from rank_bm25 import *
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_non_alphanum, strip_multiple_whitespaces, remove_stopwords, stem_text
import re
import spacy
import nltk

<b> NOTES:

!! If the kernel crashes at any point after a pickling point has been passed, processing up to the last pickling point can be skipped and the data loaded directly instead. A sample of how to load pre-processed data and scores is available at the bottom of this notebook under "Load Preprocessed Data and Scores". Adjust file and variable names as required. !! 

!! Action points i.e. updating variables based on the model to be tested are noted in comments and markdowns. !!

</b>

# Part A: Evidence Selection

## Text Processing

Removal of non-alphanum characters, strip multiple white spaces, lemmatize

In [None]:
# Load the evidence corpus: a JSON object mapping evidence id -> evidence text.
evidence_path = "project-data/evidence.json"
with open(evidence_path, encoding="utf-8") as evidence_file:
    evidence_dict = json.load(evidence_file)

# One row per evidence passage. After reset_index() the id lives in the
# "index" column and the text in "evidence".
# set_axis() is used without inplace=True: that argument was removed in
# pandas 2.0, so the returned frame is assigned instead.
evidence_df = (
    pd.DataFrame.from_dict(evidence_dict, orient="index")
    .set_axis(["evidence"], axis=1)
    .reset_index()
)

In [None]:
# spaCy pipeline used by preprocess() to lemmatise tokens.
nlp = spacy.load("en_core_web_sm")
# Registers tqdm's progress_apply on pandas objects (used when cleaning the dataframes).
tqdm.pandas()

def preprocess(row):
    """Clean one dataframe row's text for lexical retrieval.

    The text is read from ``row["evidence"]`` when that key exists and from
    ``row["claim_text"]`` otherwise, so the same function serves the evidence
    frame and the claim frames.

    Steps: strip tags, strip non-alphanumeric characters, collapse repeated
    whitespace, remove stopwords (gensim filters), lemmatise with the spaCy
    pipeline ``nlp``, then lower-case.

    Returns:
        The cleaned text as a single space-joined string.

    Raises:
        KeyError: if the row has neither an "evidence" nor a "claim_text" key.
    """
    try:
        sentence = row["evidence"]
    except KeyError:
        # Only a missing key selects the fallback; any other error
        # (e.g. an interrupt) is no longer swallowed by a bare except.
        sentence = row["claim_text"]
    CUSTOM_FILTERS = [strip_tags, strip_non_alphanum, strip_multiple_whitespaces, remove_stopwords]
    tokens = preprocess_string(sentence, CUSTOM_FILTERS)
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]
    return " ".join(lemmatized_tokens).lower()

In [None]:
# Clean every evidence passage row by row (with a progress bar) and keep the
# result next to the raw text.
evidence_clean = evidence_df.progress_apply(preprocess, axis=1)
evidence_df['evidence_clean'] = evidence_clean

In [None]:
# Pickling point: persist the cleaned evidence so the slow preprocessing
# above can be skipped after a kernel restart.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError.
Path('./pickles').mkdir(parents=True, exist_ok=True)
evidence_df.to_csv('./pickles/evidence_df.csv')
with open('./pickles/evidence_df.pkl', 'wb') as f:
    pickle.dump(evidence_df, f)

In [None]:
dev_path = "project-data/dev-claims.json"
test_path = "project-data/test-claims-unlabelled.json"
train_path = "project-data/train-claims.json"


def _read_claims_json(path):
    """Read one claims JSON file and return the parsed dictionary."""
    with open(path) as claims_file:
        return json.load(claims_file)


def _claims_to_frame(claims):
    """Build a frame with one row per claim; the claim id ends up in the "index" column."""
    frame = pd.DataFrame.from_dict(claims, orient="index")
    frame.reset_index(inplace=True)
    return frame


# The raw dictionaries are kept alongside the frames.
dev_dict = _read_claims_json(dev_path)
dev_df = _claims_to_frame(dev_dict)

test_dict = _read_claims_json(test_path)
test_df = _claims_to_frame(test_dict)

train_dict = _read_claims_json(train_path)
train_df = _claims_to_frame(train_dict)

In [None]:
# Same cleaning as for the evidence. The claim frames have no "evidence"
# column, so preprocess() falls back to "claim_text".
for claims_df in (dev_df, test_df, train_df):
    claims_df['clean'] = claims_df.progress_apply(preprocess, axis=1)

## Get BM-25 (bm25) scores, TF-IDF (tfidf), and word2Vec (w2v) cosine similarities with all evidences

In [None]:
def _whitespace_tokens(text):
    """Split cleaned text on whitespace."""
    return text.split()


# One token list per evidence passage (input for BM25 and word2vec).
evidence_tokens = evidence_df['evidence_clean'].apply(_whitespace_tokens)

In [None]:
#setup bm25, tfidf, and w2v
n_components = 100  # NOTE(review): not referenced in the visible cells - confirm it is still needed
model_w2v = api.load("word2vec-google-news-300")
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize)

#get evidence vectors
# The TF-IDF vocabulary is fitted on the cleaned evidence only; claims are
# later transformed with this same fitted vectorizer.
evidence_corpus = evidence_df['evidence_clean']
evidence_tfidf = vectorizer.fit(evidence_corpus).transform(evidence_corpus)
bm25_vectors = BM25Okapi(evidence_tokens, b=0.3, k1=0.5)

def w2vembedding(tokens, model):
    """Embed a token list as the mean of its known word vectors.

    Args:
        tokens: iterable of tokens.
        model: mapping-like object supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of the tokens found in ``model``,
        or a zero vector of length ``model.vector_size`` when none is found.
    """
    known_vectors = []
    for word in tokens:
        if word in model:
            known_vectors.append(model[word])
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One mean word2vec vector per evidence passage, stacked into a 2-D array
# (w2vembedding returns a zero vector when no token is in the model).
evidence_w2v = np.array([w2vembedding(doc, model_w2v) for doc in evidence_tokens])

In [None]:
#train data
train_tokens = train_df['clean'].apply(lambda x: x.split())

# TF-IDF: one row of cosine similarities per claim, one column per evidence.
train_tfidf = vectorizer.transform(train_df['clean'])
train_scores_tfidf = cosine_similarity(train_tfidf, evidence_tfidf)

train_scores_w2v = []
train_scores_bm25 = []

# word2vec and BM25 are scored claim by claim against the whole evidence set.
for claim_tokens in tqdm(train_tokens, desc="Getting bm25 and w2v scores"):
    claim_vector = w2vembedding(claim_tokens, model_w2v).reshape(1, -1)
    train_scores_w2v.append(cosine_similarity(claim_vector, evidence_w2v)[0])
    train_scores_bm25.append(bm25_vectors.get_scores(claim_tokens))

# train_scores_bm25 is left as a list of arrays here.
train_scores_w2v = np.array(train_scores_w2v)

In [None]:
#dev data
dev_tokens = dev_df['clean'].apply(lambda x: x.split())

# TF-IDF: one row of cosine similarities per claim, one column per evidence.
dev_tfidf = vectorizer.transform(dev_df['clean'])
dev_scores_tfidf = cosine_similarity(dev_tfidf, evidence_tfidf)

dev_scores_w2v = []
dev_scores_bm25 = []

# word2vec and BM25 are scored claim by claim against the whole evidence set.
for claim_tokens in tqdm(dev_tokens, desc="Getting bm25 and w2v scores"):
    claim_vector = w2vembedding(claim_tokens, model_w2v).reshape(1, -1)
    dev_scores_w2v.append(cosine_similarity(claim_vector, evidence_w2v)[0])
    dev_scores_bm25.append(bm25_vectors.get_scores(claim_tokens))

# dev_scores_bm25 is left as a list of arrays here.
dev_scores_w2v = np.array(dev_scores_w2v)

In [None]:
#test data
test_tokens = test_df['clean'].apply(lambda x: x.split())

# TF-IDF: one row of cosine similarities per claim, one column per evidence.
test_tfidf = vectorizer.transform(test_df['clean'])
test_scores_tfidf = cosine_similarity(test_tfidf, evidence_tfidf)

test_scores_w2v = []
test_scores_bm25 = []

# word2vec and BM25 are scored claim by claim against the whole evidence set.
for claim_tokens in tqdm(test_tokens, desc="Getting bm25 and w2v scores"):
    claim_vector = w2vembedding(claim_tokens, model_w2v).reshape(1, -1)
    test_scores_w2v.append(cosine_similarity(claim_vector, evidence_w2v)[0])
    test_scores_bm25.append(bm25_vectors.get_scores(claim_tokens))

# test_scores_bm25 is left as a list of arrays here.
test_scores_w2v = np.array(test_scores_w2v)

In [None]:
# normalise bm25 scores, w2v and tfidf cosine similarities are l2 normalised by default

def _normalise_by_row_max(scores):
    """Divide each row (one claim) by its own maximum score.

    Args:
        scores: 2-D array-like, one row of BM25 scores per claim.

    Returns:
        A float ndarray of the same shape. A row whose maximum is 0 is left
        unchanged instead of being divided by zero, which would fill it with
        NaN and corrupt the later argsort-based ranking.
    """
    scores = np.asarray(scores, dtype=float)
    row_max = scores.max(axis=1, keepdims=True)
    safe_max = np.where(row_max == 0, 1.0, row_max)
    return scores / safe_max


train_scores_bm25 = _normalise_by_row_max(train_scores_bm25)
dev_scores_bm25 = _normalise_by_row_max(dev_scores_bm25)
test_scores_bm25 = _normalise_by_row_max(test_scores_bm25)

In [None]:
#save to file as pickles
# Pickling point: one file per (split, scoring method) pair.
score_pickles = (
    ('./pickles/train_scores_bm25.pkl', train_scores_bm25),
    ('./pickles/dev_scores_bm25.pkl', dev_scores_bm25),
    ('./pickles/test_scores_bm25.pkl', test_scores_bm25),
    ('./pickles/train_scores_tfidf.pkl', train_scores_tfidf),
    ('./pickles/dev_scores_tfidf.pkl', dev_scores_tfidf),
    ('./pickles/test_scores_tfidf.pkl', test_scores_tfidf),
    ('./pickles/train_scores_w2v.pkl', train_scores_w2v),
    ('./pickles/dev_scores_w2v.pkl', dev_scores_w2v),
    ('./pickles/test_scores_w2v.pkl', test_scores_w2v),
)
for pickle_path, scores in score_pickles:
    with open(pickle_path, 'wb') as f:
        pickle.dump(scores, f)

## Get top-10 evidence based on calculated scores

### The following different scoring combinations are used to get top-10 evidences.

- **bm25:**  bm25 only
- **tfidf:**  tfidf only
- **w2v:**  word2vec only
- **eq:**  33% bm25, 33% tfidf, 33% word2vec

Since bm25 performed best on its own, the following bm25-weighted combinations are also tested:

- **bmtf:**  50% bm25, 50% tfidf
- **bmw2v:**  50% bm25, 50% w2v

In [None]:
def _combine_scores(score_matrices, weights):
    """Return the weighted sum of the score matrices.

    Matrices whose weight is 0 are skipped rather than multiplied in. For
    finite scores this gives the same result, and it keeps a NaN in an unused
    matrix from contaminating the ranking.
    """
    combined = None
    for matrix, weight in zip(score_matrices, weights):
        if weight == 0:
            continue
        term = np.asarray(matrix) * weight
        combined = term if combined is None else combined + term
    if combined is None:
        # All weights are zero: every evidence scores 0, as before.
        combined = np.zeros(np.shape(score_matrices[0]))
    return combined


def _top_n_per_claim(aggregate_scores, n, desc):
    """Return, for each row of scores, the indices of the n highest, best first."""
    if n <= 0:
        # scores[-0:] would select every index, so handle n <= 0 explicitly.
        return [np.array([], dtype=np.intp) for _ in aggregate_scores]
    result = []
    for scores in tqdm(aggregate_scores, desc=desc):
        result.append(np.argsort(scores)[-n:][::-1])
    return result


def get_evidence_list_test(w_bm25, w_tfidf, w_w2v, n):
    """Top-n evidence row indices for every test claim.

    Args:
        w_bm25, w_tfidf, w_w2v: weights applied to the BM25, TF-IDF and
            word2vec score matrices of the test split (module-level globals).
        n: number of evidence indices to keep per claim.

    Returns:
        A list with one index array per claim, highest score first.
    """
    aggregate_scores = _combine_scores(
        (test_scores_bm25, test_scores_tfidf, test_scores_w2v),
        (w_bm25, w_tfidf, w_w2v),
    )
    return _top_n_per_claim(aggregate_scores, n, "Processing test claims")


def get_evidence_list_dev(w_bm25, w_tfidf, w_w2v, n):
    """Top-n evidence row indices for every dev claim (see get_evidence_list_test)."""
    aggregate_scores = _combine_scores(
        (dev_scores_bm25, dev_scores_tfidf, dev_scores_w2v),
        (w_bm25, w_tfidf, w_w2v),
    )
    return _top_n_per_claim(aggregate_scores, n, "Processing dev claims")


def get_evidence_list_train(w_bm25, w_tfidf, w_w2v, n):
    """Top-n evidence row indices for every train claim (see get_evidence_list_test)."""
    aggregate_scores = _combine_scores(
        (train_scores_bm25, train_scores_tfidf, train_scores_w2v),
        (w_bm25, w_tfidf, w_w2v),
    )
    return _top_n_per_claim(aggregate_scores, n, "Processing train claims")

In [None]:
#bm25 only

test_list_bm25 = get_evidence_list_test(w_bm25=1, w_tfidf=0, w_w2v=0, n=10)
dev_list_bm25 = get_evidence_list_dev(w_bm25=1, w_tfidf=0, w_w2v=0, n=10)
train_list_bm25 = get_evidence_list_train(w_bm25=1, w_tfidf=0, w_w2v=0, n=10)

In [None]:
#tfidf only

test_list_tfidf = get_evidence_list_test(w_bm25=0, w_tfidf=1, w_w2v=0, n=10)
dev_list_tfidf = get_evidence_list_dev(w_bm25=0, w_tfidf=1, w_w2v=0, n=10)
train_list_tfidf = get_evidence_list_train(w_bm25=0, w_tfidf=1, w_w2v=0, n=10)

In [None]:
#word2vec only

test_list_w2v = get_evidence_list_test(w_bm25=0, w_tfidf=0, w_w2v=1, n=10)
dev_list_w2v = get_evidence_list_dev(w_bm25=0, w_tfidf=0, w_w2v=1, n=10)
train_list_w2v = get_evidence_list_train(w_bm25=0, w_tfidf=0, w_w2v=1, n=10)

In [None]:
#33% bm25, 33% tfidf, 33% word2vec

test_list_eq = get_evidence_list_test(w_bm25=0.33, w_tfidf=0.33, w_w2v=0.33, n=10)
dev_list_eq = get_evidence_list_dev(w_bm25=0.33, w_tfidf=0.33, w_w2v=0.33, n=10)
train_list_eq = get_evidence_list_train(w_bm25=0.33, w_tfidf=0.33, w_w2v=0.33, n=10)

In [None]:
#50% bm25, 50% tfidf

test_list_bmtf = get_evidence_list_test(w_bm25=0.5, w_tfidf=0.5, w_w2v=0, n=10)
dev_list_bmtf = get_evidence_list_dev(w_bm25=0.5, w_tfidf=0.5, w_w2v=0, n=10)
train_list_bmtf = get_evidence_list_train(w_bm25=0.5, w_tfidf=0.5, w_w2v=0, n=10)

In [None]:
#50% bm25, 50% w2v

test_list_bmw2v = get_evidence_list_test(w_bm25=0.5, w_tfidf=0, w_w2v=0.5, n=10)
dev_list_bmw2v = get_evidence_list_dev(w_bm25=0.5, w_tfidf=0, w_w2v=0.5, n=10)
train_list_bmw2v = get_evidence_list_train(w_bm25=0.5, w_tfidf=0, w_w2v=0.5, n=10)

In [None]:
#save to file as pickles and csv

# Attach one column per scoring combination to each claim frame. Every cell
# holds that claim's top-10 evidence row indices.
top10_by_comb = {
    'bm25': (dev_list_bm25, test_list_bm25, train_list_bm25),
    'tfidf': (dev_list_tfidf, test_list_tfidf, train_list_tfidf),
    'w2v': (dev_list_w2v, test_list_w2v, train_list_w2v),
    'eq': (dev_list_eq, test_list_eq, train_list_eq),
    'bmtf': (dev_list_bmtf, test_list_bmtf, train_list_bmtf),
    'bmw2v': (dev_list_bmw2v, test_list_bmw2v, train_list_bmw2v),
}
for comb_name, (dev_top10, test_top10, train_top10) in top10_by_comb.items():
    dev_df[comb_name] = dev_top10
    test_df[comb_name] = test_top10
    train_df[comb_name] = train_top10

# Create the output directories first so a fresh checkout does not fail with
# FileNotFoundError.
Path('./pickles').mkdir(parents=True, exist_ok=True)
Path('./csv').mkdir(parents=True, exist_ok=True)

with open('./pickles/dev_df.pkl', 'wb') as fp:
    pickle.dump(dev_df, fp)
with open('./pickles/test_df.pkl', 'wb') as fp:
    pickle.dump(test_df, fp)
with open('./pickles/train_df.pkl', 'wb') as fp:
    pickle.dump(train_df, fp)

dev_df.to_csv('./csv/dev_df.csv')
train_df.to_csv('./csv/train_df.csv')
test_df.to_csv('./csv/test_df.csv')

## Save to required json format to test out evidence retrieval on dev

We are not testing labelling at this stage. All claim-label values are saved with "SUPPORTS" as a placeholder. For evaluation, we only care about the Evidence Retrieval F-score (F); Claim Classification Accuracy (A) and the Harmonic Mean of F and A (H) are ignored.

For each combination, test out different n values of 1,4,5,6,10 and save resulting F-score.

In [None]:
def test_evidences_json(df, evidence_list, n):
    all_data = {}
    for i in range(len(df.index)):
        data = {}
        claim_id = df.loc[i,'index']
        data['claim_text'] = df.loc[i,'claim_text']
        data['claim_label'] = "SUPPORTS"
        evidences = []
        for j in evidence_list[i]:
            if len(evidences) < n:
                evidences.append("evidence-" + str(j))
        data['evidences'] = evidences
        all_data[claim_id] = data
    return all_data

In [None]:
# Write one evaluation file per scoring combination, each with the top-10
# evidences for every dev claim.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError.
Path('./json').mkdir(parents=True, exist_ok=True)

retrieval_outputs = (
    ('./json/bm25.json', dev_list_bm25),
    ('./json/tfidf.json', dev_list_tfidf),
    ('./json/w2v.json', dev_list_w2v),
    ('./json/eq.json', dev_list_eq),
    ('./json/bmtf.json', dev_list_bmtf),
    ('./json/bmw2v.json', dev_list_bmw2v),
)
for json_path, dev_list in retrieval_outputs:
    with open(json_path, 'w') as f:
        json.dump(test_evidences_json(dev_df, dev_list, 10), f)

# Part B: Claim Labeling

## B.1 Training model and variables

### Set-up helper functions

In [None]:
def label_to_int(label):
    """Map a claim label string to its class id.

    SUPPORTS -> 0, REFUTES -> 1, NOT_ENOUGH_INFO -> 2, DISPUTED -> 3.
    For any other value an error message is printed and None is returned.
    """
    label_ids = (
        ("SUPPORTS", 0),
        ("REFUTES", 1),
        ("NOT_ENOUGH_INFO", 2),
        ("DISPUTED", 3),
    )
    for name, class_id in label_ids:
        if label == name:
            return class_id
    print("ERROR: invalid label")

def label_to_string(label):
    """Map a class id back to its claim label string.

    0 -> SUPPORTS, 1 -> REFUTES, 2 -> NOT_ENOUGH_INFO, 3 -> DISPUTED.
    For any other value an error message is printed and None is returned.
    """
    label_names = (
        (0, "SUPPORTS"),
        (1, "REFUTES"),
        (2, "NOT_ENOUGH_INFO"),
        (3, "DISPUTED"),
    )
    for class_id, name in label_names:
        if label == class_id:
            return name
    print("ERROR: invalid label")

In [None]:
#number of evidence passages paired with each claim for prediction
K = 10

#set device: use the GPU when one is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Set-up dataloaders

**Train data** will be split into 80% used for training and 20% used for validation during training. Each training claim is paired with its actual (gold) evidence, which is then topped up to a total of 10 passages with the top-10 evidence retrieved in Part A, skipping duplicates.

**Dev data** will then be used to evaluate the results of the different combinations and model. Evidence will be per part A.

**Test data** will be used for codalab submission. Evidence will be per part A.

<b> NOTE:

!! Action point, update evidence retrieval combination here !!

</b>

The below code was run on the best combinations from part A i.e. eq and bmtf. To update to eq, update the below comb from 'bmtf' to 'eq' and run again. Depending on GPU constraints, restarting kernel may be required before re-training

In [None]:
#evidence retrieved from part A to use.
#must name one of the retrieval columns stored on the claim dataframes in part A:
#'bm25', 'tfidf', 'w2v', 'eq', 'bmtf' or 'bmw2v'
comb = 'bmtf'

In [None]:
# Flat, parallel lists: position p holds one (claim, evidence, label) pair.
train_claims = []
train_evidence = []
train_labels = []

dev_claims = []
dev_evidence_part_a = []
dev_labels = []

for row in range(len(train_df.index)):
    claim = train_df.loc[row, 'claim_text']
    label = label_to_int(train_df.loc[row, 'claim_label'])
    evidence_actual = train_df.loc[row, 'evidences']
    evidence_part_a = train_df.loc[row, comb]

    # Gold evidence first: ids look like "evidence-<n>" and <n> is used as
    # the row label in evidence_df.
    evidence_list = [
        evidence_df.loc[int(evidence_id.split('-')[-1]), 'evidence']
        for evidence_id in evidence_actual
    ]
    # Then top up with the retrieved evidence until K passages are reached,
    # skipping texts that are already present.
    for evidence_text in evidence_df.loc[evidence_part_a, 'evidence']:
        if len(evidence_list) >= K:
            break
        if evidence_text not in evidence_list:
            evidence_list.append(evidence_text)

    train_claims.extend([claim] * len(evidence_list))
    train_labels.extend([label] * len(evidence_list))
    train_evidence.extend(evidence_list)

# Dev claims are paired only with their K retrieved evidences, in ranked order.
for row in range(len(dev_df.index)):
    claim = dev_df.loc[row, 'claim_text']
    label = label_to_int(dev_df.loc[row, 'claim_label'])
    evidence_list_part_a = dev_df.loc[row, comb]
    for rank in range(K):
        evidence_string = evidence_df.loc[evidence_list_part_a[rank], 'evidence']
        dev_claims.append(claim)
        dev_labels.append(label)
        dev_evidence_part_a.append(evidence_string)


In [None]:
#split train data to train and validation sets.
# Evidence, labels and claims are split in ONE call so the three lists are
# aligned by construction, instead of relying on two separate calls with
# identical arguments producing the same shuffle.
# Each element is one (claim, evidence) pair, so pairs from the same claim can
# land on both sides of the split.
train_frac = 0.8
(
    train_evidences_train, train_evidences_val,
    train_labels_train, train_labels_val,
    train_claims_train, train_claims_val,
) = train_test_split(
    train_evidence, train_labels, train_claims,
    test_size=1 - train_frac, random_state=1, stratify=train_labels,
)

assert len(train_evidences_train) == len(train_labels_train) == len(train_claims_train)
assert len(train_evidences_val) == len(train_labels_val) == len(train_claims_val)

#### RoBERTa models

In [None]:
#setup parameters
epochs = 3
max_length = 512
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_pretrained('roberta-base', num_labels=4, hidden_dropout_prob=0.3)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config, ignore_mismatched_sizes=True)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

#update model loss function to be based on train class distribution
# Weight of class i = (number of training pairs) / (number of pairs labelled i),
# so rarer classes get larger weights.
# NOTE(review): the Hugging Face forward appears to build its own unweighted
# CrossEntropyLoss when labels are passed, so this attribute only has an effect
# if the training loop calls model.loss_fct explicitly - verify.
label_counts = Counter(train_labels_train)
class_weights = torch.tensor(
    [len(train_labels_train) / label_counts[class_id] for class_id in range(4)]
).to(device)
model.loss_fct = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
class ClaimDataset(Dataset):
    """Torch dataset of (claim, evidence) pairs tokenised on access.

    Args:
        claims: sequence of claim strings.
        evidence: sequence of evidence strings, parallel to ``claims``.
        labels: sequence of integer class ids parallel to ``claims``, or None
            for unlabelled data.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every example is padded or truncated to.
    """

    def __init__(self, claims, evidence, labels, tokenizer, max_length):
        self.claims, self.evidence, self.labels = claims, evidence, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the input tensors for pair ``idx``, plus 'labels' when labels exist."""
        # Claim and evidence are joined with one space and encoded as a single text.
        text = self.claims[idx] + " " + self.evidence[idx]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away.
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Datasets: train/val carry labels, dev is built without labels (prediction only).
train_dataset = ClaimDataset(
    claims=train_claims_train, evidence=train_evidences_train, labels=train_labels_train,
    tokenizer=tokenizer, max_length=max_length,
)
val_dataset = ClaimDataset(
    claims=train_claims_val, evidence=train_evidences_val, labels=train_labels_val,
    tokenizer=tokenizer, max_length=max_length,
)
dev_dataset = ClaimDataset(
    claims=dev_claims, evidence=dev_evidence_part_a, labels=None,
    tokenizer=tokenizer, max_length=max_length,
)

# Only the training loader shuffles. val/dev keep their order so dev
# predictions can be regrouped per claim afterwards.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

In [None]:
#update learning rate as training goes
#total number of optimizer steps = batches per epoch * epochs
total_steps = len(train_dataloader) * epochs
#linear learning-rate schedule over total_steps with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
for epoch in range(epochs):
    #training
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Labels are deliberately NOT passed to the model. When they are, the
        # Hugging Face forward computes outputs.loss with its own unweighted
        # CrossEntropyLoss and never reads model.loss_fct, so the class
        # weights would be ignored. The weighted loss is computed from the
        # logits instead.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = model.loss_fct(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # one learning-rate update per batch
        progress_bar.set_postfix({"loss": loss.item()})

    #validation
    model.eval()
    progress_bar = tqdm(val_dataloader, desc=f"Validation {epoch + 1}/{epochs}", unit="batch")
    correct_predictions = 0
    total_predictions = 0

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted class = arg-max logit. Accuracy is per (claim, evidence) pair.
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)
        progress_bar.set_postfix({"accuracy": correct_predictions / total_predictions})

    validation_accuracy = correct_predictions / total_predictions
    print(f"Validation accuracy: {validation_accuracy:.4f}")

#### BERT models

In [None]:
#setup parameters
epochs = 3
max_length = 512
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=4, hidden_dropout_prob=0.3)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config, ignore_mismatched_sizes=True)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

#update model loss function to be based on train class distribution
# Weight of class i = (number of training pairs) / (number of pairs labelled i),
# so rarer classes get larger weights.
# NOTE(review): the Hugging Face forward appears to build its own unweighted
# CrossEntropyLoss when labels are passed, so this attribute only has an effect
# if the training loop calls model.loss_fct explicitly - verify.
label_counts = Counter(train_labels_train)
class_weights = torch.tensor(
    [len(train_labels_train) / label_counts[class_id] for class_id in range(4)]
).to(device)
model.loss_fct = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
class ClaimDataset(Dataset):
    """Torch dataset of (claim, evidence) pairs tokenised on access.

    Args:
        claims: sequence of claim strings.
        evidence: sequence of evidence strings, parallel to ``claims``.
        labels: sequence of integer class ids parallel to ``claims``, or None
            for unlabelled data.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every example is padded or truncated to.
    """

    def __init__(self, claims, evidence, labels, tokenizer, max_length):
        self.claims, self.evidence, self.labels = claims, evidence, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the input tensors for pair ``idx``, plus 'labels' when labels exist."""
        # Claim and evidence are joined with one space and encoded as a single text.
        text = self.claims[idx] + " " + self.evidence[idx]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away.
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Datasets: train/val carry labels, dev is built without labels (prediction only).
train_dataset = ClaimDataset(
    claims=train_claims_train, evidence=train_evidences_train, labels=train_labels_train,
    tokenizer=tokenizer, max_length=max_length,
)
val_dataset = ClaimDataset(
    claims=train_claims_val, evidence=train_evidences_val, labels=train_labels_val,
    tokenizer=tokenizer, max_length=max_length,
)
dev_dataset = ClaimDataset(
    claims=dev_claims, evidence=dev_evidence_part_a, labels=None,
    tokenizer=tokenizer, max_length=max_length,
)

# Only the training loader shuffles. val/dev keep their order so dev
# predictions can be regrouped per claim afterwards.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

In [None]:
#update learning rate as training goes
#total number of optimizer steps = batches per epoch * epochs
total_steps = len(train_dataloader) * epochs
#linear learning-rate schedule over total_steps with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
for epoch in range(epochs):
    #training
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Labels are deliberately NOT passed to the model. When they are, the
        # Hugging Face forward computes outputs.loss with its own unweighted
        # CrossEntropyLoss and never reads model.loss_fct, so the class
        # weights would be ignored. The weighted loss is computed from the
        # logits instead.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = model.loss_fct(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # one learning-rate update per batch
        progress_bar.set_postfix({"loss": loss.item()})

    #validation
    model.eval()
    progress_bar = tqdm(val_dataloader, desc=f"Validation {epoch + 1}/{epochs}", unit="batch")
    correct_predictions = 0
    total_predictions = 0

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted class = arg-max logit. Accuracy is per (claim, evidence) pair.
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)
        progress_bar.set_postfix({"accuracy": correct_predictions / total_predictions})

    validation_accuracy = correct_predictions / total_predictions
    print(f"Validation accuracy: {validation_accuracy:.4f}")

## Run predictions based on the trained models

### Set-up helper functions

In [None]:
#get predictions based on model
def predict(model, dataloader):
    """Run the model over a dataloader and return one predicted class per example.

    Args:
        model: callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``; it is switched to eval mode first.
        dataloader: iterable of batches with "input_ids" and "attention_mask".

    Returns:
        A flat list of predicted class ids (arg-max over the logits), in
        dataloader order. Tensors are moved to the module-level ``device``.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", unit="batch"):
            logits = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            predictions.extend(logits.argmax(dim=1).cpu().numpy())

    return predictions

In [None]:
#split evidence to relevant claims i.e. to list of list
def split_evidence(predictions, n):
    """Cut a flat prediction sequence into consecutive chunks of n items.

    The last chunk is shorter when len(predictions) is not a multiple of n.
    """
    chunks = []
    for start in range(0, len(predictions), n):
        chunks.append(predictions[start:start + n])
    return chunks

In [None]:
#given the list of list of evidences per claim, find the majority label
def find_majority_mjr(preds):
    """Return the most frequent prediction and the positions where it occurs.

    Args:
        preds: non-empty sequence of predicted class ids for one claim.

    Returns:
        ``(majority, indexes)``: the most common value as reported by
        ``Counter.most_common`` and the list of positions in ``preds``
        holding that value.
    """
    majority = Counter(preds).most_common(1)[0][0]
    indexes = []
    for position, value in enumerate(preds):
        if value == majority:
            indexes.append(position)
    return majority, indexes

In [None]:
def to_final_json(claim_id, claim_text, evidence_list, labels, num_evidence):
    """Assemble the submission dictionary from parallel per-claim sequences.

    Args:
        claim_id: claim ids, indexable by 0..len(claim_id)-1.
        claim_text: claim texts, parallel to ``claim_id``.
        evidence_list: per-claim sequences of evidence row indices.
        labels: per-claim predicted label strings.
        num_evidence: maximum number of evidences to keep per claim.

    Returns:
        ``{claim_id: {"claim_text", "claim_label", "evidences"}}`` with each
        evidence formatted as "evidence-<index>".
    """
    keep = max(num_evidence, 0)  # a non-positive limit keeps nothing
    all_data = {}
    for pos in range(len(claim_id)):
        ev_list = evidence_list[pos]
        key = claim_id[pos]
        text = claim_text[pos]
        label = labels[pos]
        all_data[key] = {
            "claim_text": text,
            "claim_label": label,
            "evidences": ["evidence-" + str(ev) for ev in ev_list[:keep]],
        }
    return all_data

### Get predictions

In [None]:
#get dev predictions. 
# The dev loader holds K consecutive (claim, evidence) pairs per claim, so the
# flat predictions are regrouped into one list of K predictions per claim.
dev_predictions = split_evidence(predict(model, dev_dataloader), K)
print("dev predictions: ", dev_predictions, sep="\n")


In [None]:
#save labels and evidence (per selecting evidence methodology)
# mjr: only the evidences whose prediction equals the claim's majority label
# ord: all retrieved evidences in their original ranked order

final_dev_labels = []
final_dev_evidence_mjr = []
final_dev_evidence_ord = []
for claim_pos, preds in enumerate(dev_predictions):
    evidence_ids = dev_df.loc[claim_pos, comb]
    majority, indexes = find_majority_mjr(preds)
    final_dev_labels.append(label_to_string(majority))
    final_dev_evidence_mjr.append([evidence_ids[j] for j in indexes])
    final_dev_evidence_ord.append(evidence_ids)
    


In [None]:
#save dev in the json format required (with actual predicted labels this time) to analyse performance

dev_ids = dev_df['index']
dev_texts = dev_df['claim_text']

# majority-filtered evidence, keeping at most 4 / 5 / 6 per claim
dev_4_mjr_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_mjr, final_dev_labels, 4)
dev_5_mjr_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_mjr, final_dev_labels, 5)
dev_6_mjr_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_mjr, final_dev_labels, 6)
# evidence in ranked order, keeping the first 4 / 5 / 6 per claim
dev_4_ord_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_ord, final_dev_labels, 4)
dev_5_ord_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_ord, final_dev_labels, 5)
dev_6_ord_json = to_final_json(dev_ids, dev_texts, final_dev_evidence_ord, final_dev_labels, 6)


In [None]:
#update model name and evidence retrieval methodology in save file name
#as required for other runs e.g. dev_bert_bmtf_4_mjr.json etc.
# NOTE(review): the file names below are hard-coded to "rbert_eq"; they are not
# derived from `comb` or from the model that was actually trained - confirm
# they match the current run before saving.

dev_outputs = (
    ('./json/dev_rbert_eq_4_mjr.json', dev_4_mjr_json),
    ('./json/dev_rbert_eq_5_mjr.json', dev_5_mjr_json),
    ('./json/dev_rbert_eq_6_mjr.json', dev_6_mjr_json),
    ('./json/dev_rbert_eq_4_ord.json', dev_4_ord_json),
    ('./json/dev_rbert_eq_5_ord.json', dev_5_ord_json),
    ('./json/dev_rbert_eq_6_ord.json', dev_6_ord_json),
)
for json_path, payload in dev_outputs:
    with open(json_path, 'w') as f:
        json.dump(payload, f)
    

## B.2 Hyperparameter tuning on best performing

Hyperparameter tune on rbert_bmtf_4_ord. Test the following parameters:
- batch_size = 8, learning_rate = 1e-5, epoch = 2
- batch_size = 8, learning_rate = 1e-5, epoch = 3
- batch_size = 8, learning_rate = 3e-5, epoch = 2
- batch_size = 8, learning_rate = 3e-5, epoch = 3
- batch_size = 16, learning_rate = 1e-5, epoch = 2
- batch_size = 16, learning_rate = 1e-5, epoch = 3
- batch_size = 16, learning_rate = 3e-5, epoch = 2
- batch_size = 16, learning_rate = 3e-5, epoch = 3

<b> NOTE:

!! Action point, update batch_size, learning_rate, and epochs for hyperparameter tuning !!

</b>

In [None]:
#code below is basically the code from part 1. update lr, batch_size and epoch to get results.

#retrieval column (from part A) whose evidence is fed to the classifier
comb = 'bmtf'
#hyperparameters under test: change these three values for each tuning run
batch_size = 16
learning_rate = 1e-5
epochs = 3

In [None]:
class ClaimDataset(Dataset):
    """Torch dataset of (claim, evidence) pairs tokenised on access.

    Args:
        claims: sequence of claim strings.
        evidence: sequence of evidence strings, parallel to ``claims``.
        labels: sequence of integer class ids parallel to ``claims``, or None
            for unlabelled data.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every example is padded or truncated to.
    """

    def __init__(self, claims, evidence, labels, tokenizer, max_length):
        self.claims, self.evidence, self.labels = claims, evidence, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the input tensors for pair ``idx``, plus 'labels' when labels exist."""
        # Claim and evidence are joined with one space and encoded as a single text.
        text = self.claims[idx] + " " + self.evidence[idx]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away.
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
#setup dataloaders

# Flat, parallel lists: position p holds one (claim, evidence[, label]) pair.
train_claims = []
train_evidence = []
train_labels = []

dev_claims = []
dev_evidence_part_a = []
dev_labels = []

test_claims = []
test_evidence_part_a = []

for row in range(len(train_df.index)):
    claim = train_df.loc[row, 'claim_text']
    label = label_to_int(train_df.loc[row, 'claim_label'])
    evidence_actual = train_df.loc[row, 'evidences']
    evidence_part_a = train_df.loc[row, comb]

    # Gold evidence first: ids look like "evidence-<n>" and <n> is used as
    # the row label in evidence_df.
    evidence_list = [
        evidence_df.loc[int(evidence_id.split('-')[-1]), 'evidence']
        for evidence_id in evidence_actual
    ]
    # Then top up with the retrieved evidence until K passages are reached,
    # skipping texts that are already present.
    for evidence_text in evidence_df.loc[evidence_part_a, 'evidence']:
        if len(evidence_list) >= K:
            break
        if evidence_text not in evidence_list:
            evidence_list.append(evidence_text)

    train_claims.extend([claim] * len(evidence_list))
    train_labels.extend([label] * len(evidence_list))
    train_evidence.extend(evidence_list)

# Dev claims are paired only with their K retrieved evidences, in ranked order.
for row in range(len(dev_df.index)):
    claim = dev_df.loc[row, 'claim_text']
    label = label_to_int(dev_df.loc[row, 'claim_label'])
    evidence_list_part_a = dev_df.loc[row, comb]
    for rank in range(K):
        evidence_string = evidence_df.loc[evidence_list_part_a[rank], 'evidence']
        dev_claims.append(claim)
        dev_labels.append(label)
        dev_evidence_part_a.append(evidence_string)

# Test claims have no labels; only claim/evidence pairs are collected.
for row in range(len(test_df.index)):
    claim = test_df.loc[row, 'claim_text']
    evidences_list_part_a = test_df.loc[row, comb]
    for rank in range(K):
        evidence_string = evidence_df.loc[evidences_list_part_a[rank], 'evidence']
        test_claims.append(claim)
        test_evidence_part_a.append(evidence_string)
        
#split train data to train and validation sets.
# All three parallel lists go through ONE train_test_split call, so every
# claim stays aligned with its evidence and label by construction instead of
# relying on two separate calls reproducing the same shuffle.
train_frac = 0.8
(
    train_evidences_train, train_evidences_val,
    train_labels_train, train_labels_val,
    train_claims_train, train_claims_val,
) = train_test_split(
    train_evidence,
    train_labels,
    train_claims,
    test_size=1 - train_frac,
    random_state=1,         # fixed seed keeps the split reproducible
    stratify=train_labels,  # stratify both subsets on the label values
)

# Sanity check: the parallel lists must have matching lengths in both subsets.
assert len(train_evidences_train) == len(train_labels_train) == len(train_claims_train)
assert len(train_evidences_val) == len(train_labels_val) == len(train_claims_val)

#setup parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
max_length = 512
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# 4-way classification head on top of roberta-base, with hidden dropout set to 0.3.
config = RobertaConfig.from_pretrained('roberta-base', num_labels=4, hidden_dropout_prob=0.3)
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    config=config,
    ignore_mismatched_sizes=True,
)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

#update model loss function to be based on train class distribution
# Each class weight is (number of training rows) / (rows with that label), so
# rarer classes weigh more. Raises ZeroDivisionError if a label in 0..3 is
# absent from train_labels_train.
n_train_rows = len(train_labels_train)
class_weights = torch.tensor(
    [n_train_rows / train_labels_train.count(class_id) for class_id in range(4)]
).to(device)
# NOTE(review): Hugging Face's RobertaForSequenceClassification.forward
# instantiates its own unweighted CrossEntropyLoss when `labels` is passed and
# does not read this attribute, so `outputs.loss` ignores these weights unless
# the training code calls `model.loss_fct` on the logits itself.
model.loss_fct = nn.CrossEntropyLoss(weight=class_weights)

# One ClaimDataset + DataLoader per split. Only the training loader shuffles;
# the other loaders keep row order.
train_dataset = ClaimDataset(train_claims_train, train_evidences_train, train_labels_train, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = ClaimDataset(train_claims_val, train_evidences_val, train_labels_val, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

dev_dataset = ClaimDataset(dev_claims, dev_evidence_part_a, dev_labels, tokenizer, max_length)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

# The test split has no labels, hence None.
test_dataset = ClaimDataset(test_claims, test_evidence_part_a, None, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

#update learning rate as training goes
# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) * epochs steps, with no warmup.
batches_per_epoch = len(train_dataloader)
total_steps = batches_per_epoch * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune the classifier for `epochs` epochs, reporting validation accuracy
# after each one. `validation_accuracy` keeps the value from the last epoch
# (it stays at the -1 sentinel only if no epoch runs).
validation_accuracy = -1

# Class-weighted criterion attached to the model in the setup cell. It has to
# be applied explicitly: when `labels` is passed to the Hugging Face model, its
# forward pass builds its own unweighted CrossEntropyLoss, so `outputs.loss`
# would silently ignore the class weights.
criterion = model.loss_fct

for epoch in range(epochs):
    #training
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # Forward pass without labels; the weighted loss is computed from the logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch
        progress_bar.set_postfix({"loss": loss.item()})

    #validation
    model.eval()
    progress_bar = tqdm(val_dataloader, desc=f"Validation {epoch + 1}/{epochs}", unit="batch")
    correct_predictions = 0
    total_predictions = 0

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit for each row.
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)
        progress_bar.set_postfix({"accuracy": correct_predictions / total_predictions})

    validation_accuracy = correct_predictions / total_predictions
    print(f"Validation accuracy: {validation_accuracy:.4f}")

print(f"Last validation accuracy for hyperparameter: Learning rate = {learning_rate}, Batch size = {batch_size}, Validation accuracy = {validation_accuracy}")

In [None]:
#get dev results and analyse

# Predict one label per (claim, evidence) pair, then regroup into K predictions per claim.
dev_predictions = split_evidence(predict(model, dev_dataloader), K)
print("dev predictions: ")
print(dev_predictions)

final_dev_labels = []
final_dev_evidence_ord = []
for row in range(len(dev_predictions)):
    claim_preds = dev_predictions[row]
    retrieved_ids = dev_df.loc[row, comb]
    # Only the majority label is used; the second return value is not needed here.
    majority, _indexes = find_majority_mjr(claim_preds)
    final_dev_labels.append(label_to_string(majority))
    final_dev_evidence_ord.append(retrieved_ids)

dev_4_ord_json = to_final_json(
    dev_df['index'],
    dev_df['claim_text'],
    final_dev_evidence_ord,
    final_dev_labels,
    4,
)

#update file name based on batch size, learning rate, and number of epoch used 

with open('./json/dev_final_8_1e5_2.json', 'w') as out_file:
    json.dump(dev_4_ord_json, out_file)

### Get test predictions from the final, best hyperparameter-tuned model

<b> NOTE:

!! Action point: make sure the last model trained is the best/final model before running the following code for test predictions. !!

</b>

In [None]:
#setup dataloaders

# Rebuild the test (claim, evidence) pairs from scratch: K rows per test claim.
test_claims = []
test_evidence_part_a = []

for row in range(len(test_df.index)):
    claim = test_df.loc[row, 'claim_text']
    evidences_list_part_a = test_df.loc[row, comb]

    # Look up the evidence text for each of the K retrieved row labels, in order.
    top_k_texts = [
        evidence_df.loc[evidences_list_part_a[rank], 'evidence']
        for rank in range(K)
    ]

    test_claims.extend([claim] * K)
    test_evidence_part_a.extend(top_k_texts)

max_length = 512
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# The test split has no labels, hence None; the loader keeps row order.
test_dataset = ClaimDataset(test_claims, test_evidence_part_a, None, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
# Predict one label per (claim, evidence) pair, then regroup into K predictions per claim.
test_predictions = split_evidence(predict(model, test_dataloader), K)

final_test_labels = []
final_test_evidence_ord = []

for row in range(len(test_predictions)):
    claim_preds = test_predictions[row]
    retrieved_ids = test_df.loc[row, comb]
    # Only the majority label is used; the second return value is not needed here.
    majority, _indexes = find_majority_mjr(claim_preds)
    final_test_labels.append(label_to_string(majority))
    final_test_evidence_ord.append(retrieved_ids)

test_4_ord_json = to_final_json(
    test_df['index'],
    test_df['claim_text'],
    final_test_evidence_ord,
    final_test_labels,
    4,
)

with open('test-claims-predictions.json', 'w') as out_file:
    json.dump(test_4_ord_json, out_file)

# Load Preprocessed Data and Scores

If any pickled files need to be reloaded, update the file paths and variable names in the cells below.

In [None]:
# Reload the pickled DataFrames. Each file is opened in a `with` block so its
# handle is closed as soon as the object has been read (the bare open() calls
# used before were never closed).
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open("./pickles/evidence_df.pkl", 'rb') as file:
    evidence_df = pickle.load(file)

with open("./pickles/dev_df.pkl", 'rb') as file:
    dev_df = pickle.load(file)

with open("./pickles/test_df.pkl", 'rb') as file:
    test_df = pickle.load(file)

with open("./pickles/train_df.pkl", 'rb') as file:
    train_df = pickle.load(file)


In [None]:
# Reload the pickled retrieval scores (bm25, tfidf, w2v) for the dev, test and
# train splits. Each file is opened in a `with` block so its handle is closed
# as soon as the object has been read (the bare open() calls used before were
# never closed).
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open("./pickles/dev_scores_bm25.pkl", 'rb') as file:
    dev_scores_bm25 = pickle.load(file)

with open("./pickles/test_scores_bm25.pkl", 'rb') as file:
    test_scores_bm25 = pickle.load(file)

with open("./pickles/train_scores_bm25.pkl", 'rb') as file:
    train_scores_bm25 = pickle.load(file)

with open("./pickles/dev_scores_tfidf.pkl", 'rb') as file:
    dev_scores_tfidf = pickle.load(file)

with open("./pickles/test_scores_tfidf.pkl", 'rb') as file:
    test_scores_tfidf = pickle.load(file)

with open("./pickles/train_scores_tfidf.pkl", 'rb') as file:
    train_scores_tfidf = pickle.load(file)

with open("./pickles/dev_scores_w2v.pkl", 'rb') as file:
    dev_scores_w2v = pickle.load(file)

with open("./pickles/test_scores_w2v.pkl", 'rb') as file:
    test_scores_w2v = pickle.load(file)

with open("./pickles/train_scores_w2v.pkl", 'rb') as file:
    train_scores_w2v = pickle.load(file)