### Claim Classification

#### Adapted from the DistilBERT example on the Hugging Face website

In [None]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Setting up the device: prefer Apple-Silicon MPS, then CUDA, then plain CPU

import torch

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    # Allocate a tiny tensor as a smoke test that the MPS backend is usable.
    x = torch.ones(1, device=mps_device)
    print (x)
    device = mps_device
else:
    print ("MPS device not found.")
    # `mps_device` is never bound on this branch, so `device` must be chosen
    # here; assigning `device = mps_device` unconditionally raised NameError.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

<a id='section02'></a>
### Importing and Pre-Processing the domain data


In [None]:
# BASE_PATH should point to FactCheckNLPApp/ ; the file paths below are built by
# plain string concatenation, so keep the trailing slash.
BASE_PATH = './'

# Binary task: "true" is class 0, every other verdict collapses into class 1.
NUM_CLASSES = 2
LABEL_TO_ID = {"true": 0, **dict.fromkeys(("false", "mixture", "unproven"), 1)}

# Names of the CSV columns that are read.
CLAIM_feature = 'claim'
EVIDENCE_FEATURE = 'top_k'
FEATURES = [CLAIM_feature, EVIDENCE_FEATURE, 'label']

# One CSV file per split.
test_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_test_most_similar.csv'
dev_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_dev_most_similar.csv'
train_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_train_most_similar.csv'

'''
train_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/train_claim_sentence_pair.csv'
dev_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/dev_claim_sentence_pair.csv'
test_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/test_claim_sentence_pair.csv'
FEATURES = ['claim', 'evidence_sentence', 'label']

EVIDENCE_FEATURE = 'evidence_sentence'
'''

In [None]:
def add_some_random_train_data():
    """Build a small frame of hand-written (claim, evidence, label) examples.

    Returns:
        pandas.DataFrame with the columns ``claim``, ``top_k`` and ``label``;
        label 0 marks the seven "true" rows, label 1 the two "false" rows.
    """
    columns = ('claim', 'top_k', 'label')
    rows = [
        # True claims
        (
            "earth is round.",
            "earth is round studies have proven. Flat earthers have give up. It has been proven scientifically. There is no doubt about this fact. Several studies have confirmed. Satellites rely on the roundness of earth. It has been established beyond doubt that earth is spherical.",
            0,
        ),
        (
            "There are 365 days in a year.",
            "One year has 365 days is a known fact. This doesnot require any proof. It has always been True.",
            0,
        ),
        (
            "Being healthy is good for the human body",
            "It is true that Being healthy is good for the human body. Running/Walking or any form of exercise really good for you. As it promotes good health.",
            0,
        ),
        (
            "Excessive sugar consumption is bad for you.",
            "Studies have proven that too much sugar consumption is not good for you. It can cause weight gain. It can also cause other health issues. Avoiding sugar is good. Studies have proven that excessive sugar consumption is bad for your healthy.",
            0,
        ),
        (
            "Drinking water is good for you.",
            "It is true that drinking water is good for you. All published research indicates that drinking water is good for you.",
            0,
        ),
        (
            "This claim is True.",
            "This claim is not wrong. This claim is right. This claim is correct. This claim has been proven correct.",
            0,
        ),
        (
            "This claim is False.",
            "This claim is indeed incorrect. This claim is not right. This claim is frivolous. It is based on fabricated research. It is based on baseless claims.",
            0,
        ),
        # False claims
        (
            "This claim is True.",
            "This claim is indeed incorrect. This claim is not right. This claim is frivolous. It is based on fabricated research. It is based on baseless claims.",
            1,
        ),
        (
            "Exercise is bad for you.",
            "Exercise is good for you. It has been proven mutliple times. All published studies indicate the benefits of exercise on health.",
            1,
        ),
    ]
    records = [dict(zip(columns, row)) for row in rows]
    return pd.DataFrame(records)

In [None]:

def prepare_df(file_path):
    """Load one split CSV and replace its string labels with integer class ids.

    Args:
        file_path: Path of a CSV file that contains at least the columns
            listed in ``FEATURES``.

    Returns:
        pandas.DataFrame restricted to the ``FEATURES`` columns, with a fresh
        0..n-1 index and ``label`` replaced by ``LABEL_TO_ID[label]``.

    Raises:
        KeyError: If a ``FEATURES`` column is missing from the file, or a
            label value is not a key of ``LABEL_TO_ID`` (unknown labels are
            reported rather than silently dropped).
    """
    # .copy() turns the column subset into an independent frame, so assigning
    # the encoded labels below cannot trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(file_path)[FEATURES].copy()
    # Show which raw label values this split contains before encoding them.
    print(df['label'].unique())
    df.reset_index(inplace=True, drop=True)

    df['label'] = [LABEL_TO_ID[lbl] for lbl in df['label'].values]
    return df

# Load the train, dev and test splits (in that order) through the same preprocessing
train_df, valid_df, test_df = [
    prepare_df(split_path) for split_path in (train_path, dev_path, test_path)
]

In [None]:

# Append the hand-written examples to the training split and renumber the rows 0..n-1
train_df = pd.concat([train_df, add_some_random_train_data()], ignore_index=True)

<a id='section03'></a>
### Preparing the Dataset and Dataloader



In [None]:
# Hyper-parameters and the tokenizer shared by the dataset, model and training cells
MAX_LEN = 512  # max_length handed to the tokenizer; inputs are truncated/padded to it
TRAIN_BATCH_SIZE = 4  # batch size of the training DataLoader
VALID_BATCH_SIZE = 1  # batch size of the validation and test DataLoaders
EPOCHS = 2  # number of train-then-validate passes
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer
MODEL_NAME = 'distilbert-base-uncased'  # checkpoint name used for both tokenizer and encoder
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
class Triage(Dataset):
    """Torch dataset that tokenizes the "claim + evidence" rows of a dataframe.

    Each item joins the claim text and its evidence text into one
    whitespace-normalized string and encodes it to exactly ``max_len`` tokens.

    Args:
        dataframe: Frame with a ``claim`` column, a ``label`` column and the
            column named by the module-level ``EVIDENCE_FEATURE``.
        tokenizer: Object exposing ``encode_plus`` (a ``DistilBertTokenizer``
            in this notebook).
        max_len: Length every encoded sequence is truncated or padded to.
        claim_index_map: Mapping from claim text to an integer claim id.
    """

    def __init__(self, dataframe, tokenizer, max_len, claim_index_map):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.claim_index_map = claim_index_map

    def __getitem__(self, index):
        """Return the encoded example stored at row position ``index``.

        Returns:
            dict with ``claim_index`` (id of the row's claim), ``ids`` and
            ``mask`` (long tensors built from the tokenizer's ``input_ids``
            and ``attention_mask``) and ``targets`` (long tensor of the label).

        Raises:
            KeyError: If the row's claim is missing from ``claim_index_map``.
        """
        # Use positional access for every column. The claim and label used to
        # be looked up by index *label* while the evidence was looked up by
        # *position*, which only agree when the frame has a 0..n-1 index.
        claim = self.data['claim'].iloc[index]
        evidence = self.data[EVIDENCE_FEATURE].iloc[index]
        label = self.data['label'].iloc[index]

        # Single-sequence input: claim and evidence separated by one space,
        # with all runs of whitespace collapsed.
        title = str(claim + " " + evidence)
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            # padding='max_length' is the supported spelling of the deprecated
            # pad_to_max_length=True flag.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'claim_index': self.claim_index_map[claim],
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows the dataframe had at construction time."""
        return self.len
    
    
def claim_index(df):
    """Assign consecutive integer ids to the distinct claims of ``df``.

    Args:
        df: Frame with a ``claim`` column.

    Returns:
        dict mapping each distinct claim to an id, numbered from 0 in order of
        first appearance.
    """
    distinct_claims = df['claim'].unique()
    return {claim: position for position, claim in enumerate(distinct_claims)}
    

In [None]:
# Creating the datasets for the neural network (the dataloaders are built in the next cell)

# Each split gets a 0..n-1 index and its own claim -> id mapping.
train_dataset = train_df.reset_index(drop=True)
train_claim_idx_map = claim_index(train_dataset)

valid_dataset = valid_df.reset_index(drop=True)
valid_claim_idx_map = claim_index(valid_dataset)

test_dataset = test_df.reset_index(drop=True)
test_claim_idx_map = claim_index(test_dataset)

print("TRAIN Dataset: {}".format(train_dataset.shape))
# The validation shape used to be printed under the label "TEST Dataset".
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN, train_claim_idx_map)
valid_set = Triage(valid_dataset, tokenizer, MAX_LEN, valid_claim_idx_map)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN, test_claim_idx_map)

In [None]:
# Training batches are reshuffled on every pass over the data.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the progress
# logs and the order of the collected predictions reproducible between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **test_params)
testing_loader = DataLoader(testing_set, **test_params)

<a id='section04'></a>
### Creating the Neural Network for Fine Tuning

In [None]:
# Creating the customized model by adding a dropout and a dense layer on top of DistilBERT to produce the final output of the model.

class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    The head is Linear -> ReLU -> Dropout(0.3) -> Linear and is applied to the
    encoder output at the first token position; it produces ``NUM_CLASSES``
    logits per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(MODEL_NAME)
        # Take the encoder width from its config instead of hard-coding 768, so
        # the head still fits if MODEL_NAME is switched to a DistilBERT
        # checkpoint with a different hidden size.
        hidden_size = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(hidden_size, NUM_CLASSES)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of encoded sequences.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Output of the final linear layer (``NUM_CLASSES`` values per row).
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Keep only position 0 of every sequence as the pooled representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no need to build a new module object on every call.
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
# Instantiate the classifier and move its parameters to the selected device
model = DistillBERTClass()
model.to(device)

In [None]:
# Cross-entropy loss over the class logits, minimized with Adam at LEARNING_RATE
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Fine Tuning the Model

In [None]:
# Function to count the correct predictions in a batch (used to compute the accuracy of the model)

def calcuate_accu(big_idx, targets):
    """Count how many predicted class ids equal their targets.

    Despite the name, this returns a raw count, not a ratio.

    Args:
        big_idx: Predicted class ids.
        targets: True class ids, comparable element-wise with ``big_idx``.

    Returns:
        Number of matching positions as a plain Python number.
    """
    matches = big_idx == targets
    return matches.sum().item()

In [None]:
# Defining the training function that fine-tunes the DistilBERT model for one epoch over the training set

def train(epoch):
    """Run one optimization pass over ``training_loader``.

    Works on the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``. Prints the running loss/accuracy whenever the batch index is a
    multiple of 5000 (including the first batch) and the epoch totals at the end.

    Args:
        epoch: Epoch number; only used in the printed summary.
    """
    loss_total = 0
    n_correct = 0
    step_count = 0
    nb_tr_examples = 0
    model.train()
    for step, batch in enumerate(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        # Clear the gradients left over from the previous step before this
        # batch's backward pass.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        loss_total += loss.item()

        # The index of the largest logit in each row is the predicted class.
        _, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        step_count += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = loss_total / step_count
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = loss_total / step_count
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")


def valid(model, testing_loader):
    """Evaluate ``model`` on every batch of ``testing_loader`` without gradients.

    Uses the module-level ``device``. Works for any batch size.

    Args:
        model: Module called as ``model(ids, mask)`` that returns one row of
            class logits per example.
        testing_loader: Iterable of batches with the keys ``ids``, ``mask``,
            ``targets`` and ``claim_index``.

    Returns:
        Tuple ``(accuracy, preds, probs, lbls, claim_ids)``:
        ``accuracy`` is the percentage of correct predictions (0.0 when the
        loader yields nothing); the other four are lists with one entry per
        evaluated example - predicted class id, softmax probability of that
        class, true label, and claim id. ``probs`` used to be a single number
        (the raw max logit of the last example only), which made every row of
        a prediction table built from it carry the same value.
    """
    model.eval()
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    preds = []
    probs = []
    lbls = []
    claim_ids = []
    with torch.no_grad():
        for data in testing_loader:
            nb_tr_steps += 1
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            nb_tr_examples += targets.size(0)

            outputs = model(ids, mask)

            # Softmax turns each row of logits into probabilities; the max
            # along dim 1 gives the predicted class and its confidence.
            confidences, big_idx = torch.max(torch.softmax(outputs, dim=1), dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            # extend() + tolist() keep one entry per example whatever the
            # batch size is; .item() only worked for batches of one.
            claim_ids.extend(data['claim_index'].tolist())
            preds.extend(big_idx.tolist())
            probs.extend(confidences.tolist())
            lbls.extend(targets.tolist())

            if nb_tr_steps%1000 == 0:
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Accuracy per 1000 steps: {accu_step}")

    print(nb_tr_steps, nb_tr_examples)
    # Guard against an empty loader instead of dividing by zero.
    epoch_accu = (n_correct*100)/nb_tr_examples if nb_tr_examples else 0.0
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu, preds, probs, lbls, claim_ids

In [None]:
# Fine-tune for EPOCHS passes, checking accuracy on the validation loader after each pass
for epoch in range(EPOCHS):
    print(f'Epoch: {epoch}')
    
    train(epoch)
    valid(model, valid_loader)

In [None]:

# Manual smoke test: score one hand-written claim/evidence pair with the model.
# These are the pairs tried so far; only the last one is scored. The earlier
# pairs used to be plain reassignments that were immediately overwritten.
probe_pairs = [
    ("Chocolates are sweet.", "Chocolates are sweet."),
    ("Chocolates are not sweet", "Chocolates are not sweet."),
    ("Chocolates are sweet", "Chocolates are sweet."),
    ("Humans are honest", "Humans are honest."),
    ("Sky is blue.", "Sky is blue"),
]
claim_text, evidence_text = probe_pairs[-1]

## seems like period makes difference
title = str(claim_text + " " + evidence_text)
title = " ".join(title.split())
inputs = tokenizer.encode_plus(
    title,
    add_special_tokens=True,
    # Same length as the training data rather than a second hard-coded 512.
    max_length=MAX_LEN,
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True flag.
    padding='max_length',
    return_token_type_ids=True,
    truncation=True
)
ids = torch.tensor([inputs['input_ids']], dtype=torch.long).to(device)
mask = torch.tensor([inputs['attention_mask']], dtype=torch.long).to(device)

# Make sure dropout is off and no autograd graph is built, whatever cell ran last.
model.eval()
with torch.no_grad():
    outputs = model(ids, mask).squeeze()
prob = torch.max(outputs).item()
big_idx = torch.argmax(outputs)
outputs

<a id='section06'></a>
### Validating the Model

In [None]:
from sklearn.metrics import classification_report

# Score the test loader, then print overall accuracy and the per-class report
acc, preds, pred_prob, lbls, claim_ids = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

test_report = classification_report(y_true=lbls, y_pred=preds, digits=4)
print(test_report)

In [None]:
from collections import Counter
def find_max(vals):
    """Return the largest ``.item()`` value among the elements of ``vals``.

    Args:
        vals: Iterable of objects that expose ``.item()``.

    Raises:
        ValueError: If ``vals`` is empty.
    """
    return max(val.item() for val in vals)

def find_majority(vals):
    """Return the most frequent element of ``vals``.

    Ties go to the tied element that appears first in ``vals``.

    Args:
        vals: Iterable of values.

    Raises:
        ValueError: If ``vals`` is empty.
    """
    lst = list(vals)
    try:
        # One counting pass instead of calling lst.count() once per element.
        counts = Counter(lst)
    except TypeError:
        # Unhashable elements cannot be counted in a dict; keep the
        # quadratic scan for them.
        return max(lst, key=lst.count)
    # A Counter iterates its keys in first-seen order and max() keeps the first
    # maximum, which is the same tie-break as scanning the list itself.
    return max(counts, key=counts.get)

def map_gt(gt):
    """Translate an iterable of label names into integer class ids.

    Args:
        gt: Iterable of keys of the module-level ``LABEL_TO_ID`` mapping.

    Returns:
        List of the mapped ids, in input order.

    Raises:
        KeyError: If a value is not a key of ``LABEL_TO_ID``.
    """
    return [LABEL_TO_ID[name] for name in gt]

In [None]:
# One row per evaluated example, ordered by claim id so rows of the same claim sit together
pred_df = (
    pd.DataFrame({
        'claim_id': claim_ids,
        'preds': preds,
        'pred_prob': pred_prob,
        'label': lbls
    })
    .sort_values(by='claim_id')
    .reset_index(drop=True)
)

In [None]:
# pred_df.groupby(['claim_id', 'label']).max()

# Show the distinct values stored in the pred_prob column
pred_df['pred_prob'].unique()

In [None]:
# Collapse the per-row predictions into one majority-vote prediction per (claim_id, label) group
grouped_preds = pred_df.groupby(['claim_id', 'label'])['preds']
new_pred_df = grouped_preds.apply(find_majority).reset_index()

new_pred_df

In [None]:
from sklearn.metrics import classification_report

# Report again on the aggregated frame: one row per (claim_id, label) group
lbls, preds = new_pred_df['label'].values, new_pred_df['preds'].values
print(classification_report(y_true=lbls, y_pred=preds, digits=4))

In [None]:
# Display the current contents of `preds`
preds

<a id='section07'></a>
### Saving the Trained Model for Inference

In [None]:
# Persist only the learned weights (the state_dict), not the Python class itself
weights_path = '/tmp/claim_classification_true_false_distillbert-2epochs'
torch.save(model.state_dict(), weights_path)