#### Install libs

In [None]:
#!pip install scikit-learn
#!pip install nltk
#!pip install sentence-transformers

In [None]:
#!pip install pandas
#!pip install numpy
#!pip install tensorflow

#!pip install transformers
#!pip install evaluate
#!pip install datasets


#### Imports

In [None]:
import pandas as pd
import numpy as np

# Hugging face imports
from datasets import load_dataset
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import create_optimizer
import evaluate


In [None]:
# Root of the FactCheckNLPApp/ checkout relative to this notebook (keep the trailing slash).
BASE_PATH = '../'

# Raw PUBHEALTH split files inside the cloned Health-Fact-Checking repository.
train_path, dev_path, test_path = (
    f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/{split}.tsv'
    for split in ('train', 'dev', 'test')
)


#### Download data

- This uses data from https://github.com/neemakot/Health-Fact-Checking.
- First, clone that repo into your local workspace so that it sits at `BASE_PATH` + `Health-Fact-Checking/`.
- Then run its `download_data.sh` script (see the cell below).

In [None]:
# !../Health-Fact-Checking/src/download_data.sh

#### Preprocess data and select topk sentences from main text

- This can take a few hours on a single GPU 
- Skip this step and download files directly if you don't want to change pre-processing steps

In [None]:
from fact_check_nlp.preprocessing import select_evidence_sentences_based_on_cosine_similarity, \
create_claim_sentence_pair

In [None]:
# Write "<split>_claim_sentence_pair.csv" next to each raw "<split>.tsv" file.
for split_name, split_tsv in (('train', train_path), ('dev', dev_path), ('test', test_path)):
    create_claim_sentence_pair(
        split_tsv,
        output_path=split_tsv.replace(f'{split_name}.tsv',
                                      f'{split_name}_claim_sentence_pair.csv'),
    )


#### Select top_k based on cosine similarity

In [None]:

# For every split, keep the k=5 selected sentences and write the result to
# "formatted_<split>_most_similar.csv" next to the raw "<split>.tsv" file.
for split_name, split_tsv in (('train', train_path), ('dev', dev_path), ('test', test_path)):
    select_evidence_sentences_based_on_cosine_similarity(
        split_tsv,
        k=5,
        output_path=split_tsv.replace(f'{split_name}.tsv',
                                      f'formatted_{split_name}_most_similar.csv'),
    )


### What does the sentence transformer do?

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once. The stray, indented duplicate of this
# statement was removed because it made the cell fail with an IndentationError.
sentence_transformer_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Toy claim used throughout the similarity demo.
claim = "Earth is flat."

# Candidate evidence sentences that will be ranked against the claim.
lst = [
    "The earth is round.",
    "Earth is not flat.",
    "Earth is a good planet.",
    "Earth is mostly round.",
]

# Single passage made of the sentences above, separated by one space each.
text = " ".join(lst)
text

In [None]:
from sentence_transformers import SentenceTransformer

# The demo ranks the individual sentences of `lst` against `claim`.
sentences = lst

model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
embeddings = model.encode(lst)
print(embeddings)

# The following cells read `claim_embedding`; bind it here so they work when the
# notebook is executed top to bottom. `claim_vec` is kept as an alias of the
# same array for any code that still uses the old name.
claim_embedding = model.encode(claim)
claim_vec = claim_embedding



In [None]:
claim

In [None]:
# Score every demo sentence against the claim. cosine_similarity() receives the
# two stacked vectors and np.linalg.norm() reduces its output to one number, so
# the stored value is the norm of the pairwise-similarity matrix rather than the
# raw claim/sentence cosine.
new_emb = {
    s: np.linalg.norm(cosine_similarity([claim_embedding, e]))
    for s, e in zip(sentences, embeddings)
}

new_emb

In [None]:
claim_embedding

In [None]:
from operator import itemgetter

# Number of best-scoring sentences to keep.
k = 3

# Split the passage back into sentences and embed them together with the claim.
sentences = list(sent_tokenize(text))

sentence_embeddings = sentence_transformer_model.encode(sentences)
claim_embedding = sentence_transformer_model.encode(claim)

# Per-sentence score: norm of the similarity matrix computed by
# cosine_similarity() for the stacked (claim, sentence) vectors.
cosine_similarity_emb = {}
for sentence, embedding in zip(sentences, sentence_embeddings):
    cosine_similarity_emb[sentence] = np.linalg.norm(cosine_similarity(
        [claim_embedding, embedding]))

# Rank once, after all scores are known (previously the full dict was re-sorted
# on every loop iteration and `top_k` was left undefined for empty input).
top_k = dict(sorted(cosine_similarity_emb.items(),
                    key=itemgetter(1), reverse=True)[:k])

In [None]:
cosine_similarity_emb

In [None]:
top_k

### Section 1 - Create Dataset with all facts and only True/False labels

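The cells below drop every row that has missing data and keep only records labelled True/False. The `subjects` column is lower-cased, but the health-subject filter is currently commented out, so records are **not** restricted to health topics.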

In [None]:
# Peek at the formatted test split with the `subjects` column lower-cased.
test_file = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_test.csv'
df = pd.read_csv(test_file).assign(
    subjects=lambda frame: frame['subjects'].str.lower()
)
df

In [None]:
# Input CSVs (one per split) that filter_df() will clean.
train_file, val_file, test_file = (
    f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_health_only_{split}_most_similar.csv'
    for split in ('train', 'dev', 'test')
)

def filter_df(file):
    """Keep fully populated true/false rows of a CSV and save them beside the input.

    Reads the CSV at ``file``, drops every row that contains a missing value,
    lower-cases the ``subjects`` column and keeps only rows whose ``label`` is
    one of ``true``/``false``/``True``/``False``. The result is written
    (without the index) to ``file`` with ``formatted`` replaced by
    ``formatted_health``, its row count is printed, and the frame is returned.

    NOTE: no filtering on ``subjects`` is performed; that column is only
    lower-cased.
    """
    complete_rows = pd.read_csv(file).dropna(how='any')
    complete_rows = complete_rows.assign(
        subjects=complete_rows['subjects'].str.lower()
    )

    accepted_labels = ['true', 'false', 'True', 'False']
    filtered = complete_rows[complete_rows['label'].isin(accepted_labels)]

    output_file = file.replace('formatted', 'formatted_health')
    filtered.to_csv(output_file, index=False)
    print(len(filtered))
    return filtered

# Filter each split in turn; afterwards `df` holds the filtered test split.
for split_file in (train_file, val_file, test_file):
    df = filter_df(split_file)


### Load dataset

In [None]:
# Filtered CSVs written by filter_df(), one per split.
train_path, dev_path, test_path = (
    f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_health_health_only_{split}_most_similar.csv'
    for split in ('train', 'dev', 'test')
)

# Each CSV is loaded as its own dataset object.
dataset, val_dataset, test_dataset = (
    load_dataset("csv", data_files=[split_csv])
    for split_csv in (train_path, dev_path, test_path)
)


### Section 2 - Train summary model

In [None]:
SUMMARY_MODEL_NAME = "gpt2"

In [None]:
# These classes are otherwise first imported in a later cell, so import them
# here to keep this cell runnable when the notebook is executed top to bottom.
from transformers import GPT2Tokenizer, GPT2PreTrainedModel

# NOTE(review): both names are rebound to the t5-small tokenizer/model in the
# next code cell, so these GPT-2 objects are not what gets trained below.
summary_tokenizer = GPT2Tokenizer.from_pretrained(SUMMARY_MODEL_NAME)
summary_model = GPT2PreTrainedModel.from_pretrained(SUMMARY_MODEL_NAME)

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2ForSequenceClassification

# Point the split paths at the formatted_<split>_most_similar.csv files.
# NOTE: this rebinds train_path/dev_path/test_path, replacing the values
# assigned in earlier cells.
train_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_train_most_similar.csv'
dev_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_dev_most_similar.csv'
test_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_test_most_similar.csv'


# Reload the three datasets from those files; `dataset`, `val_dataset` and
# `test_dataset` are rebound as well, and every later cell uses these objects.
dataset = load_dataset("csv", data_files=[train_path])
val_dataset = load_dataset("csv", data_files=[dev_path])
test_dataset = load_dataset("csv", data_files=[test_path])


# Checkpoint used for the summary model (overrides the earlier "gpt2" value).
SUMMARY_MODEL_NAME = "t5-small"

# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
summary_tokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL_NAME)
summary_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARY_MODEL_NAME)

# Collator handed to the Seq2SeqTrainer further down.
data_collator = DataCollatorForSeq2Seq(tokenizer=summary_tokenizer, model=summary_model)




In [None]:

def preprocess_function_summary(examples):
    """Tokenize one batch for the summarization model.

    Each ``top_k`` text is prefixed with ``"summarize: "`` and tokenized with
    truncation at 1024 tokens; each ``explanation`` is tokenized as the target
    with truncation at 128 tokens and its ids are stored under ``"labels"``.

    Returns the tokenizer output for the inputs with the extra ``labels`` key.
    """
    prompts = ["summarize: " + evidence for evidence in examples["top_k"]]
    encoded = summary_tokenizer(prompts, max_length=1024, truncation=True)

    targets = summary_tokenizer(
        text_target=examples["explanation"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = targets["input_ids"]
    return encoded


def _tokenize_for_summary(split_dataset):
    """Shuffle (seed 42), drop ``label``/``subjects`` and tokenize for summarization."""
    return (
        split_dataset
        .shuffle(seed=42)
        .remove_columns(["label", "subjects"])
        .map(preprocess_function_summary, batched=True)
    )


tokenized_train_sum = _tokenize_for_summary(dataset)
tokenized_val_sum = _tokenize_for_summary(val_dataset)
tokenized_test_sum = _tokenize_for_summary(test_dataset)



In [None]:
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics_summary(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE metric name plus ``"gen_len"`` to a value
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = summary_tokenizer.pad_token_id
    # -100 is a filler value, not a vocabulary id: swap it for the pad id in the
    # predictions as well as the labels so that decoding cannot fail on it and
    # so that it is not counted as generated content below.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = summary_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = summary_tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


# Training configuration for the summary model. Checkpoints go to a directory
# named after SUMMARY_MODEL_NAME; evaluation runs once per epoch and uses
# generate() (predict_with_generate=True) so that compute_metrics_summary
# receives generated token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"health_summary_model_true_false_{SUMMARY_MODEL_NAME}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=False
)

# The CSV loader exposes its rows under the "train" key, which is why the
# validation data is also read from ["train"].
trainer = Seq2SeqTrainer(
    model=summary_model,
    args=training_args,
    train_dataset=tokenized_train_sum["train"], #.select(list(np.arange(0, 100))),
    eval_dataset=tokenized_val_sum["train"], #.select(list(np.arange(0, 100))),
    tokenizer=summary_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_summary
)

# Fine-tune the summary model with the configuration above.
trainer.train()

In [None]:
preds = trainer.predict(tokenized_test_sum["train"])


In [None]:
preds.metrics

In [None]:
# Decode the reference ids of the first test example. The summary tokenizer
# must be used here: a plain `tokenizer` is not defined until the classifier
# section further down.
lbl = preds.label_ids[0]
lbl = np.where(lbl != -100, lbl, summary_tokenizer.pad_token_id)
# NOTE(review): `lbl` is a single 1-D id sequence, so batch_decode presumably
# returns one string per token; use summary_tokenizer.decode(lbl, ...) if one
# joined string is wanted.
summary_tokenizer.batch_decode(lbl, skip_special_tokens=True)

In [None]:
text = tokenized_train_sum['train'][1]['top_k']
text

In [None]:
explanation = tokenized_train_sum['train'][1]['explanation']
explanation

In [None]:
# ROUGE compares strings, so decode the predicted and reference token ids of
# the test run first (-100 fillers are replaced by the pad id before decoding).
test_pred_ids = np.where(preds.predictions != -100, preds.predictions, summary_tokenizer.pad_token_id)
test_label_ids = np.where(preds.label_ids != -100, preds.label_ids, summary_tokenizer.pad_token_id)
rouge.compute(
    predictions=summary_tokenizer.batch_decode(test_pred_ids, skip_special_tokens=True),
    references=summary_tokenizer.batch_decode(test_label_ids, skip_special_tokens=True),
    use_stemmer=True,
)

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model=summary_model, tokenizer=summary_tokenizer, max_length=90)

# Summarize every test example and print those whose ROUGE scores clear any of
# the thresholds. Loop-local names are used so that the `preds` object returned
# by trainer.predict() is not overwritten, and each row is fetched only once.
test_examples = tokenized_test_sum['train']
for example_idx in range(len(test_examples)):
    example = test_examples[example_idx]
    source_text = example['top_k']
    reference = [example['explanation']]
    generated = [summarizer(source_text)[0]['summary_text']]
    scores = rouge.compute(predictions=generated, references=reference, use_stemmer=True)
    if scores['rouge1'] > 0.50 or scores['rouge2'] > 0.40 or scores['rougeL'] > 0.35:
        print(scores)
        print(source_text)
        print(reference)
        print(generated)

### Section 3 - Train model for predicting if a claim is True/False

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Binary label scheme: class 0 = true, class 1 = everything else.
# A dict literal keeps only the last value for a repeated key, so the previous
# definition silently resolved id 1 to "unproven"; each id now maps to exactly
# one name.
id2label = {0: "true", 1: "false"}

# Both capitalisations are accepted because labels are looked up via str(value).
# "mixture" and "unproven" are folded into the negative class.
label2id = {
    "true": 0, "True": 0,
    "false": 1, "False": 1,
    "mixture": 1,
    "unproven": 1,
}

In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions against the reference labels.

    Args:
        eval_pred: ``(predictions, labels)`` where ``predictions`` holds one
            score per class along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the flattened inputs.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # preprocess_claim stores every label as a one-element list, so labels can
    # arrive with shape (N, 1); flatten to (N,) to line up with `predictions`.
    labels = np.asarray(labels).reshape(-1)
    return accuracy.compute(predictions=predictions, references=labels)

#### Load pre-trained model & train it only on claim texts, just to check that everything works

In [None]:
# `tokenized_train` is only created in the next code cell, so report the size
# of the dataset it is built from instead.
len(dataset["train"])

In [None]:
model_ckpt = "bert-base-cased"

#model_ckpt = "allenai/scibert_scivocab_uncased"

# Load the tokenizer once; previously it was re-created from the checkpoint for
# every batch that map() passed to preprocess_claim.
claim_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess_claim(examples):
    """Tokenize (claim, top_k) pairs and attach integer class labels.

    The claim is the first segment and ``top_k`` the second; only the second
    segment is truncated (``truncation="only_second"``). Each raw label is
    converted with ``label2id[str(label)]`` and stored as a one-element list
    under ``"label"``.

    Raises:
        KeyError: if a label's string form is not a key of ``label2id``.
    """
    inputs = claim_tokenizer(examples["claim"], examples["top_k"], truncation="only_second")
    inputs['label'] = [[label2id[str(val)]] for val in examples['label']]
    return inputs

def _tokenize_claims(split_dataset):
    """Shuffle (seed 42) and tokenize a split, dropping all of its original columns."""
    return split_dataset.shuffle(seed=42).map(
        preprocess_claim,
        batched=True,
        remove_columns=split_dataset["train"].column_names,
    )


tokenized_train = _tokenize_claims(dataset)
tokenized_val = _tokenize_claims(val_dataset)
tokenized_test = _tokenize_claims(test_dataset)


In [None]:
# Import libraries
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from transformers import default_data_collator
from torch import nn

mps_device = torch.device("mps")

# Tokenizer and sequence-classification model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# Define training parameters
batch_size = 16
args = TrainingArguments(
    output_dir="bert-base-cased-healthonly-true-false",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,  # Set num_train_epochs to 1 as test
    weight_decay=0.01,
)


class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Class 0 is weighted 0.15 and class 1 is weighted 0.85.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted cross-entropy loss.

        Args:
            model: model to run the forward pass with.
            inputs: batch mapping; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted so that Trainer versions which pass
                this keyword can call the override; it is not used here.
        """
        labels = inputs.get("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # compute custom loss; the weights are created on the same device as
        # the logits so the loss also works when the model is not on the CPU.
        class_weights = torch.tensor([0.15, 0.85], device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Build the weighted-loss trainer on the tokenized claim data and start training.
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train["train"],  # .select(list(np.arange(1, 1000))),
    eval_dataset=tokenized_val["train"],  # .select(list(np.arange(1, 100))),
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
import evaluate

# Run the classifier on the test rows.
preds = trainer.predict(tokenized_test["train"])

print(preds.predictions.shape, preds.label_ids.shape)

metric = evaluate.load("glue", "mrpc")
pred_lbl = np.argmax(preds.predictions, axis=-1)
# Labels were stored as one-element lists by preprocess_claim, so flatten them
# to one value per example before handing them to the metric.
true_lbl = preds.label_ids.reshape(-1)

metric.compute(predictions=pred_lbl, references=true_lbl)


In [None]:
from sklearn.metrics import classification_report

print(classification_report(preds.label_ids, pred_lbl, digits=4))

In [None]:
tokenized_test["train"][0]

In [None]:
# `inputs` is first created in the next cell; displaying it here would raise a
# NameError when the notebook is run top to bottom, so inspect it there.

In [None]:
# Hand-written claim / evidence pair for a quick manual check of the trained
# classifier. NOTE: this rebinds the notebook-level names `claim` and `top_k`.
claim = "Being inactive improves heart health."

top_k = "There is no evidence that being sedentary improves heart health. This claim is totally baseless. In face being inactive can serious issues as we age."

# Tokenize the pair the same way preprocess_claim does (claim first, evidence
# second, only the evidence truncated).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
inputs = tokenizer(claim, top_k, truncation="only_second")

# Attach class id 1 as the reference label for this example.
inputs['label'] = [1]

# A one-element list stands in for a dataset here.
# NOTE(review): presumably Trainer.predict accepts any indexable with a length;
# confirm against the installed transformers version.
trainer.predict([inputs])