#### Install libs

In [126]:
#!pip install pandas
#!pip install numpy
#!pip install tensorflow

#!pip install transformers
#!pip install evaluate
#!pip install datasets


#### Imports

In [124]:
import pandas as pd
import numpy as np

# Tensorflow imports
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Hugging face imports
from datasets import load_dataset
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import create_optimizer
import evaluate



In [None]:
# This should point to FactCheckNLPApp/
BASE_PATH = '../'

#### Download data

- This uses data from https://github.com/neemakot/Health-Fact-Checking.
- First clone this repo in your local workspace. 
- Then run download_data script

In [120]:
# !../Health-Fact-Checking/src/download_data.sh

#### Preprocess data and select topk sentences from main text

- This can take a few hours on a single GPU 
- Skip this step and download files directly if you don't want to change pre-processing steps

In [1]:
from fact_check_nlp.preprocessing import select_evidence_sentences_based_on_cosine_similarity

In [3]:
train_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/train.tsv'
dev_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/dev.tsv'
test_path = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/test.tsv'

select_evidence_sentences_based_on_cosine_similarity(train_path, k=5, 
                                output_path=train_path.replace('train.tsv', 'formatted_train.csv')
                               )

select_evidence_sentences_based_on_cosine_similarity(dev_path, k=5, 
                                output_path=dev_path.replace('dev.tsv', 'formatted_dev.csv')
                               )

select_evidence_sentences_based_on_cosine_similarity(test_path, k=5, 
                                output_path=test_path.replace('test.tsv', 'formatted_test.csv')
                               )


Writing to /Users/neeteshtiwari/Documents/FactCheckNLPApp/Health-Fact-Checking/data/PUBHEALTH/formatted_train.csv
Writing to /Users/neeteshtiwari/Documents/FactCheckNLPApp/Health-Fact-Checking/data/PUBHEALTH/formatted_dev.csv
Writing to /Users/neeteshtiwari/Documents/FactCheckNLPApp/Health-Fact-Checking/data/PUBHEALTH/formatted_test.csv


#### Training 

- Current this trains a hugging face dummy model just to see if everything works
- It basically only claim text and labels to call fit and then predict
- Results will not make sense at the moment

#### Load Dataset

In [66]:
train_file = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_train.csv' 
val_file = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_dev.csv' 
test_file = f'{BASE_PATH}Health-Fact-Checking/data/PUBHEALTH/formatted_test.csv' 


dataset = load_dataset("csv", data_files=[train_file])
val_dataset = load_dataset("csv", data_files=[val_file])
test_dataset = load_dataset("csv", data_files=[test_file])


Using custom data configuration default-a3d2975df6d9a990
Found cached dataset csv (/Users/neeteshtiwari/.cache/huggingface/datasets/csv/default-a3d2975df6d9a990/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-552ef83724876e0d
Found cached dataset csv (/Users/neeteshtiwari/.cache/huggingface/datasets/csv/default-552ef83724876e0d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-a675004b814f61b4
Found cached dataset csv (/Users/neeteshtiwari/.cache/huggingface/datasets/csv/default-a675004b814f61b4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

#### Tokenize Dataset

In [68]:
id2label = {0: "true", 1: "false", 2: "mixture", 3: "unproven"}
label2id = {"true": 0, "false": 2, "mixture":3, "unproven": 4}

def preprocess_function(examples, col1='claim', col2='top_k', labels=[]):    
    inputs = tokenizer(
        examples[col1],
        examples[col2],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    offset_mapping = inputs.pop("offset_mapping")
    inputs['label'] = examples['label']
    return inputs

def preprocess_claim(examples):
    inputs = tokenizer(examples["claim"], truncation=True)

    # print(np.unique(examples['label']))
    converted_lbls = []
    for val in examples['label']:
        converted_lbls.append(label2id.get(val, -2))

    # print(np.unique(converted_lbls))    
    inputs['label'] = converted_lbls
    return inputs

In [69]:

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_train = dataset.map(preprocess_claim, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = val_dataset.map(preprocess_claim, batched=True, remove_columns=val_dataset["train"].column_names)
tokenized_test = test_dataset.map(preprocess_claim, batched=True, remove_columns=test_dataset["train"].column_names)


  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [71]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

accuracy = evaluate.load("accuracy")

#### Load Pre-trained Model & Traing it only on Claim texts just to check if everything works

In [95]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["train"]['claim'], return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)

labels = []
for lbl in dataset["train"]['label']:
    labels.append(label2id.get(lbl, 6))
labels = np.array(labels)  # Label is already an array of 0 and 1

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5))

model.fit(tokenized_data, labels)

#### Predict on test

In [114]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier(test_dataset["train"]["claim"])


[{'label': 'LABEL_0', 'score': 0.7489774823188782},
 {'label': 'LABEL_0', 'score': 0.7489068508148193},
 {'label': 'LABEL_0', 'score': 0.7489771246910095},
 {'label': 'LABEL_0', 'score': 0.7485695481300354},
 {'label': 'LABEL_0', 'score': 0.749027669429779},
 {'label': 'LABEL_0', 'score': 0.7485504746437073},
 {'label': 'LABEL_0', 'score': 0.7491540908813477},
 {'label': 'LABEL_0', 'score': 0.7491301894187927},
 {'label': 'LABEL_0', 'score': 0.7490165829658508},
 {'label': 'LABEL_0', 'score': 0.7491080164909363}]