In [None]:
# !pip install transformers==4.28.0 datasets evaluate rouge_score
# !pip install -U sentence-transformers 
# !pip install bert-extractive-summarizer

In [None]:
from datasets import load_dataset, Dataset
from summarizer.sbert import SBertSummarizer
import transformers
from evaluate import load
from huggingface_hub import notebook_login
from transformers import AutoTokenizer,pipeline
from transformers import Trainer as HFTrainer
from transformers import TrainingArguments
from transformers import AutoModel, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,Seq2SeqTrainer
from transformers import RobertaTokenizer, RobertaForSequenceClassification,AutoModelForSequenceClassification

import torch
import os
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")
import process_data
import evaluate_model
import inference
import train
import  importlib
importlib.reload(process_data)
importlib.reload(train)
importlib.reload(inference)
importlib.reload(evaluate_model)
from process_data import processData
from train import Trainer
from inference import *
from evaluate_model import Evaluate

In [None]:
# Folder that holds the CSV files; it is handed to processData and joined with
# file names via os.path.join elsewhere in this notebook.
data_folder = 'data'

# Settings shared by all experiments. The per-model dictionaries below copy
# this mapping and override only the entries that differ, so key order and
# every untouched value are the same in all three.
_COMMON_PARAMS = {
    'no_of_train': 649,
    'no_of_test': 50,
    'max_input_length': 1024,
    'max_target_length': 128,
    'max_new_length': 1024,
    'min_length': 200,
    'eval_strategy': 'epoch',
    'batch_size': 8,
    'gradient_accumulation_steps': 1,
    'lr': 2e-5,
    'decay': 0.01,
    'epochs': 1,
    'metric': 'rouge',
    'fp16': True,
    'load_best_model': False,
    'save_strategy': 'epoch',
    'save_total_limit': 2,
}

# Used for the extractive step and the T5 runs.
PARAMS = dict(_COMMON_PARAMS)

# Used for the FLAN-T5 runs.
PARAMS_FLAN = {
    **_COMMON_PARAMS,
    'batch_size': 4,
    'lr': 1e-5,
    'epochs': 10,
    'fp16': False,
    'save_strategy': 'no',
}

# Used for the DistilBART fine-tuning run.
PARAMS_BART = {
    **_COMMON_PARAMS,
    'batch_size': 1,
    'gradient_accumulation_steps': 4,
    'lr': 1e-5,
    'epochs': 10,
    'save_strategy': 'no',
}

In [None]:
# Gate for the interactive Hugging Face Hub login. No other visible cell reads
# this flag.
push_to_hub = False
if push_to_hub:
    from huggingface_hub import notebook_login
    notebook_login()
# Print the installed transformers version so the run is traceable.
print(transformers.__version__)

In [None]:
# Read the SciDCC CSV through the project's processData helper, split it into
# train/test using the sizes from PARAMS, and write both splits back to disk.
pD = processData(data_folder)
# NOTE(review): test_name = 'NA' looks like a "no separate test file" sentinel — confirm in process_data.
dataset = pD.read_data_huggingface('scidcc_climate.csv', test_name = 'NA')
dataset = pD.train_test_split(dataset,no_of_train=PARAMS['no_of_train'], no_of_test = PARAMS['no_of_test'])
pD.save_dataset(dataset['train'],'train.csv')
pD.save_dataset(dataset['test'],'test.csv')
# Reference texts for evaluation: the 'Saved' column of every test row.
labels = [sample['Saved'] for sample in dataset['test']]


## Extractive Summarization

In [None]:
# Build extractive summaries with the 'paraphrase-MiniLM-L6-v2' model, once
# with test = False and once with test = True.
# NOTE(review): presumably `test` selects which split GetSummary reads — confirm in inference.py.
ExTrain = GetSummary(dataset,'extractive', model_name = 'paraphrase-MiniLM-L6-v2', test = False)
get_train_summary = ExTrain.get_summary(PARAMS)
ExTest = GetSummary(dataset,'extractive', model_name = 'paraphrase-MiniLM-L6-v2', test = True)
get_test_summary = ExTest.get_summary(PARAMS)

In [25]:
# Score the extractive test summaries against the reference texts with ROUGE.
metric = load('rouge')
# Put every sentence on its own line before scoring (presumably for the
# sentence-level ROUGE-Lsum variant — see the ROUGE metric card).
sent_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in get_test_summary]
sent_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in labels]
scores = metric.compute(predictions=sent_preds, references=sent_labels)
# Plain-dict copy of the scores; shown as the cell output so the numbers are
# actually visible (the cell previously computed them without displaying them).
result = dict(scores)
result

## Abstractive Summarization

## Model Name: T5

### a. Without Finetuning

In [None]:
# Summarise the test split with the off-the-shelf 't5-small' checkpoint and
# score the result against `labels` with the project's Evaluate helper.
name = 't5-small'
t5Pre = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name,test = True)
t5_pretrained_summary = t5Pre.get_summary(PARAMS)
evaluate = Evaluate('rouge', name)
evaluate.compute_metrics(t5_pretrained_summary,labels)

### b. With Finetuning

In [None]:
#finetune
name = 't5-small'
final_name =  't5small-finetuned-scidcc'
myTrainer = Trainer(dataset,name,name, PARAMS,final_name)
t5_finetuned_summary, labels = myTrainer.run_trainer(save_model = False)
evaluate = Evaluate('rouge', final_name)
evaluate.compute_metrics(t5_finetuned_summary,labels)

In [None]:
# Background on short T5 summaries and on fine-tuning T5 for summarisation:
# https://discuss.huggingface.co/t/t5-generates-very-short-summaries/277/21
# https://towardsdatascience.com/fine-tuning-a-t5-transformer-for-any-summarization-task-82334c64c81
# Regenerate test summaries from the local fine-tuned T5 checkpoint.
# NOTE(review): the fine-tuning cell calls run_trainer(save_model = False);
# confirm that 't5small-finetuned-scidcc' actually exists on disk before this runs.
name = 't5small-finetuned-scidcc'
t5Fin = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name ,test = True, finetuned = True)
t5_finetuned_summary = t5Fin.get_summary(PARAMS)


## Model Name: FLAN-T5 Small

### a. Without Finetuning

In [None]:
# Summarise the test split with the off-the-shelf FLAN-T5 small checkpoint.
name = "google/flan-t5-small"
flanPre = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name ,test = True)
flan_pretrained_summary = flanPre.get_summary(PARAMS_FLAN)


In [None]:
# Score the pretrained FLAN-T5 summaries. `name` comes from the previous cell;
# `labels` is whatever the most recent cell bound it to (the T5 fine-tuning
# cell rebinds it when the notebook is run top to bottom).
evaluate = Evaluate('rouge', name)
evaluate.compute_metrics(flan_pretrained_summary,labels)

In [None]:
# Append the pretrained FLAN-T5 summaries as a 'flan-t5' column and save the
# result as 'flan-t5-pre.csv'.
# NOTE(review): no visible cell writes 'test_t5.csv'; confirm where it comes from.
# This also rebinds `dataset` from the train/test mapping to a single split.
dataset = pD.read_data_huggingface('test_t5.csv')['train']
dataset = flanPre.add_column(flan_pretrained_summary,'flan-t5',dataset)
pD.save_dataset(dataset,'flan-t5-pre.csv')

### b. With Finetuning

In [None]:
#finetune
pD = processData(data_folder)
dataset = pD.read_data_huggingface('train_extractive.csv', 'test_extractive.csv')
name = "google/flan-t5-small"
final_name =  'flan-t5-small-finetuned'
myTrainer = Trainer(dataset,name,name, PARAMS_FLAN,final_name)
flant5_finetuned_summary, labels = myTrainer.run_trainer(save_model = True)
evaluate = Evaluate('rouge', final_name)
evaluate.compute_metrics(flant5_finetuned_summary,labels)

In [None]:
# Inference with max_new_length
pD = processData(data_folder)
dataset = pD.read_data_huggingface('train_extractive.csv', 'test_extractive.csv')
name = "flan-t5-small-finetuned"
flanFin = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name ,test = True)
flant5_finetuned_summary = flanFin.get_summary(PARAMS_FLAN)

In [None]:
# Append the fine-tuned FLAN-T5 summaries as a new column and save the table
# as 'flan-t5-fin.csv'.
dataset = pD.read_data_huggingface('flan-t5-pre.csv')['train']
dataset =   dataset.add_column('flan-t5-finetuned', flant5_finetuned_summary)
pD.save_dataset(dataset,'flan-t5-fin.csv')

## Model Name: DistilBART-CNN-12-6
### a. Without Finetuning

In [None]:
# Summarise the extractive test CSV with the off-the-shelf DistilBART
# checkpoint and score the result against `labels`.
pD = processData(data_folder)
dataset = pD.read_data_huggingface(test_name = 'test_extractive.csv')
name = 'sshleifer/distilbart-cnn-12-6'
distilbartPre = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name,test = True)
# NOTE(review): PARAMS_FLAN is used here although PARAMS_BART exists — confirm
# which settings get_summary reads before switching.
distilbart_pretrained_summary = distilbartPre.get_summary(PARAMS_FLAN)
evaluate = Evaluate('rouge', name)
evaluate.compute_metrics(distilbart_pretrained_summary,labels)

In [None]:
# Append the pretrained DistilBART summaries as a 'distilbart' column and save
# the table as 'v3.csv'.
# NOTE(review): no visible cell writes 'flan-t5-fin-v2.csv' (the FLAN cell
# saves 'flan-t5-fin.csv') — confirm which file is intended.
dataset = pD.read_data_huggingface('flan-t5-fin-v2.csv')['train']
dataset = distilbartPre.add_column(distilbart_pretrained_summary,'distilbart',dataset)
pD.save_dataset(dataset,'v3.csv')

### b. With Finetuning

In [None]:
#finetune changed lr to 1e-5 from 5e-5 in FLAN 
torch.cuda.empty_cache()
pD = processData(data_folder)
dataset = pD.read_data_huggingface('train_extractive.csv', 'test_extractive.csv')
name ='sshleifer/distilbart-cnn-12-6'
final_name =  'distilbart-finetuned'
myTrainer = Trainer(dataset,name,name, PARAMS_BART,final_name)
distilbart_finetuned_summary, labels = myTrainer.run_trainer(save_model = True)
evaluate = Evaluate('rouge', final_name)
evaluate.compute_metrics(distilbart_finetuned_summary,labels)

In [None]:
# Inference with max_new_length
pD = processData(data_folder)
dataset = pD.read_data_huggingface('train_extractive.csv', 'test_extractive.csv')
name =  'distilbart-finetuned'
distilbartFin = GetSummary(dataset,'abstractive', model_name = name, tokenizer_name = name ,test = True)
distilbart_finetuned_summary = distilbartFin.get_summary(PARAMS_BART)

In [None]:
# Spot-check one generated summary (index 6) as the cell output.
distilbart_finetuned_summary[6]

In [None]:
# Append the fine-tuned DistilBART summaries to 'v3.csv' and save the result
# as 'v4.csv'.
dataset = pD.read_data_huggingface(train_name = 'v3.csv')['train']
dataset =   dataset.add_column('distilbart-finetuned', distilbart_finetuned_summary)
pD.save_dataset(dataset,'v4.csv')

## Fact Verification

###  1. RoBERTa Trained on FEVER

In [None]:
# Load the 'Dzeniks/roberta-fact-check' tokenizer and classifier. Both names
# are rebound by later sections, so cell order matters.
tokenizer = RobertaTokenizer.from_pretrained('Dzeniks/roberta-fact-check')
model = RobertaForSequenceClassification.from_pretrained('Dzeniks/roberta-fact-check')

In [None]:
from tqdm import tqdm
# Table of generated summaries that get_fact_labels reads through the global
# `dataset['test']`.
# NOTE(review): 'v5.csv' is only written by a later cell (from 'v4.csv' plus
# the labels computed with this data), so a fresh top-to-bottom run has no
# 'v5.csv' yet — confirm whether 'v4.csv' should be read here instead.
dataset = load_dataset("csv", data_files={"test": os.path.join(data_folder,"v5.csv")})

def get_fact_labels(model, tokenizer, claim_column, data=None, max_claims=4):
    """Score how well each generated summary is supported by its evidence.

    The text in ``claim_column`` is split into sentences with
    ``nltk.sent_tokenize``; the first ``max_claims`` sentences are each paired
    with the row's ``'Extractive'`` text and classified by ``model``. A
    sentence contributes ``1 - argmax(logits)``, i.e. 1 when class 0 is
    predicted and 0 when class 1 is predicted (assumes a two-class model whose
    class 0 means "supported" — TODO confirm against the model card).

    Args:
        model: classifier whose first output holds the logits.
        tokenizer: tokenizer exposing ``encode_plus``.
        claim_column: name of the column holding the summary to verify.
        data: rows to score (indexable, each row a mapping). When omitted the
            notebook-level ``dataset['test']`` is used, as before.
        max_claims: maximum number of sentences scored per summary. The
            default of 4 matches the previous hard-coded limit.

    Returns:
        One float per row: the mean contribution of the scored sentences.
        Rows whose summary is missing or contains no sentence score 0.0
        instead of raising ``ZeroDivisionError`` / ``TypeError``.
    """
    rows = dataset['test'] if data is None else data
    model.eval()  # inference mode; set once rather than once per sentence
    final_labels = []
    for i in tqdm(range(len(rows))):
        sample = rows[i]
        # Empty CSV cells may come back as None; treat them as empty text.
        claims = nltk.sent_tokenize(sample[claim_column] or "")[:max_claims]
        if not claims:
            final_labels.append(0.0)
            continue
        evidence = sample['Extractive']
        label = 0
        for claim in claims:
            # Truncate over-long claim/evidence pairs, mirroring the
            # ClimateBERT scorer in this notebook.
            x = tokenizer.encode_plus(claim, evidence, return_tensors="pt",
                                      truncation=True, max_length=512)
            with torch.no_grad():
                prediction = model(**x)
            label += 1 - torch.argmax(prediction[0]).item()
        final_labels.append(label / len(claims))
    return final_labels

In [None]:
# Score the 't5-pretrained' column and show the mean over all rows.
# NOTE(review): no visible cell adds a 't5-pretrained' column — confirm it exists in the CSV.
t5_facts = get_fact_labels(model, tokenizer,'t5-pretrained')
sum(t5_facts)/len(t5_facts)

In [None]:
# Score the fine-tuned DistilBART summaries and show the mean over all rows.
distilbart_facts = get_fact_labels(model, tokenizer,'distilbart-finetuned')
sum(distilbart_facts)/len(distilbart_facts)

In [None]:
# Append the per-row fact scores to 'v4.csv' and save the result as 'v5.csv'.
# This rebinds the global `dataset` that get_fact_labels reads by default.
dataset = pD.read_data_huggingface(train_name = 'v4.csv')['train']
dataset =   dataset.add_column('distilbart-finetuned-labels', distilbart_facts)
pD.save_dataset(dataset,'v5.csv')


### 2. RoBERTa trained on ClimateFEVER

In [None]:
# Start from the plain 'roberta-base' checkpoint with a two-label
# classification head; this replaces the FEVER model bound to `model` and
# `tokenizer` in the previous section.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


In [None]:
# Load the pre-processed ClimateFEVER CSV and split it 725 / 182.
# This rebinds the global `dataset` to the ClimateFEVER data.
pD = processData(data_folder)
# pD.process_climfever() #processes and saves the climfever dataset to run only once
dataset = pD.read_data_huggingface(train_name = 'climate-fever-processed.csv')
dataset = pD.train_test_split(dataset,no_of_train=725, no_of_test = 182)

In [None]:
def preprocess_function(batch):
    """Tokenise a batch of claim/evidence pairs with the notebook-level tokenizer.

    Reads the ``"claim"`` and ``"evidence"`` entries of ``batch`` and returns
    whatever the tokenizer produces, with truncation enabled and padding set
    to ``"max_length"``.
    """
    claims = batch["claim"]
    evidences = batch["evidence"]
    encoded = tokenizer(claims, evidences, truncation=True, padding="max_length")
    return encoded

def compute_metrics(eval_pred):
    """Compute F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last axis of ``logits``; predictions and labels are
    fed to a freshly loaded ``"f1"`` metric and its result is returned.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    f1_metric = load("f1")
    f1_metric.add_batch(predictions=predicted_classes, references=references)
    return f1_metric.compute()

# Tokenise every split of the ClimateFEVER data in batches.
tokenized_data = dataset.map(preprocess_function, batched=True)

# Evaluate after every epoch; save_strategy="no" disables intermediate checkpoints.
training_args = TrainingArguments(
    output_dir="roberta-climfever",  # output directory
    num_train_epochs=5,  # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
#     warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    learning_rate=2e-5,  # learning rate
    save_total_limit=2,  # limit the total amount of checkpoints, delete the older checkpoints
#     logging_dir="./logs",  # directory for storing logs
#     logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="no",
)
 
# HFTrainer is transformers.Trainer, imported under an alias because the
# project's own `Trainer` class is imported from train.py.
climfeverTrainer = HFTrainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=tokenized_data["train"],  # training dataset
    eval_dataset=tokenized_data["test"],  # evaluation dataset
    compute_metrics=compute_metrics,  # the callback that computes metrics of interest
)
climfeverTrainer.train()

In [None]:
# Persist the fine-tuned classifier and its tokenizer side by side so they can
# be reloaded from the "roberta-climfever" directory.
# `from_pt=True` was dropped: it is a loading option of from_pretrained, not a
# save option, and had no effect here.
model.save_pretrained("roberta-climfever")
tokenizer.save_pretrained("roberta-climfever")

In [None]:
# Reload the fine-tuned classifier from the "roberta-climfever" directory.
tokenizer = AutoTokenizer.from_pretrained("roberta-climfever")
model = AutoModelForSequenceClassification.from_pretrained("roberta-climfever", num_labels=2)
# get_fact_labels reads the global `dataset['test']`. On a top-to-bottom run
# that name still holds the ClimateFEVER split used for training, which has no
# 'distilbart-finetuned' column, so point it back at the summaries table first.
dataset = load_dataset("csv", data_files={"test": os.path.join(data_folder,"v5.csv")})
roberta_climfever_facts = get_fact_labels(model, tokenizer,'distilbart-finetuned')

In [None]:
# Mean support score over all scored rows. The divisor is the actual row
# count, matching the other mean cells, rather than a hard-coded 50.
sum(roberta_climfever_facts)/len(roberta_climfever_facts)

In [None]:
# Append the ClimateFEVER-RoBERTa scores to 'v5.csv' and save the result as 'v6.csv'.
pD = processData(data_folder)
dataset = pD.read_data_huggingface(train_name = 'v5.csv')['train']
dataset =   dataset.add_column('distilbart-finetuned-labels-roberta-climfever', roberta_climfever_facts)
pD.save_dataset(dataset,'v6.csv')

### 3. ClimateBERT trained on ClimateFEVER


In [None]:
# Load the 'amandakonet/climatebert-fact-checking' classifier and tokenizer,
# replacing the RoBERTa objects bound to `model` and `tokenizer`.
model = AutoModelForSequenceClassification.from_pretrained("amandakonet/climatebert-fact-checking")
tokenizer = AutoTokenizer.from_pretrained("amandakonet/climatebert-fact-checking")
from tqdm import tqdm
# Summaries table read by get_fact_labels_climfever via the global `dataset['test']`.
dataset = load_dataset("csv", data_files={"test": os.path.join(data_folder,"v5.csv")})
def get_fact_labels_climfever(model, tokenizer, claim_column, data=None, max_claims=4):
    """Score summaries against their evidence with a three-class fact checker.

    The text in ``claim_column`` is split into sentences with
    ``nltk.sent_tokenize``; the first ``max_claims`` sentences are each paired
    with the row's ``'Extractive'`` text and classified by ``model``. The
    original notebook lists the class order as
    ``['support', 'refute', 'neutral']``. When class 2 wins, the decision
    falls back to whichever of classes 0 and 1 has the larger logit. A
    sentence then contributes ``1 - label``: 1 for class 0, 0 for class 1.

    Args:
        model: classifier whose output exposes ``.logits``.
        tokenizer: callable tokenizer accepting a text pair.
        claim_column: name of the column holding the summary to verify.
        data: rows to score (indexable, each row a mapping). When omitted the
            notebook-level ``dataset['test']`` is used, as before.
        max_claims: maximum number of sentences scored per summary. The
            default of 4 matches the previous hard-coded limit.

    Returns:
        One float per row: the mean contribution of the scored sentences.
        Rows whose summary is missing or contains no sentence score 0.0
        instead of raising ``ZeroDivisionError`` / ``TypeError``.
    """
    rows = dataset['test'] if data is None else data
    model.eval()  # inference mode; set once rather than once per sentence
    final_labels = []
    for i in tqdm(range(len(rows))):
        sample = rows[i]
        # Empty CSV cells may come back as None; treat them as empty text.
        claims = nltk.sent_tokenize(sample[claim_column] or "")[:max_claims]
        if not claims:
            final_labels.append(0.0)
            continue
        evidence = sample['Extractive']
        label_count = 0
        for claim in claims:
            x = tokenizer(claim, evidence, return_tensors="pt",
                          padding='max_length', truncation=True, max_length=512)
            with torch.no_grad():
                scores = model(**x).logits
            label = scores.argmax(dim=1).item()
            if label == 2:
                # Class 2 predicted: decide between classes 0 and 1 only.
                label = scores[0][:2].argmax().item()
            label_count += 1 - label
        final_labels.append(label_count / len(claims))
    return final_labels


# Score the fine-tuned DistilBART summaries with the ClimateBERT fact checker.
climbert_facts = get_fact_labels_climfever(model, tokenizer,'distilbart-finetuned')

In [None]:
# Mean ClimateBERT support score over all scored rows.
sum(climbert_facts)/len(climbert_facts)

In [None]:
# Append the ClimateBERT scores to 'v5.csv' and write the table back to 'v5.csv'.
# NOTE(review): this overwrites its own input, so the cell is not re-runnable
# as-is, and the RoBERTa-ClimateFEVER scores live only in 'v6.csv' — confirm
# whether this should read 'v6.csv' and write a new file instead.
pD = processData(data_folder)
dataset = pD.read_data_huggingface(train_name = 'v5.csv')['train']
dataset =   dataset.add_column('distilbart-finetuned-labels-climbert', climbert_facts)
pD.save_dataset(dataset,'v5.csv')
