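## no_doc — RoBERTa hallucination classifier fine-tuned on the `text` field only (RAGTruth train/dev/test splits)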

In [18]:
import json

def _load_split(path):
    """Read one dataset split from a JSON file and return the parsed content."""
    # JSON is exchanged as UTF-8; pinning the encoding keeps the load from
    # depending on the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


train_data = _load_split("dataset/rag_truth_train.json")
dev_data = _load_split("dataset/rag_truth_dev.json")
test_data = _load_split("dataset/rag_truth_test.json")

In [19]:
def add_prefix(data):
    """Prepend the hallucination-judgement instruction to every record's "text".

    Args:
        data: list of dicts, each holding a string under the "text" key.

    Returns:
        The same list object that was passed in; its records are updated in place.

    Records whose text already starts with the instruction are left untouched,
    so re-running the calling cell on already-prefixed data does not stack the
    instruction a second time.
    """
    prefix = "Please judge the following statement whether it includes hallucination or not: "
    for d in data:
        # Guard makes the in-place update idempotent across cell re-runs.
        if not d["text"].startswith(prefix):
            d["text"] = prefix + d["text"]
    return data


# Apply the instruction prefix to all three splits, rebinding each name to the result.
train_data, dev_data, test_data = [
    add_prefix(split) for split in (train_data, dev_data, test_data)
]

In [16]:
# Optional cell: run it only when training should be restricted to one task.
# Values seen in "task_type": QA, Data2txt, Summary
# Note: test_data is narrowed as well, and create_dev_task() filters test_data,
# so after running this cell only the selected task can be evaluated per task.
task_name = "Summary"
train_data, dev_data, test_data = (
    [row for row in split if row["task_type"] == task_name]
    for split in (train_data, dev_data, test_data)
)

In [20]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Each split goes: list of records -> pandas DataFrame -> datasets.Dataset.
train_df, dev_df, test_df = (
    pd.DataFrame(records) for records in (train_data, dev_data, test_data)
)
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Bundle the three splits under the keys the later cells index by.
raw_datasets = DatasetDict(
    {
        "train": train_ds,
        "dev": dev_ds,
        "test": test_ds,
    }
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 13830
    })
    dev: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 2700
    })
})

In [21]:
from transformers import AutoTokenizer, DataCollatorWithPadding


# Load the tokenizer published under the "FacebookAI/roberta-base" checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level tokenizer.

    Inputs longer than 512 tokens are truncated. The tokenizer's output is
    returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=512, truncation=True)
    return encoded


# Collator built from the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batched mode, then drop the raw "text" column.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
).remove_columns(["text"])

Map: 100%|██████████| 13830/13830 [00:01<00:00, 11554.91 examples/s]
Map: 100%|██████████| 1260/1260 [00:00<00:00, 11916.25 examples/s]
Map: 100%|██████████| 2700/2700 [00:00<00:00, 12444.99 examples/s]


In [None]:
from transformers import AutoModel

# Use the checkpoint id spelled exactly as in the tokenizer cell
# ("FacebookAI/roberta-base") so model and tokenizer are guaranteed to refer
# to the same repository rather than relying on a differently-cased alias.
base_model = AutoModel.from_pretrained("FacebookAI/roberta-base")


In [23]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
device

device(type='cuda')

In [24]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score


def compute_metrics(eval_preds):
    """Turn the Trainer's evaluation output into classification metrics.

    Args:
        eval_preds: pair that unpacks into (model outputs, labels). Only the
            first element of the model outputs is used — presumably the class
            scores returned by NoDocModel; verify against its forward().

    Returns:
        dict with "accuracy", "recall", "precision" and "f1", computed by the
        scikit-learn scorers with their default arguments.
    """
    model_outputs, gold = eval_preds
    class_scores = model_outputs[0]
    # Predicted class = index of the largest score along the last axis.
    y_pred = np.argmax(class_scores, axis=-1).tolist()
    y_true = gold.tolist()

    scorers = (
        ("accuracy", accuracy_score),
        ("recall", recall_score),
        ("precision", precision_score),
        ("f1", f1_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}

In [None]:
from transformers import TrainingArguments, Trainer
import torch
from models_rob import NoDocModel

# Training configuration consumed by the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",
    logging_strategy="epoch",
    # Checkpoint every 10000 optimizer steps. The recorded run in this notebook
    # finished at global_step=2880, so with these settings no intermediate
    # checkpoint is written during training.
    save_strategy="steps",
    save_steps=10000,
    learning_rate=5e-6,
    # 4 samples per device x 12 accumulation steps = 48 samples per optimizer
    # step on each device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=12,
    logging_dir="./logs",
    report_to="tensorboard",
)

# Wrap the pretrained encoder in NoDocModel (imported from models_rob).
model = NoDocModel(base_model)

# Trains on the "train" split; "dev" is the default evaluation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
# Evaluate on the Trainer's default eval_dataset (the "dev" split) before
# trainer.train() has been called, as a pre-training reference point.
trainer.evaluate()

{'eval_loss': 0.6966366767883301,
 'eval_model_preparation_time': 0.0024,
 'eval_accuracy': 0.4238095238095238,
 'eval_recall': 0.9981308411214953,
 'eval_precision': 0.42414614773629866,
 'eval_f1': 0.5953177257525084,
 'eval_runtime': 2.4374,
 'eval_samples_per_second': 516.951,
 'eval_steps_per_second': 129.238}

In [27]:
# Run the training loop configured by training_args; metrics on the "dev"
# split are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Recall,Precision,F1
1,0.5839,0.556482,0.0024,0.707937,0.719626,0.638474,0.676626
2,0.5217,0.502189,0.0024,0.766667,0.637383,0.773243,0.69877
3,0.5029,0.526865,0.0024,0.745238,0.725234,0.690391,0.707384
4,0.4814,0.590561,0.0024,0.668254,0.831776,0.575679,0.680428
5,0.4626,0.541642,0.0024,0.71746,0.770093,0.63876,0.698305
6,0.4465,0.598241,0.0024,0.688889,0.831776,0.595716,0.694228
7,0.4273,0.646027,0.0024,0.663492,0.859813,0.568603,0.684524
8,0.4105,0.594088,0.0024,0.703968,0.801869,0.616379,0.696994
9,0.3915,0.679329,0.0024,0.664286,0.857944,0.569479,0.684564


TrainOutput(global_step=2880, training_loss=0.46301840941111244, metrics={'train_runtime': 768.2196, 'train_samples_per_second': 180.027, 'train_steps_per_second': 3.749, 'total_flos': 0.0, 'train_loss': 0.46301840941111244, 'epoch': 9.96818970503181})

In [28]:
# Score the trained model on the "test" split instead of the default "dev" split.
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

{'eval_loss': 0.7494223713874817,
 'eval_model_preparation_time': 0.0024,
 'eval_accuracy': 0.6511111111111111,
 'eval_recall': 0.8568398727465536,
 'eval_precision': 0.5003095975232198,
 'eval_f1': 0.6317435496481626,
 'eval_runtime': 5.2273,
 'eval_samples_per_second': 516.515,
 'eval_steps_per_second': 129.129,
 'epoch': 9.96818970503181}

In [29]:
# evaluate on each task
def create_dev_task(name, data=None):
    """Build a tokenized evaluation set containing a single task type.

    Despite the "dev" in the name, rows are taken from the notebook-level
    ``test_data`` unless another list is supplied through ``data``.

    Args:
        name: value matched against each record's "task_type".
        data: optional list of record dicts to filter; defaults to ``test_data``.

    Returns:
        A ``datasets.Dataset`` tokenized with ``tokenize_function``, with the
        raw "text" column removed.

    Raises:
        ValueError: if no record has the requested task type (for example
            because the optional task-filter cell already narrowed
            ``test_data`` to a different task).
    """
    records = test_data if data is None else data
    task_rows = [d for d in records if d["task_type"] == name]
    if not task_rows:
        # Fail early with a readable message instead of building a dataset
        # that has no rows and no "text" column to tokenize or remove.
        raise ValueError(f"No records with task_type == {name!r} to evaluate on.")

    task_ds = Dataset.from_pandas(pd.DataFrame(task_rows))
    task_ds = task_ds.map(tokenize_function, batched=True)
    return task_ds.remove_columns(["text"])

In [30]:
# QA-only evaluation. create_dev_task() filters test_data, so despite the
# `dev_` variable name this scores the QA portion of the test split.
dev_qa = create_dev_task("QA")
trainer.evaluate(eval_dataset=dev_qa)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map: 100%|██████████| 900/900 [00:00<00:00, 8884.67 examples/s]


{'eval_loss': 0.8223901391029358,
 'eval_model_preparation_time': 0.0024,
 'eval_accuracy': 0.6244444444444445,
 'eval_recall': 0.8125,
 'eval_precision': 0.2968036529680365,
 'eval_f1': 0.43478260869565216,
 'eval_runtime': 2.2346,
 'eval_samples_per_second': 402.762,
 'eval_steps_per_second': 100.691,
 'epoch': 9.96818970503181}

In [31]:
# Data2txt-only evaluation. create_dev_task() filters test_data, so despite the
# `dev_` variable name this scores the Data2txt portion of the test split.
dev_d2t = create_dev_task("Data2txt")
trainer.evaluate(eval_dataset=dev_d2t)

Map: 100%|██████████| 900/900 [00:00<00:00, 10057.10 examples/s]


{'eval_loss': 0.6488045454025269,
 'eval_model_preparation_time': 0.0024,
 'eval_accuracy': 0.7544444444444445,
 'eval_recall': 0.9499136442141624,
 'eval_precision': 0.7412398921832885,
 'eval_f1': 0.8327024981074943,
 'eval_runtime': 2.1812,
 'eval_samples_per_second': 412.626,
 'eval_steps_per_second': 103.157,
 'epoch': 9.96818970503181}

In [32]:
# Summary-only evaluation. create_dev_task() filters test_data, so despite the
# `dev_` variable name this scores the Summary portion of the test split.
dev_sum = create_dev_task("Summary")
trainer.evaluate(eval_dataset=dev_sum)

Map: 100%|██████████| 900/900 [00:00<00:00, 13057.60 examples/s]


{'eval_loss': 0.7770723700523376,
 'eval_model_preparation_time': 0.0024,
 'eval_accuracy': 0.5744444444444444,
 'eval_recall': 0.6274509803921569,
 'eval_precision': 0.2942528735632184,
 'eval_f1': 0.40062597809076683,
 'eval_runtime': 2.2557,
 'eval_samples_per_second': 398.995,
 'eval_steps_per_second': 99.749,
 'epoch': 9.96818970503181}

In [33]:
# Persist the fine-tuned model under the directory held in `name`.
name = "./trained/no_doc_rob"
# Trainer-side save of the model into `name`.
trainer.save_model(name)
# Called without a path — presumably writes the trainer state under
# training_args.output_dir ("./results") rather than `name`; TODO confirm.
trainer.save_state()
# Additionally calls the model's own save_pretrained() on the same directory.
# NOTE(review): this targets the same path as trainer.save_model(name) above;
# check whether both saves are needed or one overwrites the other.
model.save_pretrained(name)