## triplet

In [1]:
import json

import json

with open("dataset/rag_truth_train.json", "r") as f:
    train_data = json.load(f)
with open("dataset/rag_truth_dev.json", "r") as f:
    dev_data = json.load(f)
with open("dataset/rag_truth_test.json", "r") as f:
    test_data = json.load(f)

In [2]:
def add_prefix(data):
    for d in data:
        d["text"] = "Please judge the following statement whether it includes hallucination or not based on the references above: " + d["text"]
    return data

train_data = add_prefix(train_data)
dev_data = add_prefix(dev_data)
test_data = add_prefix(test_data)

In [31]:
# task_type: QA, Data2txt, Summary
task_name = "Data2txt"
train_data = [d for d in train_data if d["task_type"] == task_name]
dev_data = [d for d in dev_data if d["task_type"] == task_name]
test_data = [d for d in test_data if d["task_type"] == task_name]

In [3]:
import random

def create_trip(data, id_list):
    trip = []
    for id in id_list:
        num = 0
        no_hal = []
        has_hal = []
        for d in data:
            if num == 6:
                num = 0
                if no_hal == [] or has_hal == []:
                    break
                # shuffle
                random.seed(id) # change seed
                no_hal = random.sample(no_hal, len(no_hal))
                has_hal = random.sample(has_hal, len(has_hal))
                
                if len(no_hal)==1 or len(no_hal)==5:
                    trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0}) # labels are dummy 
                elif len(no_hal)==2 or len(no_hal)==4:
                    trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0})
                    trip.append({"anchor":ref,"positive": no_hal[1], "negative": has_hal[1], "labels": 0})
                elif len(no_hal)==3:
                    trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0})
                    trip.append({"anchor":ref,"positive": no_hal[1], "negative": has_hal[1], "labels": 0})
                    trip.append({"anchor":ref,"positive": no_hal[2], "negative": has_hal[2], "labels": 0})
                no_hal = []
                has_hal = []
                break
            elif d["source_id"] == id:
                num +=1
                ref = d["ref"]
                if d["labels"] == 0:
                    no_hal.append(d["text"])
                else: #hallucination
                    has_hal.append(d["text"])
        if num == 6:
            if len(no_hal)==1 or len(no_hal)==5:
                trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0})
            elif len(no_hal)==2 or len(no_hal)==4:
                trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0})
                trip.append({"anchor":ref,"positive": no_hal[1], "negative": has_hal[1], "labels": 0})
            elif len(no_hal)==3:
                trip.append({"anchor":ref,"positive": no_hal[0], "negative": has_hal[0], "labels": 0})
                trip.append({"anchor":ref,"positive": no_hal[1], "negative": has_hal[1], "labels": 0})
                trip.append({"anchor":ref,"positive": no_hal[2], "negative": has_hal[2], "labels": 0})
    return trip

In [4]:
train_id = [d["source_id"] for d in train_data]
train_id = list(set(train_id))
dev_id = [d["source_id"] for d in dev_data]
dev_id = list(set(dev_id))
test_id = [d["source_id"] for d in test_data]
test_id = list(set(test_id))
print(len(train_id), len(dev_id), len(test_id))
train_trip = create_trip(train_data, train_id)
dev_trip = create_trip(dev_data, dev_id)
test_trip = create_trip(test_data, test_id)

2305 210 450


In [5]:
len(train_trip), len(dev_trip), len(test_trip)

(3760, 337, 633)

In [6]:
from datasets import Dataset, DatasetDict
import pandas as pd

train_df = pd.DataFrame(train_trip)
dev_df = pd.DataFrame(dev_trip)
test_df = pd.DataFrame(test_trip)

train_ds = Dataset.from_pandas(train_df)
dev_ds = Dataset.from_pandas(dev_df)
test_ds = Dataset.from_pandas(test_df)

tri_raw_datasets = DatasetDict({"train": train_ds, "dev": dev_ds, "test": test_ds})
tri_raw_datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive', 'negative', 'labels'],
        num_rows: 3760
    })
    dev: Dataset({
        features: ['anchor', 'positive', 'negative', 'labels'],
        num_rows: 337
    })
    test: Dataset({
        features: ['anchor', 'positive', 'negative', 'labels'],
        num_rows: 633
    })
})

In [7]:
from transformers import AutoTokenizer

tri_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/RoBERTa-base")


In [8]:
tri_raw_datasets["train"][0]

{'anchor': 'Dongguan, China (CNN)For a decade, the New South China Mall -- the biggest shopping mall in the world -- has been an embarrassment for its owners and China. Opened to the public in 2005 in Dongguan in the south of the country, the goal was to attract 100,000 visitors a day with an array of entertainment, shops and eateries. Outside the mall, a giant Egyptian sphinx and a replica of the Arc de Triomphe were erected alongside fountains and canals complete with Venetian gondolas. It even boasted an indoor roller coaster. But despite the grand plans neither stores nor shoppers came. Soon, it was classified by industry analysts as a "dead mall" and it became an unflattering symbol of China\'s runaway speculation on real estate projects. The mall spans five million square feet of shopping area, making it the largest in the world in terms of leasable space -- more than twice the size of Mall of America, the biggest shopping center in the United States. When I visited two years ago

In [9]:
from transformers import DataCollatorWithPadding

def tri_tokenize_function(examples):
    anchor = tri_tokenizer(examples["anchor"], truncation=True,max_length=512,padding="max_length")
    positive = tri_tokenizer(examples["positive"], truncation=True,max_length=512,padding="max_length")
    negative = tri_tokenizer(examples["negative"], truncation=True,max_length=512,padding="max_length")

    return {
        "anchor_input_ids": anchor["input_ids"],
        "anchor_attention_mask": anchor["attention_mask"],
        "positive_input_ids": positive["input_ids"],
        "positive_attention_mask": positive["attention_mask"],
        "negative_input_ids": negative["input_ids"],
        "negative_attention_mask": negative["attention_mask"],
    }

tri_tokenized_datasets = tri_raw_datasets.map(tri_tokenize_function, batched=True)
tri_tokenized_datasets = tri_tokenized_datasets.remove_columns(["anchor", "positive", "negative"])
tri_data_collator = DataCollatorWithPadding(tokenizer=tri_tokenizer)

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map: 100%|██████████| 3760/3760 [00:02<00:00, 1733.64 examples/s]
Map: 100%|██████████| 337/337 [00:00<00:00, 1829.40 examples/s]
Map: 100%|██████████| 633/633 [00:00<00:00, 1845.27 examples/s]


In [10]:
from transformers import DataCollatorWithPadding
from torch.nn.utils.rnn import pad_sequence
import torch

class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        anchor_ids = [torch.tensor(x['anchor_input_ids']) for x in features]
        positive_ids = [torch.tensor(x['positive_input_ids']) for x in features]
        negative_ids = [torch.tensor(x['negative_input_ids']) for x in features]
        
        anchor_mask = [torch.tensor(x['anchor_attention_mask']) for x in features]
        positive_mask = [torch.tensor(x['positive_attention_mask']) for x in features]
        negative_mask = [torch.tensor(x['negative_attention_mask']) for x in features]
        
        anchor_ids = pad_sequence(anchor_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        positive_ids = pad_sequence(positive_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        negative_ids = pad_sequence(negative_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        
        anchor_mask = pad_sequence(anchor_mask, batch_first=True, padding_value=0)
        positive_mask = pad_sequence(positive_mask, batch_first=True, padding_value=0)
        negative_mask = pad_sequence(negative_mask, batch_first=True, padding_value=0)
        
        labels = [torch.tensor(x['labels']) for x in features]
        
        batch = {
            "input_ids": [anchor_ids, positive_ids, negative_ids],
            "attention_mask": [anchor_mask, positive_mask, negative_mask],
            "labels": labels
        }
        
        return batch


tri_data_collator = CustomDataCollator(tokenizer=tri_tokenizer)

In [None]:
from transformers import AutoModel


base_model = AutoModel.from_pretrained("FacebookAI/RoBERTa-base")

In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
base_model.to(device)
device

In [13]:
import torch.nn.functional as F
import torch.nn as nn

def triplet_loss(anchor_output, positive_output, negative_output, positive_logits, negative_logits):
    positive_targets = torch.zeros(positive_output.size(0), dtype=torch.long).to(device)  # no hallucination = 0
    negative_targets = torch.ones(negative_output.size(0), dtype=torch.long).to(device)
    positive_loss = nn.CrossEntropyLoss()(positive_logits, positive_targets)
    negative_loss = nn.CrossEntropyLoss()(negative_logits, negative_targets)

    classification_loss = (positive_loss + negative_loss) / 2.0 # average

    triplet_loss_fn = (nn.TripletMarginWithDistanceLoss(margin=1.0,distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)))
    triplet_loss = triplet_loss_fn(anchor_output, positive_output, negative_output)
   
    return classification_loss, triplet_loss

In [14]:
import torch
import numpy as np

def compute_tri_metrics(eval_pred):
    logits,labels= eval_pred
    logits = logits[0]
   
    # ModelOutput(logits=[positive_logits, negative_logits]...)
    positive_logits = torch.tensor(logits[0])
    negative_logits = torch.tensor(logits[1])

    positive_preds = torch.argmax(positive_logits, dim=1)
    negative_preds = torch.argmax(negative_logits, dim=1)

    # num of correct predictions
    correct_positive = (positive_preds == 0).sum().item()
    correct_negative = (negative_preds == 1).sum().item()
    
    # total samples
    total_samples = positive_preds.size(0) + negative_preds.size(0)
    
    positive_preds_num = (positive_preds == 1).sum().item() # predict non-hallucination samples (positive) as hallucination
    negative_preds_num = (negative_preds == 1).sum().item() # predict hallucination samples (negative) as hallucination
    negative_num = negative_preds.size(0) # num of hallucination samples

    precision = negative_preds_num / (positive_preds_num + negative_preds_num) if (positive_preds_num + negative_preds_num) > 0 else 0
    recall = negative_preds_num / negative_num if negative_num > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    accuracy = (correct_positive + correct_negative) / total_samples
    return {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "f1": f1
    }
    


In [None]:
from transformers import TrainingArguments
from transformers import Trainer
import torch
from models_rob import TripletModel

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="steps",  
    save_steps=10000,
    learning_rate=5e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16 = True,
    gradient_accumulation_steps=3,
    logging_dir="./logs",
    remove_unused_columns=False,
    report_to="tensorboard",
)

tri_model = TripletModel(base_model, triplet_loss)
trainer = Trainer(
    model=tri_model,
    args=training_args,
    train_dataset=tri_tokenized_datasets["train"],
    eval_dataset=tri_tokenized_datasets["dev"],
    data_collator=tri_data_collator,
    tokenizer=tri_tokenizer,
    compute_metrics=compute_tri_metrics,
)

In [16]:
trainer.evaluate()

{'eval_loss': 1.7031710147857666,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': 0.5,
 'eval_recall': 0.0,
 'eval_precision': 0,
 'eval_f1': 0,
 'eval_runtime': 2.2992,
 'eval_samples_per_second': 146.573,
 'eval_steps_per_second': 36.969}

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Recall,Precision,F1
1,1.4877,1.170633,0.0022,0.70178,0.857567,0.653846,0.741977
2,1.1169,1.196219,0.0022,0.692878,0.937685,0.629482,0.753278
3,1.025,1.115803,0.0022,0.709199,0.928783,0.645361,0.761557
4,0.9656,1.049453,0.0022,0.72997,0.916914,0.667387,0.7725
5,0.9096,1.140414,0.0022,0.698813,0.922849,0.637295,0.753939
6,0.8519,1.099777,0.0022,0.715134,0.925816,0.651357,0.764706
7,0.8258,1.133316,0.0022,0.710682,0.928783,0.646694,0.762485
8,0.7922,1.15915,0.0022,0.697329,0.931751,0.634343,0.754808
9,0.7354,1.170486,0.0022,0.703264,0.928783,0.640082,0.757869


TrainOutput(global_step=3130, training_loss=0.947699827736559, metrics={'train_runtime': 673.2048, 'train_samples_per_second': 55.852, 'train_steps_per_second': 4.649, 'total_flos': 0.0, 'train_loss': 0.947699827736559, 'epoch': 9.970212765957447})

In [18]:
trainer.evaluate(eval_dataset=tri_tokenized_datasets["test"])

{'eval_loss': 1.243459939956665,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': 0.6951026856240127,
 'eval_recall': 0.9462875197472354,
 'eval_precision': 0.629863301787592,
 'eval_f1': 0.7563131313131312,
 'eval_runtime': 3.4686,
 'eval_samples_per_second': 182.494,
 'eval_steps_per_second': 45.84,
 'epoch': 9.970212765957447}

In [19]:
def create_dev_task(name):
    dev_data2 = [d for d in test_data if d["task_type"] == name]
    dev_id2 = [d["source_id"] for d in dev_data2]
    dev_id2 = list(set(dev_id2))
    dev_trip2 = create_trip(dev_data2, dev_id2)
    dev_df2 = pd.DataFrame(dev_trip2)
    dev_ds2 = Dataset.from_pandas(dev_df2)
    tri_tokenized_datasets_task = dev_ds2.map(tri_tokenize_function, batched=True)
    tri_tokenized_datasets_task = tri_tokenized_datasets_task.remove_columns(["anchor", "positive", "negative"])
    tri_tokenized_datasets_task.set_format("torch")
    return tri_tokenized_datasets_task

In [20]:
dev_qa = create_dev_task("QA")
trainer.evaluate(eval_dataset=dev_qa)

Map: 100%|██████████| 150/150 [00:00<00:00, 1938.35 examples/s]
  anchor_ids = [torch.tensor(x['anchor_input_ids']) for x in features]
  positive_ids = [torch.tensor(x['positive_input_ids']) for x in features]
  negative_ids = [torch.tensor(x['negative_input_ids']) for x in features]
  anchor_mask = [torch.tensor(x['anchor_attention_mask']) for x in features]
  positive_mask = [torch.tensor(x['positive_attention_mask']) for x in features]
  negative_mask = [torch.tensor(x['negative_attention_mask']) for x in features]
  labels = [torch.tensor(x['labels']) for x in features]


{'eval_loss': 1.1681417226791382,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': 0.69,
 'eval_recall': 0.9466666666666667,
 'eval_precision': 0.6255506607929515,
 'eval_f1': 0.753315649867374,
 'eval_runtime': 0.7384,
 'eval_samples_per_second': 203.148,
 'eval_steps_per_second': 51.464,
 'epoch': 9.970212765957447}

In [21]:
dev_d2t = create_dev_task("Data2txt")
trainer.evaluate(eval_dataset=dev_d2t)

Map:   0%|          | 0/291 [00:00<?, ? examples/s]

Map: 100%|██████████| 291/291 [00:00<00:00, 1746.61 examples/s]
  anchor_ids = [torch.tensor(x['anchor_input_ids']) for x in features]
  positive_ids = [torch.tensor(x['positive_input_ids']) for x in features]
  negative_ids = [torch.tensor(x['negative_input_ids']) for x in features]
  anchor_mask = [torch.tensor(x['anchor_attention_mask']) for x in features]
  positive_mask = [torch.tensor(x['positive_attention_mask']) for x in features]
  negative_mask = [torch.tensor(x['negative_attention_mask']) for x in features]
  labels = [torch.tensor(x['labels']) for x in features]


{'eval_loss': 0.9186015725135803,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': 0.7972508591065293,
 'eval_recall': 0.9381443298969072,
 'eval_precision': 0.7319034852546917,
 'eval_f1': 0.822289156626506,
 'eval_runtime': 1.412,
 'eval_samples_per_second': 206.093,
 'eval_steps_per_second': 51.7,
 'epoch': 9.970212765957447}

In [22]:
dev_sum = create_dev_task("Summary")
trainer.evaluate(eval_dataset=dev_sum)

Map: 100%|██████████| 192/192 [00:00<00:00, 1731.77 examples/s]


  anchor_ids = [torch.tensor(x['anchor_input_ids']) for x in features]
  positive_ids = [torch.tensor(x['positive_input_ids']) for x in features]
  negative_ids = [torch.tensor(x['negative_input_ids']) for x in features]
  anchor_mask = [torch.tensor(x['anchor_attention_mask']) for x in features]
  positive_mask = [torch.tensor(x['positive_attention_mask']) for x in features]
  negative_mask = [torch.tensor(x['negative_attention_mask']) for x in features]
  labels = [torch.tensor(x['labels']) for x in features]


{'eval_loss': 1.7985296249389648,
 'eval_model_preparation_time': 0.0022,
 'eval_accuracy': 0.5442708333333334,
 'eval_recall': 0.9583333333333334,
 'eval_precision': 0.5242165242165242,
 'eval_f1': 0.6777163904235729,
 'eval_runtime': 0.9306,
 'eval_samples_per_second': 206.318,
 'eval_steps_per_second': 51.58,
 'epoch': 9.970212765957447}

In [23]:
name = "./trained/triplet_rob"
trainer.save_model(name)
trainer.save_state()
tri_model.save_pretrained(name)