In [1]:
import json

# Load the rag_truth train/dev/test splits from JSON.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so decoding
# does not depend on the platform's default locale encoding.
with open("../dataset/rag_truth_train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("../dataset/rag_truth_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open("../dataset/rag_truth_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [2]:
# Instruction placed in front of every statement before it is tokenized.
HALLUCINATION_PROMPT_PREFIX = (
    "Please judge the following statement whether it includes hallucination "
    "or not based on the Document above: "
)


def add_prefix(data, prefix=HALLUCINATION_PROMPT_PREFIX):
    """Prepend the judging instruction to the ``"text"`` field of every record.

    The records are edited in place and the same list object is returned, so
    existing ``data = add_prefix(data)`` call sites keep working.

    The operation is idempotent: a record whose text already starts with
    ``prefix`` is left untouched, so re-running the notebook cell does not
    stack the instruction twice.

    Args:
        data: Iterable of dicts, each with a string ``"text"`` entry.
        prefix: Instruction to prepend; defaults to the hallucination prompt.

    Returns:
        The ``data`` object that was passed in.
    """
    for d in data:
        # Skip records that were already prefixed by an earlier run of this cell.
        if not d["text"].startswith(prefix):
            d["text"] = prefix + d["text"]
    return data

# Run add_prefix over the three splits in order (train, dev, test) and rebind
# each name to the list it returns.
train_data, dev_data, test_data = (
    add_prefix(split) for split in (train_data, dev_data, test_data)
)

In [3]:
# Available task_type values: QA, Data2txt, Summary
task_name = "Summary"


def _only_task(records):
    """Return the records whose ``task_type`` equals the selected ``task_name``."""
    return [record for record in records if record["task_type"] == task_name]


train_data = _only_task(train_data)
dev_data = _only_task(dev_data)
test_data = _only_task(test_data)

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Records -> pandas frames -> Hugging Face datasets, one per split.
train_df, dev_df, test_df = (
    pd.DataFrame(records) for records in (train_data, dev_data, test_data)
)
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Bundle the splits under the names used by the rest of the notebook.
raw_datasets = DatasetDict(
    {
        "train": train_ds,
        "dev": dev_ds,
        "test": test_ds,
    }
)
raw_datasets

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding


# Checkpoint whose tokenizer is loaded here.
MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Configure the tokenizer to pad at the end of each sequence.
tokenizer.padding_side = "right"

def tokenize_function(examples):
    """Tokenize the ``ref`` and ``text`` columns of a batch independently.

    Each column is passed to the module-level ``tokenizer`` with truncation
    enabled and ``max_length=512``. The result exposes the ids and attention
    masks under ``ref_*`` and ``text_*`` keys.
    """
    fields = ("ref", "text")
    encoded = {
        field: tokenizer(examples[field], truncation=True, max_length=512)
        for field in fields
    }
    return {
        f"{field}_{part}": encoded[field][part]
        for field in fields
        for part in ("input_ids", "attention_mask")
    }

# Tokenize every split in batches, then drop the raw string columns.
# set_format("torch") is intentionally not applied, so the columns stay plain lists.
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True
).remove_columns(["text", "ref"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
from transformers import DataCollatorWithPadding
from torch.nn.utils.rnn import pad_sequence
import torch

class CustomDataCollator(DataCollatorWithPadding):
    """Collate tokenized (ref, text) pairs into padded batch tensors.

    Every feature dict must provide ``ref_input_ids``, ``ref_attention_mask``,
    ``text_input_ids``, ``text_attention_mask`` and ``labels``.
    """

    def __call__(self, features):
        """Build one batch from a list of feature dicts.

        Returns a dict where ``input_ids`` and ``attention_mask`` are each a
        two-element list ``[ref, text]`` of batch-first tensors padded to the
        longest sequence in the batch, and ``labels`` is a tensor built from
        the per-example labels.
        """
        pad_id = self.tokenizer.pad_token_id

        def stack(key, fill):
            # One tensor per example, padded with `fill` to a common length.
            rows = [torch.tensor(example[key]) for example in features]
            return pad_sequence(rows, batch_first=True, padding_value=fill)

        labels = torch.tensor([example['labels'] for example in features])

        return {
            # Token ids are padded with the tokenizer's pad id ...
            "input_ids": [stack('ref_input_ids', pad_id), stack('text_input_ids', pad_id)],
            # ... and the matching mask positions with 0.
            "attention_mask": [stack('ref_attention_mask', 0), stack('text_attention_mask', 0)],
            "labels": labels,
        }


# Rebind data_collator to the pair-aware collator defined above; this is the
# collator handed to the Trainer later in the notebook.
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [6]:
# Display the tokenized splits (column names and row counts) as the cell output.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 13830
    })
    dev: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 2208
    })
})

In [7]:
from transformers import AutoModel

# Load the pretrained backbone; it is wrapped by ClassifierModel in the training cell.
base_model = AutoModel.from_pretrained("microsoft/Phi-3.5-mini-instruct")

Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.17s/it]


In [9]:
import torch

# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
device

device(type='cuda')

In [10]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def compute_metrics(eval_preds):
    """Compute accuracy, recall, precision and F1 from an evaluation result.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys ``accuracy``, ``recall``, ``precision`` and ``f1``.
    """
    scores, gold = eval_preds
    predicted = np.argmax(scores, axis=-1).tolist()
    gold = gold.tolist()

    # Evaluated in this order so the result keys keep their original order.
    scorers = {
        "accuracy": accuracy_score,
        "recall": recall_score,
        "precision": precision_score,
        "f1": f1_score,
    }
    return {metric: scorer(gold, predicted) for metric, scorer in scorers.items()}

In [None]:
from transformers import TrainingArguments, Trainer
from models.models_phi import ClassifierModel

# Training hyperparameters, collected in one place before building TrainingArguments.
training_config = dict(
    output_dir="./results",
    # Evaluate and log once per epoch; checkpoints follow a step schedule instead.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="steps",
    save_steps=10000,
    learning_rate=1e-6,
    # 4 examples per device step x 12 accumulation steps = 48 examples per optimizer update per device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=12,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs",
    # NOTE(review): presumably required so the ref_*/text_* columns reach the
    # custom collator - confirm against ClassifierModel.forward.
    remove_unused_columns=False,
    optim="adafactor",
    report_to=[],
)
training_args = TrainingArguments(**training_config)

# Wrap the pretrained backbone in the project's classifier.
model = ClassifierModel(base_model)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the classifier with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the test split; the Trainer was given compute_metrics for scoring.
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

In [None]:
# save model
import os

# Output directory for the fine-tuned classifier.
name = "../trained_model/classifier_phi"
# Save through the Trainer first, then persist the Trainer's own state.
trainer.save_model(name)
trainer.save_state()
# Also call the model's own save_pretrained on the same directory.
# NOTE(review): ClassifierModel is project code - confirm what it writes here
# and whether it overlaps with what trainer.save_model already produced.
model.save_pretrained(name)

### Inference and save

In [None]:
# If you restart the kernel, reload the model and tokenizer with this cell.
# The earlier cells that import json and build raw_datasets / test_data must be re-run first.
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
import torch
from models.models_phi import ClassifierModel

base_model = AutoModel.from_pretrained("microsoft/Phi-3.5-mini-instruct")
# Load the fine-tuned classifier and its tokenizer.
# The directory must be the one the save cell writes to
# ("../trained_model/classifier_phi", singular "trained_model"); loading from a
# differently spelled directory fails because nothing in this notebook creates it.
name = "../trained_model/classifier_phi"
model = ClassifierModel.from_pretrained(base_model, name)
tokenizer = AutoTokenizer.from_pretrained(name)

# Run inference on the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

def tokenize_function(examples):
    """Tokenize the ``ref`` and ``text`` columns of a batch independently.

    Each column is passed to the module-level ``tokenizer`` with truncation
    enabled and ``max_length=512``. The result exposes the ids and attention
    masks under ``ref_*`` and ``text_*`` keys.
    """
    fields = ("ref", "text")
    encoded = {
        field: tokenizer(examples[field], truncation=True, max_length=512)
        for field in fields
    }
    return {
        f"{field}_{part}": encoded[field][part]
        for field in fields
        for part in ("input_ids", "attention_mask")
    }

# Re-tokenize every split with the reloaded tokenizer, then drop the raw string columns.
# set_format("torch") is intentionally not applied, so the columns stay plain lists.
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True
).remove_columns(["text", "ref"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import os

# Accumulated per-example results; one row per test example, matched BY POSITION
# with test_data / tokenized_datasets["test"] in the inference loop.
RESULTS_PATH = "../results_test.json"

if os.path.exists(RESULTS_PATH):
    with open(RESULTS_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
    # A results file produced for a different task filter or test set would be
    # silently misaligned with the current examples, so validate before reuse.
    if len(data) != len(test_data):
        raise ValueError(
            f"{RESULTS_PATH} has {len(data)} rows but the current test set has "
            f"{len(test_data)}; it was probably written for a different task filter."
        )
    if any(row.get("label") != d["labels"] for row, d in zip(data, test_data)):
        raise ValueError(
            f"Labels stored in {RESULTS_PATH} do not match the current test set."
        )
else:
    # No cached results yet: start one row per test example.
    data = [
        {"id": i, "label": d["labels"], "task": d["task_type"]}
        for i, d in enumerate(test_data)
    ]

len(data)

In [None]:
import torch
from tqdm import tqdm
model.eval()


def _as_batch(values):
    """Turn one example's list of values into a tensor with a leading batch axis of 1 on `device`."""
    return torch.tensor(values).unsqueeze(0).to(device)


# Wrap the dataset itself (not the enumerate object) so tqdm knows the total
# number of examples and can show real progress.
for i, d in enumerate(tqdm(tokenized_datasets["test"])):
    # The model takes the reference and the statement as a [ref, text] pair.
    input_ids = [_as_batch(d["ref_input_ids"]), _as_batch(d["text_input_ids"])]
    attention_mask = [
        _as_batch(d["ref_attention_mask"]),
        _as_batch(d["text_attention_mask"]),
    ]
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        predicted_index = torch.argmax(logits, dim=-1)
    # Store plain Python floats/ints: NumPy arrays and NumPy scalars are not
    # JSON serializable, so json.dump(data, ...) would raise TypeError on them.
    data[i]["cls_phi_logits"] = logits[0].float().cpu().tolist()
    data[i]["cls_phi_label"] = predicted_index[0].cpu().tolist()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter


# Collect gold labels and this classifier's predictions in result-row order.
true_labels, predicted_labels = [], []
for item in data:
    true_labels.append(item['label'])
    predicted_labels.append(item['cls_phi_label'])

# Distribution of predicted classes.
count = Counter(predicted_labels)
print(count)

# Cell output: (accuracy, f1, precision, recall).
tuple(
    metric(true_labels, predicted_labels)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
def _to_jsonable(value):
    """Fallback for json.dumps: convert array-like objects through ``tolist()``.

    Covers values that expose ``tolist()`` (for example NumPy arrays and
    scalars); anything else raises the usual TypeError.
    """
    if hasattr(value, "tolist"):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Serialize BEFORE opening the file: open(..., "w") truncates immediately, so a
# serialization error raised inside json.dump would wipe the existing results.
payload = json.dumps(data, default=_to_jsonable)
with open("../results_test.json", "w") as f:
    f.write(payload)