## with_doc

In [4]:
import json

# Load the train/dev/test JSON splits. The encoding is pinned to UTF-8 so the
# files are decoded the same way regardless of the platform's default locale.
with open("dataset/rag_truth_train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("dataset/rag_truth_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open("dataset/rag_truth_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [5]:
# Instruction prepended to every statement before tokenization.
PROMPT_PREFIX = "Please judge the following statement whether it includes hallucination or not based on the references above: "


def add_prefix(data):
    """Prepend the judgement instruction to the ``"text"`` field of each record.

    The records are modified in place and the same object is returned. A
    record whose text already starts with the instruction is left untouched,
    so re-running the notebook cell does not stack the prefix twice.

    Args:
        data: Iterable of dicts, each holding a string under the ``"text"`` key.

    Returns:
        The ``data`` object that was passed in.
    """
    for d in data:
        # Guard against double-prefixing when the cell is executed again.
        if not d["text"].startswith(PROMPT_PREFIX):
            d["text"] = PROMPT_PREFIX + d["text"]
    return data

# Apply the instruction prefix to every split, in train/dev/test order.
train_data, dev_data, test_data = (
    add_prefix(split) for split in (train_data, dev_data, test_data)
)

In [3]:
# task_type: QA, Data2txt, Summary
# Set `task_name` to one of the task types above to restrict every split to
# that task, or leave it as None to keep all tasks. The default is None because
# the per-task evaluation later in the notebook filters `test_data` by task
# type again and would get no examples for the other tasks if the splits were
# already narrowed here.
task_name = None
if task_name is not None:
    train_data = [d for d in train_data if d["task_type"] == task_name]
    dev_data = [d for d in dev_data if d["task_type"] == task_name]
    test_data = [d for d in test_data if d["task_type"] == task_name]

In [6]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Wrap each split in a DataFrame, convert it to a Dataset, and bundle the
# three under the keys "train", "dev" and "test".
train_df, dev_df, test_df = (
    pd.DataFrame(records) for records in (train_data, dev_data, test_data)
)
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

raw_datasets = DatasetDict(
    {
        "train": train_ds,
        "dev": dev_ds,
        "test": test_ds,
    }
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 13830
    })
    dev: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 2700
    })
})

In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding


# Tokenizer loaded from the Phi-3.5 mini instruct checkpoint.
tokenizer_checkpoint = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``"ref"`` and ``"text"`` columns of a batch independently.

    Each column is encoded on its own, truncated to at most 512 tokens.

    Args:
        examples: Batch mapping that provides the ``"ref"`` and ``"text"`` columns.

    Returns:
        Dict with ``ref_input_ids``, ``ref_attention_mask``,
        ``text_input_ids`` and ``text_attention_mask``.
    """
    # Encode the reference first, then the statement, with identical settings.
    ref_encoding, text_encoding = (
        tokenizer(examples[column], truncation=True, max_length=512)
        for column in ("ref", "text")
    )

    tokenized = {}
    tokenized["ref_input_ids"] = ref_encoding["input_ids"]
    tokenized["ref_attention_mask"] = ref_encoding["attention_mask"]
    tokenized["text_input_ids"] = text_encoding["input_ids"]
    tokenized["text_attention_mask"] = text_encoding["attention_mask"]
    return tokenized

# Tokenize every split in batches, then drop the raw string columns that the
# token-id columns replace.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).remove_columns(["text","ref"])
# Default padding collator built on the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

2025-03-04 11:29:54.355947: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-04 11:29:54.782389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-04 11:29:54.982777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-04 11:29:54.983327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-04 11:29:55.329544: I tensorflow/core/platform/cpu_feature_gua

Map:   0%|          | 0/13830 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding
from torch.nn.utils.rnn import pad_sequence
import torch

class CustomDataCollator(DataCollatorWithPadding):
    """Collate tokenized reference/statement pairs into one padded batch.

    The reference and the statement are kept as two separate padded tensors
    rather than being concatenated, so ``input_ids`` and ``attention_mask``
    are two-element lists ordered ``[reference, statement]``.
    """

    def __call__(self, features):
        """Pad each field to the longest sequence found in ``features``.

        Args:
            features: Sequence of examples providing ``ref_input_ids``,
                ``text_input_ids``, ``ref_attention_mask``,
                ``text_attention_mask`` and ``labels``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (each a list of two
            batch-first tensors) and ``labels`` (a tensor).
        """
        def as_tensors(key):
            # One tensor per example for the requested field.
            return [torch.tensor(example[key]) for example in features]

        def pad_batch(sequences, fill):
            # Batch-first padding of variable-length sequences with `fill`.
            return pad_sequence(sequences, batch_first=True, padding_value=fill)

        ref_tokens, text_tokens = as_tensors('ref_input_ids'), as_tensors('text_input_ids')
        ref_attn, text_attn = as_tensors('ref_attention_mask'), as_tensors('text_attention_mask')
        label_tensor = torch.tensor([example['labels'] for example in features])

        # Token ids are padded with the tokenizer's pad id, masks with 0.
        pad_id = self.tokenizer.pad_token_id
        return {
            "input_ids": [pad_batch(ref_tokens, pad_id), pad_batch(text_tokens, pad_id)],
            "attention_mask": [pad_batch(ref_attn, 0), pad_batch(text_attn, 0)],
            "labels": label_tensor,
        }


# Collator instance handed to the Trainer.
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [9]:
# Display the tokenized splits to check their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 13830
    })
    dev: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['labels', 'source', 'model', 'task_type', 'source_id', 'ref_input_ids', 'ref_attention_mask', 'text_input_ids', 'text_attention_mask'],
        num_rows: 2700
    })
})

In [10]:
from transformers import AutoModel

# Load the pretrained Phi-3.5 mini instruct weights through AutoModel.
base_checkpoint = "microsoft/Phi-3.5-mini-instruct"
base_model = AutoModel.from_pretrained(base_checkpoint)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
import torch

# Device to use: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
device

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


device(type='cuda')

In [12]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

def compute_metrics(eval_preds):
    """Compute accuracy, recall, precision and F1 from evaluation predictions.

    Args:
        eval_preds: Pair ``(predictions, labels)``. The first element of
            ``predictions`` is used as the class logits; ``labels`` must
            support ``.tolist()``.

    Returns:
        Dict with the keys ``accuracy``, ``recall``, ``precision`` and ``f1``.
    """
    outputs, references = eval_preds
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(outputs[0], axis=-1).tolist()
    references = references.tolist()

    return {
        "accuracy": accuracy_score(references, predicted),
        "recall": recall_score(references, predicted),
        "precision": precision_score(references, predicted),
        "f1": f1_score(references, predicted),
    }

In [13]:
from transformers import TrainingArguments, Trainer
import torch
from models_phi import WithDocModel

training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./results",
    logging_dir="./logs",
    report_to="tensorboard",
    # Evaluate and log once per epoch; checkpoint every 10000 steps.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="steps",
    save_steps=10000,
    # Optimisation settings.
    optim="adafactor",
    learning_rate=1e-6,
    weight_decay=0.01,
    num_train_epochs=10,
    fp16=True,
    # 4 samples per device per step, accumulated over 12 steps per update.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=12,
    # Keep all dataset columns so the custom collator can read its fields.
    remove_unused_columns=False,
)

# Wrap the pretrained backbone in the project's WithDocModel.
model = WithDocModel(base_model)

# Train on the "train" split and evaluate on the "dev" split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# Baseline: evaluate on the Trainer's eval dataset (the dev split) before training.
trainer.evaluate()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


{'eval_loss': 0.8278520107269287,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.4579365079365079,
 'eval_recall': 0.5813084112149532,
 'eval_precision': 0.4038961038961039,
 'eval_f1': 0.47662835249042146,
 'eval_runtime': 43.7981,
 'eval_samples_per_second': 28.768,
 'eval_steps_per_second': 7.192}

In [15]:
# Fine-tune the model; evaluation and logging happen once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Recall,Precision,F1
0,0.5985,0.529804,0.0033,0.739683,0.656075,0.709091,0.681553
1,0.504,0.506431,0.0033,0.76746,0.557009,0.841808,0.670416
2,0.4415,0.492231,0.0033,0.768254,0.676636,0.752599,0.712598
3,0.3756,0.505804,0.0033,0.754762,0.613084,0.762791,0.679793
4,0.2997,0.538454,0.0033,0.763492,0.631776,0.769932,0.694045
6,0.1534,0.648621,0.0033,0.764286,0.714019,0.726236,0.720075
7,0.104,0.714076,0.0033,0.751587,0.654206,0.732218,0.691017
8,0.0743,0.764679,0.0033,0.744444,0.652336,0.719588,0.684314
9,0.0565,0.794199,0.0033,0.749206,0.659813,0.724846,0.690802


TrainOutput(global_step=2880, training_loss=0.2828547643290626, metrics={'train_runtime': 13808.8554, 'train_samples_per_second': 10.015, 'train_steps_per_second': 0.209, 'total_flos': 0.0, 'train_loss': 0.2828547643290626, 'epoch': 9.994216310005784})

In [16]:
# Evaluate the trained model on the full test split.
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

{'eval_loss': 0.765934407711029,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.7696296296296297,
 'eval_recall': 0.7030752916224814,
 'eval_precision': 0.6597014925373135,
 'eval_f1': 0.6806981519507187,
 'eval_runtime': 85.9609,
 'eval_samples_per_second': 31.41,
 'eval_steps_per_second': 7.852,
 'epoch': 9.994216310005784}

In [17]:
def create_dev_task(name):
    """Build a tokenized evaluation set restricted to one task type.

    Despite the function name, the examples are taken from ``test_data``.

    Args:
        name: Value matched against each example's ``"task_type"`` field.

    Returns:
        The tokenized dataset with the raw ``"text"`` and ``"ref"`` columns
        removed.

    Raises:
        ValueError: If ``test_data`` holds no example of the requested task type.
    """
    task_examples = [d for d in test_data if d["task_type"] == name]
    if not task_examples:
        # Fail early with a clear message: an empty selection would produce a
        # dataset without the columns the steps below rely on.
        raise ValueError(f"No examples with task_type == {name!r} in test_data")

    task_ds = Dataset.from_pandas(pd.DataFrame(task_examples))
    tokenized_task = task_ds.map(tokenize_function, batched=True)
    return tokenized_task.remove_columns(["text","ref"])

In [18]:
# Per-task breakdown: evaluate on the QA examples of the test data only.
dev_qa = create_dev_task("QA")
trainer.evaluate(eval_dataset=dev_qa)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

{'eval_loss': 0.6666157841682434,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.7866666666666666,
 'eval_recall': 0.53125,
 'eval_precision': 0.4207920792079208,
 'eval_f1': 0.4696132596685083,
 'eval_runtime': 26.7115,
 'eval_samples_per_second': 33.693,
 'eval_steps_per_second': 8.423,
 'epoch': 9.994216310005784}

In [19]:
# Per-task breakdown: evaluate on the Data2txt examples of the test data only.
dev_d2t = create_dev_task("Data2txt")
trainer.evaluate(eval_dataset=dev_d2t)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

{'eval_loss': 0.7976372838020325,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.7977777777777778,
 'eval_recall': 0.8739205526770294,
 'eval_precision': 0.8227642276422764,
 'eval_f1': 0.847571189279732,
 'eval_runtime': 31.5761,
 'eval_samples_per_second': 28.503,
 'eval_steps_per_second': 7.126,
 'epoch': 9.994216310005784}

In [20]:
# Per-task breakdown: evaluate on the Summary examples of the test data only.
dev_sum = create_dev_task("Summary")
trainer.evaluate(eval_dataset=dev_sum)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

{'eval_loss': 0.8335500955581665,
 'eval_model_preparation_time': 0.0033,
 'eval_accuracy': 0.7244444444444444,
 'eval_recall': 0.35294117647058826,
 'eval_precision': 0.3829787234042553,
 'eval_f1': 0.3673469387755102,
 'eval_runtime': 28.1615,
 'eval_samples_per_second': 31.959,
 'eval_steps_per_second': 7.99,
 'epoch': 9.994216310005784}

In [21]:
# Save the model after training.
name = "./trained/with_doc_phi"
# Save the model through the Trainer, then the Trainer's own state.
trainer.save_model(name)
trainer.save_state()
# NOTE(review): assumes WithDocModel (from models_phi, not shown here) provides
# save_pretrained — confirm.
model.save_pretrained(name)