# Environment

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the data and model paths
# configured later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages that the import cell below relies on.
# transformers is left commented out -- presumably already present on the
# Colab runtime; TODO confirm.
# NOTE(review): versions are unpinned. The import cell uses
# peft.prepare_model_for_int8_training, which appears to be deprecated/removed
# in newer peft releases -- verify and consider pinning a compatible version.
# !pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install peft
!pip install accelerate

In [None]:
import copy
import io
import json
import math
import os
import time
import re
import random
import numpy as np
import pandas as pd
from datasets import load_dataset, Features, Value, concatenate_datasets, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType, PeftModel
from sklearn.model_selection import train_test_split
import torch
from torch.optim import lr_scheduler, AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm, trange
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import get_linear_schedule_with_warmup

# Constant

In [None]:
"""Basic Setting"""
# Google Drive locations of the input data and of the saved checkpoints.
data_path = "/content/drive/MyDrive/AIcup/data_hpw"
model_path = "/content/drive/MyDrive/AIcup/model_hpw"

# ---- Data settings ----
# Prefix put in front of every model input (an empty prefix, "", was the
# alternative that is currently switched off).
TaskPrefix = "Time normalization: "
PHINull = "PHI:Null"
# Value the collate function writes over padding positions in the label ids.
IgnoredPadIdx = -100
# Every PHI category name.
PhiCategory = [
    "PATIENT", "DOCTOR", "USERNAME",
    "PROFESSION", "ROOM", "DEPARTMENT",
    "HOSPITAL", "ORGANIZATION", "STREET",
    "CITY", "STATE", "COUNTRY",
    "ZIP", "LOCATION-OTHER", "AGE",
    "DATE", "TIME", "DURATION",
    "SET", "PHONE", "FAX",
    "EMAIL", "URL", "IPADDR",
    "SSN", "MEDICALRECORD", "HEALTHPLAN",
    "ACCOUNT", "LICENSE", "VEHICLE",
    "DEVICE", "BIOID", "IDNUM",
]
# The subset of categories for which the generation cell asks the model for a
# normalized value.
NormCategory = ["DATE", "TIME", "DURATION", "SET"]

# ---- Model / training hyper-parameters ----
BatchSize = 64
# Epoch count is derived from the batch size (0.75 * 64 -> 48).
Epochs = int(0.75 * BatchSize)
# Maximum token length used for both inputs and labels when tokenizing.
MaxLen = 32
LearningRate = 3e-5
WeightDecay = 0.01
PretrainedModel = "google/flan-t5-base"

# DatasetTrain

In [None]:
"""Data Reading"""
# Locations of the phase-1 and phase-2 task-2 training files.
data_path1 = os.path.join(data_path, "train_phase1_v3_task2.tsv")
data_path2 = os.path.join(data_path, "train_phase2_v3_task2.tsv")


def _read_task2_tsv(tsv_path):
    """Load one tab-separated file as two string columns: content and label.

    keep_default_na=False is forwarded unchanged so the reader does not apply
    its default NA-string conversion.
    """
    column_order = ['content', 'label']
    schema = Features({column: Value('string') for column in column_order})
    return load_dataset(
        "csv",
        data_files=tsv_path,
        delimiter='\t',
        features=schema,
        column_names=column_order,
        keep_default_na=False,
    )


data1 = _read_task2_tsv(data_path1)
data2 = _read_task2_tsv(data_path2)

# Report the size of each phase, then show one sample row and the dataset type.
for _caption, _loaded in (("pahse1 data:", data1), ("pahse2 data:", data2)):
    print(_caption, len(_loaded["train"]))
print()
print("data sample:", data1["train"][200])
print(type(data1["train"]))

In [None]:
"""Data Concatenation"""
# Merge the "train" splits of both phases into a single dataset.
_phase_splits = [loaded['train'] for loaded in (data1, data2)]
data = concatenate_datasets(_phase_splits)
print("data length:", len(data))
print(data[1025])

# Optional: shrink the dataset for a quick experiment.
# data = data.select(range(90000))
# print("test data length:", len(data))

In [None]:
# Hold out 10% of the examples; the fixed seed keeps the split repeatable.
_split = data.train_test_split(test_size=0.10, seed=25)
train_data, test_data = _split["train"], _split["test"]
for _caption, _subset in (("train data size:", train_data), ("test data size:", test_data)):
    print(_caption, len(_subset))

# ModelConfig

In [None]:
"""Model Config"""
# Tokenizer that matches the pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained(PretrainedModel)

# Base model. It is loaded without 8-bit quantization here; the 8-bit variant
# is kept below for reference.
# model = T5ForConditionalGeneration.from_pretrained(PretrainedModel, load_in_8bit=True)
model = T5ForConditionalGeneration.from_pretrained(PretrainedModel)

# LoRA settings: adapters on the modules named "q" and "v".
_lora_settings = dict(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
lora_config = LoraConfig(**_lora_settings)

# Run peft's int8-training preparation on the base model, then wrap the result
# with the LoRA adapter and report how many parameters are trainable.
model = get_peft_model(prepare_model_for_int8_training(model), lora_config)
model.print_trainable_parameters()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# DataloaderTrain

In [None]:
"""Tokenizer Template"""
def collate_batch_with_prompt_template(batch, tokenizer, IGNORED_PAD_IDX=IgnoredPadIdx):
    """Turn a batch of examples into padded tensors for training/evaluation.

    Args:
        batch: sequence of examples, each indexable by 'content' and 'label'.
        tokenizer: callable tokenizer that also exposes `pad_token_id`.
        IGNORED_PAD_IDX: value written over padding positions in the label ids.

    Returns:
        Tuple `(input_ids, attention_mask, label_ids)`; inputs and labels are
        both padded/truncated to `MaxLen` tokens.
    """
    # Identical tokenization options are used for the prompts and the targets.
    tokenize_options = dict(padding="max_length", truncation=True, max_length=MaxLen, return_tensors="pt")

    prompts = [TaskPrefix + example['content'] for example in batch]
    prompt_encoding = tokenizer(prompts, **tokenize_options)

    targets = [example['label'] for example in batch]
    target_ids = tokenizer(targets, **tokenize_options)['input_ids']
    # Overwrite every padding position in the labels with IGNORED_PAD_IDX.
    target_ids = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, IGNORED_PAD_IDX)

    return prompt_encoding['input_ids'], prompt_encoding['attention_mask'], target_ids

In [None]:
"""Dataloader"""
# Both loaders collate with the same prompt template and the shared tokenizer.
_collate_with_prompt = lambda batch: collate_batch_with_prompt_template(batch, tokenizer)

# Training loader: shuffled mini-batches; an incomplete final batch is dropped.
train_dataloader = DataLoader(
    train_data,
    collate_fn=_collate_with_prompt,
    batch_size=BatchSize,
    drop_last=True,
    shuffle=True,
)
# Held-out loader: one example per step, in dataset order.
test_dataloader = DataLoader(
    test_data,
    collate_fn=_collate_with_prompt,
    batch_size=1,
    drop_last=True,
    shuffle=False,
)

dataloaders = dict(train=train_dataloader, test=test_dataloader)
# Number of batches per epoch for each loader (train first, then test).
for _loader in dataloaders.values():
    print(len(_loader))

In [None]:
"""Test Train_dataloader"""
# Pull one training batch and show, for token ids, attention masks and labels
# in turn, the tensor shape followed by its first row.
titer = iter(train_dataloader)
tks, masks, labels = next(titer)
for _tensor in (tks, masks, labels):
    print(_tensor.shape)
    print(_tensor[0])
    print()

# Optimizer

In [None]:
"""optimizer config"""
optimizer = AdamW(model.parameters(), lr=LearningRate, weight_decay=WeightDecay)

# Total number of optimizer steps across all epochs.
_batches_per_epoch = len(dataloaders["train"])
num_training_steps = _batches_per_epoch * Epochs

# Logging interval: about a tenth of an epoch, rounded down to a multiple of
# ten; falls back to 10 when that rounds to zero.
CountSteps = (int(num_training_steps*0.1/Epochs) // 10 * 10) or 10

# Warm-up covers the first 1% of all training steps.
WarmUpSteps = int(num_training_steps*0.01)
print(f"num_training_steps {num_training_steps}, warn_up_steps {WarmUpSteps}, count each {CountSteps} steps")

# Linear schedule with warm-up, stepped once per batch in the training loop.
_schedule_options = dict(num_warmup_steps=WarmUpSteps, num_training_steps=num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, **_schedule_options)

# Training

In [None]:
def write_losses(loss_path, train_losses, dev_losses):
  """Write the loss history to `loss_path` as indented JSON.

  The file holds one object with two keys: "train" (the training losses)
  and "test" (the `dev_losses` argument). An existing file is overwritten.
  """
  history = {"train": train_losses, "test": dev_losses}
  serialized = json.dumps(history, indent=4)
  with open(loss_path, "w") as sink:
    sink.write(serialized)

In [None]:
"""Train Model"""
# Fine-tune for `Epochs` epochs. After every epoch the model is evaluated on
# the held-out split, the loss history is written to disk, and the weights are
# checkpointed whenever the held-out loss improves.

# Each run gets its own folder named after the start time in seconds.
name = str(int(time.time()))
save_path = os.path.join(model_path, name)
model_name = os.path.join(save_path, f"best_{name}.pt")

# NOTE: the file name ends in ".csv" but write_losses stores JSON in it; the
# name is kept so existing tooling that looks for "loss.csv" keeps working.
loss_path = os.path.join(save_path, "loss.csv")

# makedirs instead of mkdir: also creates a missing parent folder and does not
# fail if the run folder already exists.
os.makedirs(save_path, exist_ok=True)

best_loss = float('inf')
train_losses = []   # one entry per logging window of CountSteps batches
test_losses = []    # one entry per epoch

for epoch in range(Epochs):
    print("[Training] Epoch {}/{}".format(epoch, Epochs - 1))
    print("-" * 10)

    running_loss_train = 0.0
    running_loss_test = 0.0
    intermediate_loss = 0.0

    # Training phase
    model.train()
    for count, (seqs, masks, labels) in enumerate(tqdm(dataloaders["train"])):
        seqs, labels, masks = seqs.to(device), labels.to(device), masks.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=seqs, labels=labels, attention_mask=masks)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        running_loss_train += loss.item()
        intermediate_loss += loss.item()

        # Report the mean loss of each complete window of CountSteps batches.
        # Counting completed steps (count + 1) makes every window, including
        # the first one, contain exactly CountSteps losses.
        completed_steps = count + 1
        if completed_steps % CountSteps == 0:
            average_loss = intermediate_loss / CountSteps
            print(f"Loss in epoch{epoch}-step{completed_steps}: {average_loss:.4f}")
            train_losses.append(average_loss)
            intermediate_loss = 0.0

    # Print epoch train losses
    epoch_train_loss = running_loss_train / len(dataloaders["train"])
    print(f"[Training] Train Loss: {epoch_train_loss:.4f}")

    # Testing phase
    model.eval()
    with torch.no_grad():
        for seqs_test, masks_test, labels_test in tqdm(dataloaders["test"]):
            seqs_test, labels_test, masks_test = seqs_test.to(device), labels_test.to(device), masks_test.to(device)
            outputs = model(input_ids=seqs_test, labels=labels_test, attention_mask=masks_test)
            running_loss_test += outputs.loss.item()

    # print epoch test loss
    epoch_test_loss = running_loss_test / len(dataloaders["test"])
    print(f"[Training] Test Loss: {epoch_test_loss:.4f}")
    test_losses.append(epoch_test_loss)

    # Write the loss history after every epoch, so the file stays current even
    # when the held-out loss did not improve.
    write_losses(loss_path, train_losses, test_losses)

    # Save the best model seen so far.
    if epoch_test_loss < best_loss:
        best_loss = epoch_test_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), model_name)
        print(f"[INFO] Updated best model on dev checkpoint: {model_name}")

# DataloaderVal

In [None]:
"""get validation dataset"""
# Read the tab-separated answer file whose DATE/TIME/DURATION/SET rows are
# normalized by the generation cell further down.
answer_name = "answer_1701317300_1701181213.txt"
val_path = os.path.join(data_path, answer_name)
val_data = load_dataset("csv", data_files=val_path, delimiter='\t',
                        features=Features({
                            'fid': Value('string'),
                            'cat': Value('string'),
                            # int32 rather than int16: 16-bit offsets cannot
                            # hold positions above 32,767.
                            'start_pos': Value('int32'),
                            'end_pos': Value('int32'),
                            'content': Value('string')
                        }),
                        column_names=['fid', 'cat', 'start_pos', 'end_pos', 'content'])["train"]

# Materialize the rows as a plain list of dicts.
val_data = list(val_data)
# val_data = val_data[:100]
# Show the size and a few rows instead of dumping every record into the output.
print("validation records:", len(val_data))
print(val_data[:5])

In [None]:
"""Tokenizer Template"""
def _first_present(record, *keys):
    """Return `record[key]` for the first of `keys` found in `record`.

    Raises:
        KeyError: if none of the candidate keys is present.
    """
    for key in keys:
        if key in record:
            return record[key]
    raise KeyError(f"none of the keys {keys} found in record")


def collate_batch_with_prompt_template_val(batch, tokenizer, IGNORED_PAD_IDX=IgnoredPadIdx):
    """Build tokenized prompts plus metadata for a batch of validation records.

    The validation records loaded in this notebook carry the keys 'content',
    'start_pos' and 'end_pos'. The keys this function used originally
    ('label', 'spos', 'epos') are tried first, so records in that layout give
    the same result as before.

    Args:
        batch: sequence of dict records with 'fid', 'cat', a text field and
            start/end positions.
        tokenizer: callable tokenizer.
        IGNORED_PAD_IDX: unused here; kept so the signature mirrors the
            training collate function.

    Returns:
        Tuple `(input_ids, attention_mask, fids, cats, sposs, eposs, labels)`
        where `labels` holds the raw text of each record.

    Raises:
        KeyError: if a record has neither the legacy nor the current key for a
            field.
    """
    fids = [data['fid'] for data in batch]
    cats = [data['cat'] for data in batch]
    sposs = [_first_present(data, 'spos', 'start_pos') for data in batch]
    eposs = [_first_present(data, 'epos', 'end_pos') for data in batch]
    labels = [_first_present(data, 'label', 'content') for data in batch]

    # Prompt layout: "<prefix><category>:<text>".
    texts = [TaskPrefix + cat + ":" + label for cat, label in zip(cats, labels)]
    encoded_seq = tokenizer(texts, padding="max_length", truncation=True, max_length=MaxLen, return_tensors="pt")
    return encoded_seq['input_ids'], encoded_seq['attention_mask'], fids, cats, sposs, eposs, labels

In [None]:
# Validation loader: ValBatchSize records per step, in file order.
ValBatchSize = 1


def _collate_val(batch):
    """Collate validation records with the shared tokenizer."""
    return collate_batch_with_prompt_template_val(batch, tokenizer)


val_dataloader = DataLoader(
    val_data,
    collate_fn=_collate_val,
    shuffle=False,
    batch_size=ValBatchSize,
)

# Generation


In [None]:
"""model reload"""
# Reload the weights of a finished run and derive the output file path.
name = "1701339576_flant5_v1_base_task2"
# The checkpoint file is named after the first 10 characters of the run name.
# Stored as `run_timestamp` so the `time` module, which the training cell
# calls as time.time(), is not overwritten by a string.
run_timestamp = name[:10]
# Built from model_path so loading does not depend on the working directory.
model_name = os.path.join(model_path, name, f"best_{run_timestamp}.pt")
# Predictions go next to the answer file, with "_norm" appended to its stem.
prediction_path = val_path.replace(answer_name, answer_name.replace(".txt", "_norm.txt"))
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(model_name, map_location=device))
model = model.to(device)

print(val_path)
print(prediction_path)

In [None]:
# Write one output line per validation record. Records whose category is in
# NormCategory get the generated normalized value appended as a sixth column;
# all other records are copied through unchanged.

# Put the model in evaluation mode so dropout is disabled while generating.
model.eval()

with open(prediction_path, 'w', encoding='utf8') as f_predictions, torch.no_grad():
  # The loop variable is `record`, not `data`, so the concatenated dataset
  # held in the global `data` is not overwritten.
  for record in tqdm(val_data):
    fid = record['fid']
    cat = record['cat']
    spo = record['start_pos']
    epo = record['end_pos']
    content = record['content']

    # Records without a category or text are reported and left out of the file.
    if not (cat and content):
      print("[ERROR] No cat or content")
      continue

    columns = [fid, cat, spo, epo, content]

    # Only tokenize and generate for the categories that get normalized.
    if cat in NormCategory:
      texts = TaskPrefix + cat + ":" + content
      encoded_seq = tokenizer(texts, padding="max_length", truncation=True, max_length=MaxLen, return_tensors="pt")
      seqs = encoded_seq['input_ids'].to(device)
      masks = encoded_seq['attention_mask'].to(device)

      predicted_token = model.generate(input_ids=seqs, attention_mask=masks)
      predicted_ids = predicted_token[0].tolist()
      predicted_string = tokenizer.decode(predicted_ids, skip_special_tokens=True)
      # An empty prediction leaves the line in its five-column form.
      if predicted_string:
        columns.append(predicted_string)

    f_predictions.write('\t'.join(str(column) for column in columns) + '\n')