# Environment

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# The dataset directory configured later (data_path) points into this mount,
# so this cell has to run before any data is read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install datasets
!pip install openpyxl
!pip install sentencepiece
!pip install islab-opendeid

In [None]:
import copy
import io
import math
import os
import time
import re
import random
import numpy as np
import pandas as pd
from datasets import load_dataset, Features, Value, concatenate_datasets, Dataset
from sklearn.model_selection import train_test_split
import torch
from torch.optim import lr_scheduler, AdamW
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm, trange
from tqdm.notebook import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import get_linear_schedule_with_warmup
from islab.aicup import collate_batch_with_prompt_template, OpenDeidBatchSampler

# Constant

In [None]:
"""Basic Setting"""
# Directory (on the mounted Drive) holding the train/validation TSV files.
data_path = "/content/drive/MyDrive/AIcup/data_hpw"

"""Data Setting"""
# PHI categories accepted when parsing model output into annotations.
phi_category = ['PATIENT', 'DOCTOR', 'USERNAME', 'PROFESSION',
                'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION',
                'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
                'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL',
                'URL', 'IPADDR', 'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE',
                'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM']

# Maximum number of 'PHI:Null' rows kept per labelled row when the training
# data is down-sampled (passed to filter_phi_null_ratio).
# NOTE(review): this name was used by the filtering cell but never defined,
# which raised NameError on a fresh kernel. 1.0 is a placeholder — tune it.
PHINullRatio = 1.0

"""Model Constant Setting"""
BatchSize = 8          # training batch size
Epochs = 6             # number of passes over the training split
MaxLen = 256           # NOTE(review): not referenced in the visible cells
LearningRate = 3e-5    # AdamW learning rate
# PretrainedModel = "google/flan-t5-small"
PretrainedModel = "EleutherAI/pythia-70m"

# Data Reading for Training

In [None]:
"""Data Reading"""
data_path1 = os.path.join(data_path, "train_phase1.tsv")
data_path2 = os.path.join(data_path, "train_phase2.tsv")

# Column names and dtypes shared by every TSV file. The names are passed
# explicitly to the loader (presumably because the files carry no header row).
_TSV_COLUMNS = ['fid', 'idx', 'content', 'label']
_TSV_FEATURES = Features({
    'fid': Value('string'), 'idx': Value('int64'),
    'content': Value('string'), 'label': Value('string')})


def _load_tsv(path):
    """Load one tab-separated file with the shared schema.

    keep_default_na=False stops the CSV reader from turning cells such as
    "NA" or empty strings into missing values, so 'content' and 'label'
    always stay strings.

    Returns the DatasetDict produced by load_dataset; the rows are under its
    'train' key.
    """
    return load_dataset("csv", data_files=path, delimiter='\t',
                        features=_TSV_FEATURES,
                        column_names=_TSV_COLUMNS, keep_default_na=False)


data1 = _load_tsv(data_path1)
data2 = _load_tsv(data_path2)

print("phase1 data:", len(data1["train"]))
print("phase2 data:", len(data2["train"]))
print()
# Spot-check one row and the container type.
print("data sample:", data1["train"][200])
print(type(data1["train"]))

In [None]:
"""Data Concatenation"""
# Merge the 'train' splits of both phases into one dataset.
data = concatenate_datasets([data1['train'], data2['train']])
print("data length:", len(data))
# Spot-check a single row; the hard-coded index assumes the merged data has
# more than 1025 rows.
print(data[1025])

# Uncomment to work on the first 5000 rows only (quick smoke test).
# # small scale for test
# data = data.select(range(5000))
# print("test data length:", len(data))

In [None]:
# filter null data
def filter_phi_null_ratio(dataset, ratio, seed=None):
    """Down-sample rows labelled 'PHI:Null' relative to all other rows.

    At most ``len(other_rows) * ratio`` null rows are kept (never more than
    exist); every non-null row is kept. The surviving rows are shuffled.

    Args:
        dataset: Sized iterable of row dicts with a 'label' key that also
            offers ``select(indices)``.
        ratio: Non-negative maximum number of null rows per non-null row.
        seed: Optional seed. When given, sampling and shuffling use a private
            ``random.Random(seed)`` so the result is reproducible and the
            global RNG state is untouched. When None, the module-level
            ``random`` functions are used.

    Returns:
        ``dataset.select(indices)`` for the kept rows in shuffled order.

    Raises:
        ValueError: If ``ratio`` is negative.
    """
    if ratio < 0:
        raise ValueError(f"ratio must be non-negative, got {ratio!r}")

    rng = random if seed is None else random.Random(seed)

    # One pass over the data: bucket every row index by whether it is a null row.
    phi_null_indices, other_indices = [], []
    for i, item in enumerate(dataset):
        if item['label'] == 'PHI:Null':
            phi_null_indices.append(i)
        else:
            other_indices.append(i)
    print(f"Total {len(dataset)}, PHI null {len(phi_null_indices)}, others {len(other_indices)} ")

    # target null count
    target_phi_null_count = int(min(len(phi_null_indices), len(other_indices) * ratio))
    print("target_phi_null_count", target_phi_null_count)

    if len(phi_null_indices) > target_phi_null_count:
        phi_null_indices = rng.sample(phi_null_indices, target_phi_null_count)

    # Recombine and shuffle so null rows are not clustered at the front.
    final_indices = phi_null_indices + other_indices
    rng.shuffle(final_indices)

    return dataset.select(final_indices)

# Down-sample 'PHI:Null' rows. PHINullRatio must be defined before this cell
# runs (e.g. alongside the other constants).
filtered_data = filter_phi_null_ratio(data, PHINullRatio)

# print filter information
phi_null_count_after = sum(1 for item in filtered_data if item['label'] == 'PHI:Null')
other_count_after = len(filtered_data) - phi_null_count_after
# Guard against a dataset with no labelled rows, which would otherwise
# raise ZeroDivisionError here.
if other_count_after:
    phi_null_ratio = phi_null_count_after / other_count_after
else:
    phi_null_ratio = float('nan')

print("After filtering:")
print("PHI: NULL count:", phi_null_count_after)
print("Other labels count:", other_count_after)
print("PHI: NULL to Other labels ratio:", phi_null_ratio)

# From here on the notebook works with the filtered rows only.
data = filtered_data

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
_split = data.train_test_split(test_size=0.10, seed=25)
train_data, test_data = _split["train"], _split["test"]
print("train data size:", len(train_data))
print("test data size:", len(test_data))

# Tokenizer Config


In [None]:
# Special tokens of the prompt format "<bos> content <sep> labels <eos>"
# (see the template strings used elsewhere in this notebook).
bos = '<|endoftext|>'   # beginning-of-sequence marker
eos = '<|END|>'         # end-of-answer marker; generation stops on it
pad = '<|pad|>'         # padding token for batching
sep ='\n\n####\n\n'     # separates the input sentence from the label text

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

# Tokenizer files are taken from the "step3000" revision of the model repo.
tokenizer = AutoTokenizer.from_pretrained(PretrainedModel, revision="step3000")
# Pad on the left — presumably so that, in batched generation, every prompt
# ends right where the generated tokens begin.
tokenizer.padding_side = 'left'
# Register the special tokens; the return value is not used again in the
# visible cells.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Show the pad token and the id it was assigned.
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

In [None]:
# # test template
# print(train_data)
# data_list = list(train_data)
# tmp_list = data_list[:2]
# template = "<|endoftext|> __CONTENT__\n\n####\n\n__LABEL__ <|END|>"
# for data in tmp_list:
#   # print(data)
#   texts = template.replace("__LABEL__", data['label']).replace("__CONTENT__", data['content'])
#   print(texts)
#   print()

# Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the config from the same "step3000" revision as the tokenizer and the
# weights so all three come from one checkpoint, and override the special
# token ids with the ones the tokenizer assigned.
config = AutoConfig.from_pretrained(PretrainedModel,
                                    revision="step3000",
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(PretrainedModel, revision="step3000", config=config)
# Special tokens were added to the tokenizer, so the embedding matrix has to
# be resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))
model.to(device)
print(model)

In [None]:
# AdamW over every model parameter with the learning rate from the constants
# cell; no learning-rate scheduler is attached in the visible cells.
optimizer = AdamW(model.parameters(),lr=LearningRate)

# Train Dataloader

In [None]:
# Turn both splits into plain Python lists of row dicts before they are
# handed to the sampler and the DataLoader.
train_data = list(train_data)
test_data = list(test_data)


def _collate(batch):
    """Forward a batch to collate_batch_with_prompt_template with the notebook tokenizer."""
    return collate_batch_with_prompt_template(batch, tokenizer)


def _make_loader(rows, batch_size):
    """Build a DataLoader whose batches are drawn by OpenDeidBatchSampler."""
    sampler = OpenDeidBatchSampler(rows, batch_size)
    return DataLoader(rows,
                      batch_sampler=sampler,
                      collate_fn=_collate,
                      pin_memory=True)


train_dataloader = _make_loader(train_data, BatchSize)
# The held-out split is evaluated one row at a time.
test_dataloader = _make_loader(test_data, 1)

dataloaders = dict(train=train_dataloader, test=test_dataloader)

In [None]:
# # test train_dataloader
# titer = iter(train_dataloader)
# tks, labels, masks = next(titer)
# print(tks.shape)
# print(tks[0])
# print()
# print(masks.shape)
# print(masks[0])
# print()
# print(labels.shape)
# print(labels[0])
# print()

In [None]:
# titer = iter(test_dataloader)
# tks, labels, masks = next(titer)
# print(tks.shape)
# print(tks[0])
# print()
# print(masks.shape)
# print(masks[0])
# print()
# print(labels.shape)
# print(labels[0])
# print()

# Training

In [None]:
"""Train Model"""

# model path: each run is stored under a directory named after its start time
name = str(int(time.time()))
save_path = os.path.join("drive/MyDrive/AIcup/model", name)
model_name = os.path.join(save_path, f"best_{name}.pt")

# makedirs also creates missing parent directories (os.mkdir would fail when
# the "model" folder does not exist yet) and tolerates an existing directory.
os.makedirs(save_path, exist_ok=True)

best_loss = float('inf')
train_losses = []   # loss snapshots taken every 1000 training steps
test_losses = []    # mean held-out loss per epoch

for epoch in range(Epochs):
    print("[Training] Epoch {}/{}".format(epoch, Epochs - 1))
    print("-" * 10)

    running_loss_train = 0.0
    running_loss_test = 0.0

    # Training phase
    model.train()
    for count, (seqs, labels, masks) in enumerate(tqdm(dataloaders["train"])):
        seqs, labels, masks = seqs.to(device), labels.to(device), masks.to(device)
        optimizer.zero_grad()
        # Passing labels makes the model return the language-modelling loss.
        outputs = model(input_ids=seqs, labels=labels, attention_mask=masks)
        loss = outputs.loss.mean()
        loss.backward()
        optimizer.step()

        running_loss_train += loss.item()
        # Print training losses
        if count % 1000 == 0 and count != 0:
            print(f"Loss in epoch{epoch}-step{count}: {loss.item()}")
            train_losses.append(loss.item())

    # Print epoch train losses (mean over batches)
    epoch_train_loss = running_loss_train / len(dataloaders["train"])
    print(f"[Training] Train Loss: {epoch_train_loss:.4f}")

    # Testing phase: same forward pass, no gradient tracking, no updates
    model.eval()
    with torch.no_grad():
        for seqs, labels, masks in tqdm(dataloaders["test"]):
            seqs, labels, masks = seqs.to(device), labels.to(device), masks.to(device)
            outputs = model(input_ids=seqs, labels=labels, attention_mask=masks)
            loss = outputs.loss.mean()
            running_loss_test += loss.item()

    # print epoch test loss (mean over batches)
    epoch_test_loss = running_loss_test / len(dataloaders["test"])
    print(f"[Training] Test Loss: {epoch_test_loss:.4f}")

    # save the best model: keep the weights with the lowest held-out loss
    test_losses.append(epoch_test_loss)
    if epoch_test_loss < best_loss:
        best_loss = epoch_test_loss
        # NOTE(review): this in-memory copy duplicates the weights and is not
        # read again in the visible cells; the file on disk is what gets reloaded.
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), model_name)
        print(f"[INFO] Updated best model on dev checkpoint: {model_name}")


# Val Dataloader

In [None]:
val_path = os.path.join(data_path, "valid_phase1.tsv")
# keep_default_na=False mirrors the training loaders. Without it the CSV
# reader converts cells such as "NA", "null" or "" into missing values, and a
# None 'content' later breaks prompt construction (str.replace on None).
data_val = load_dataset("csv", data_files=val_path, delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'],
                              keep_default_na=False)
# Materialise the rows as a list of dicts so they can be sliced into batches.
data_val= list(data_val['train'])
print("phase1 validation:", len(data_val))
# Spot-check one row; the hard-coded index assumes more than 1025 rows.
print("validation sample:", data_val[1025])

# Generation

In [None]:
# Run id of the checkpoint to evaluate. This is hard-coded and overrides the
# `name` set by the training cell, so edit it to pick a different run.
name = "1699888370"
model_name = f"drive/MyDrive/AIcup/model/{name}/best_{name}.pt"
answer_path = f"drive/MyDrive/AIcup/model/{name}/answer.txt"

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads in a CPU-only session.
model.load_state_dict(torch.load(model_name, map_location=device))
model = model.to(device)

In [None]:
# Sanity check: find every occurrence of a PHI string inside a sentence and
# report it as (start, end) character offsets.
sentence = "Episode No:  88Y206206L"
phi_value = "88Y206206L"
matches = [found.span() for found in re.finditer(phi_value, sentence)]
print(matches)

In [None]:
def get_anno_format(sentence , infos , boundary, categories=None):
    """Turn generated "CATEGORY: value" lines into positioned annotations.

    Each line of ``infos`` is split at its first ':' into a category and a
    value. Lines without a ':', with an unknown category or with an empty
    value are ignored. Every literal occurrence of the value in ``sentence``
    yields one annotation.

    For DATE/TIME/DURATION/SET the value may be "text=>normalized". The text
    before the first '=>' is what gets located in the sentence, and the part
    after the last '=>' is reported as 'normalize_time'. Without '=>' the
    value itself is used as the normalized form.

    Args:
        sentence: Text in which the PHI values are located.
        infos: Newline-separated "CATEGORY: value" lines.
        boundary: Offset (anything ``int()`` accepts) added to every match
            position.
        categories: Accepted category names. Defaults to the module-level
            ``phi_category`` list.

    Returns:
        List of dicts with keys 'phi', 'st_idx', 'ed_idx', 'entity' and, for
        the time-like categories, 'normalize_time'.
    """
    if categories is None:
        categories = phi_category
    normalize_keys = ('DATE', 'TIME', 'DURATION', 'SET')
    offset = int(boundary)

    # Collect (category, value) pairs in order of appearance. A list rather
    # than a dict is used so several entities of the same category survive;
    # exact repeats are dropped so they do not produce duplicate annotations.
    pairs = []
    seen = set()
    for line in infos.split("\n"):
        # Split at the FIRST ':' only, so values such as "10:30am" stay intact.
        key, colon, value = line.partition(":")
        key, value = key.strip(), value.strip()
        if not colon or key not in categories or not value:
            continue
        if (key, value) not in seen:
            seen.add((key, value))
            pairs.append((key, value))

    anno_list = []
    for phi_key, phi_value in pairs:
        normalize_time = None
        if phi_key in normalize_keys:
            if '=>' in phi_value:
                temp_phi_values = phi_value.split('=>')
                phi_value = temp_phi_values[0].strip()
                normalize_time = temp_phi_values[-1].strip()
            else:
                normalize_time = phi_value
        if not phi_value:
            # Nothing left to locate (e.g. the line was "DATE: =>2063-01-01").
            continue
        # Escape the value so regex metacharacters in it ('.', '(', '+', ...)
        # are matched literally and cannot raise or over-match.
        for match in re.finditer(re.escape(phi_value), sentence):
            item_dict = {
                'phi': phi_key,
                'st_idx': match.start() + offset,
                'ed_idx': match.end() + offset,
                'entity': phi_value,
            }
            if normalize_time is not None:
                item_dict['normalize_time'] = normalize_time
            anno_list.append(item_dict)
    return anno_list

def aicup_predict(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
    """Generate PHI annotations for a batch of rows with a trained model.

    Each row's 'content' is inserted into ``template``, the prompts are
    generated from as one padded batch, and the text after the separator
    token is parsed by ``get_anno_format``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer with sep/eos/pad tokens configured.
        input: List of row dicts with 'fid', 'idx' and 'content' keys.
        template: Prompt template containing the ``__CONTENT__`` placeholder.

    Returns:
        List of tab-separated answer lines
        "fid<TAB>phi<TAB>start<TAB>end<TAB>entity[<TAB>normalized]".
    """
    seeds = [template.replace("__CONTENT__", row['content']) for row in input]
    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(pad)
    eos_idx = tokenizer.convert_tokens_to_ids(eos)

    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)

    # Mixed precision only applies on CUDA; torch.autocast replaces the
    # deprecated torch.cuda.amp.autocast and is a no-op when disabled.
    with torch.autocast(device_type="cuda", enabled=(device.type == "cuda")):
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = pad_idx,
                                        eos_token_id=eos_idx)
    preds = tokenizer.batch_decode(output_tokens)

    outputs = []
    for row, pred in zip(input, preds):
        # Everything after the first separator is the model's answer.
        _, found_sep, generated = pred.partition(sep)
        if not found_sep:
            # Separator missing from the decoded text: nothing to parse.
            continue
        phi_infos = generated.replace(pad, "").replace(eos, "").strip()
        # Only the generated part is checked for the "no PHI" marker, so a
        # sentence that merely contains the word "NULL" is still annotated.
        if "NULL" in phi_infos:
            continue
        for annotation in get_anno_format(row['content'], phi_infos, row['idx']):
            fields = [row["fid"], annotation["phi"], annotation["st_idx"],
                      annotation["ed_idx"], annotation["entity"]]
            if 'normalize_time' in annotation:
                fields.append(annotation["normalize_time"])
            outputs.append("\t".join(str(field) for field in fields))
    return outputs

In [None]:
ValBatchSize = 32
# Run inference over the validation rows in batches and write one answer
# line per predicted annotation to answer_path.
with open(answer_path,'w',encoding='utf8') as f, torch.no_grad():
    for batch_start in tqdm(range(0, len(data_val), ValBatchSize)):
        batch_rows = data_val[batch_start:batch_start+ValBatchSize]
        for answer_line in aicup_predict(model, tokenizer, input=batch_rows):
            f.write(answer_line + '\n')

  0%|          | 0/805 [00:00<?, ?it/s]