# Environment

In [1]:
# Mount Google Drive at /content/drive; the data and model directories used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install islab-opendeid

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting islab-opendeid
  Downloading islab_opendeid-0.0.1.1-py3-none-any.whl (3.0 kB)


In [3]:
import copy
import io
import math
import os
import time
import re
import json
import random
import numpy as np
import pandas as pd
from datasets import load_dataset, Features, Value, concatenate_datasets, Dataset
from sklearn.model_selection import train_test_split
import torch
from torch.optim import lr_scheduler, AdamW
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm, trange
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import get_linear_schedule_with_warmup
from islab.aicup import OpenDeidBatchSampler

# Constant

In [4]:
"""Basic Setting"""
# Google Drive directories: input TSV files, and saved checkpoints / loss logs / predictions.
data_path = "/content/drive/MyDrive/AIcup/data_hpw"
model_path = "/content/drive/MyDrive/AIcup/model_hpw"

"""Data Setting"""
# NOTE(review): not referenced by the visible cells; the collate function carries its own -100 default.
IgnoredPadIdx = -100
# Upper bound on (PHI:Null rows) / (labelled rows) kept when data_reading filters a split.
PHINullRatio = 0.3
# NOTE(review): the filtering code compares against the literal 'PHI:Null' rather than this constant.
PHINull = "PHI:Null"
# NOTE(review): not referenced by the visible cells; presumably the full list of PHI label types — confirm.
PhiCategory = ['PATIENT', 'DOCTOR', 'USERNAME', 'PROFESSION',
                'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION',
                'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
                'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL',
                'URL', 'IPADDR', 'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE',
                'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM']

"""Model Constant Setting"""
BatchSize = 16
# The epoch count is derived from the batch size (16 -> 12), so changing BatchSize also changes Epochs.
Epochs = int(BatchSize*0.75)
# Maximum token length: used for tokenizer truncation and as max_new_tokens during generation.
MaxLen = 196
LearningRate = 5e-5
WeightDecay = 0.02
# Hugging Face model id; the tokenizer and weights are loaded from its "step3000" revision below.
PretrainedModel = "EleutherAI/pythia-410m"

# DataReading

In [5]:
"""Filter PHINull"""
def filter_phi_null_ratio(dataset, ratio, seed=None):
    """Down-sample 'PHI:Null' rows so they number at most ``ratio`` times the labelled rows.

    Args:
        dataset: Sized iterable of mappings with a 'label' key that also
            provides ``select(indices)`` (e.g. a ``datasets.Dataset``).
        ratio: Maximum allowed (PHI:Null rows) / (other rows).
        seed: Optional seed. When given, sampling and shuffling use a private
            ``random.Random(seed)`` so the selection is reproducible. When
            ``None`` (the default) the global ``random`` state is used.

    Returns:
        ``dataset.select(...)`` over the kept row indices, in shuffled order.
    """
    rng = random if seed is None else random.Random(seed)

    # Single pass over the rows: split indices by whether the label is the null marker.
    phi_null_indices, other_indices = [], []
    for i, item in enumerate(dataset):
        if item['label'] == 'PHI:Null':
            phi_null_indices.append(i)
        else:
            other_indices.append(i)
    print(f"Total {len(dataset)}, PHI null {len(phi_null_indices)}, others {len(other_indices)} ")

    # Keep at most ratio * (labelled rows) null rows, and never more than exist.
    target_phi_null_count = int(min(len(phi_null_indices), len(other_indices) * ratio))
    print("target_phi_null_count", target_phi_null_count)

    if len(phi_null_indices) > target_phi_null_count:
        phi_null_indices = rng.sample(phi_null_indices, target_phi_null_count)

    # Recombine and shuffle so null and labelled rows are interleaved.
    final_indices = phi_null_indices + other_indices
    rng.shuffle(final_indices)

    return dataset.select(final_indices)

In [6]:
def _load_phase_tsv(path):
  """Load one headerless, tab-separated annotation file as a ``datasets.Dataset``."""
  schema = Features({
      'fid': Value('string'), 'idx': Value('int64'),
      'content': Value('string'), 'label': Value('string')})
  # keep_default_na=False keeps literal text such as "NA" as a string instead of a missing value.
  return load_dataset("csv", data_files=path, delimiter='\t', features=schema,
                      column_names=['fid', 'idx', 'content', 'label'],
                      keep_default_na=False)["train"]


def data_reading(data_path1, data_path2):
  """Read the phase-1 and phase-2 TSV files, merge them, and down-sample 'PHI:Null' rows.

  Args:
    data_path1: Path to the phase-1 TSV (columns: fid, idx, content, label; no header).
    data_path2: Path to the phase-2 TSV with the same layout.

  Returns:
    The concatenated dataset after ``filter_phi_null_ratio(..., PHINullRatio)``.
    Row counts and the resulting null-to-labelled ratio are printed along the way.
  """
  data1 = _load_phase_tsv(data_path1)
  data2 = _load_phase_tsv(data_path2)

  print("pahse1 data:", len(data1))
  print("pahse2 data:", len(data2))
  # Show row 200 as a sample, falling back to the last row for shorter files.
  if len(data1) > 0:
    print("data sample:", data1[min(200, len(data1) - 1)])

  # concatenate data
  data = concatenate_datasets([data1, data2])
  print("data length after cancatenation:", len(data))

  # filter data
  filtered_data = filter_phi_null_ratio(data, PHINullRatio)

  # print filter information
  phi_null_count_after = sum(1 for item in filtered_data if item['label'] == 'PHI:Null')
  other_count_after = len(filtered_data) - phi_null_count_after
  # A split without any labelled row would otherwise raise ZeroDivisionError here.
  phi_null_ratio = phi_null_count_after / other_count_after if other_count_after else float('nan')
  print("After filtering:")
  print("Length after filter:", len(filtered_data))
  print("PHI: NULL count:", phi_null_count_after)
  print("Other labels count:", other_count_after)
  print("PHI: NULL to Other labels ratio:", phi_null_ratio)
  return filtered_data

In [7]:
def _in_data_dir(filename):
  """Return the full path of ``filename`` inside ``data_path``."""
  return os.path.join(data_path, filename)


# Unmodified ("original") documents: train/test files for phase 1 and phase 2
data_org_train_path1 = _in_data_dir("train_phase1_v8_original_train.tsv")
data_org_train_path2 = _in_data_dir("train_phase2_v8_original_train.tsv")
data_org_test_path1 = _in_data_dir("train_phase1_v8_original_test.tsv")
data_org_test_path2 = _in_data_dir("train_phase2_v8_original_test.tsv")

# "Sliced" variant of the same files
data_slc_train_path1 = _in_data_dir("train_phase1_v8_sliced_train.tsv")
data_slc_train_path2 = _in_data_dir("train_phase2_v8_sliced_train.tsv")
data_slc_test_path1 = _in_data_dir("train_phase1_v8_sliced_test.tsv")
data_slc_test_path2 = _in_data_dir("train_phase2_v8_sliced_test.tsv")

# "Spliced" variant of the same files
data_spl_train_path1 = _in_data_dir("train_phase1_v8_spliced_train.tsv")
data_spl_train_path2 = _in_data_dir("train_phase2_v8_spliced_train.tsv")
data_spl_test_path1 = _in_data_dir("train_phase1_v8_spliced_test.tsv")
data_spl_test_path2 = _in_data_dir("train_phase2_v8_spliced_test.tsv")

# Each call merges phase 1 + phase 2 for one variant/split and down-samples the null rows.
data_org_train = data_reading(data_org_train_path1, data_org_train_path2)
data_org_test = data_reading(data_org_test_path1, data_org_test_path2)

data_slc_train = data_reading(data_slc_train_path1, data_slc_train_path2)
data_slc_test = data_reading(data_slc_test_path1, data_slc_test_path2)

data_spl_train = data_reading(data_spl_train_path1, data_spl_train_path2)
data_spl_test = data_reading(data_spl_test_path1, data_spl_test_path2)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 48150
pahse2 data: 28840
data sample: {'fid': '106', 'idx': 228, 'content': 'DR CHANG CORIE ROSKE', 'label': 'DOCTOR:CHANG CORIE ROSKE'}
data length after cancatenation: 76990
Total 76990, PHI null 52871, others 24119 
target_phi_null_count 7235
After filtering:
Length after filter: 31354
PHI: NULL count: 7235
Other labels count: 24119
PHI: NULL to Other labels ratio: 0.29997097723786226


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 5458
pahse2 data: 2871
data sample: {'fid': '585', 'idx': 549, 'content': 'MACROSCOPIC:', 'label': 'PHI:Null'}
data length after cancatenation: 8329
Total 8329, PHI null 5816, others 2513 
target_phi_null_count 753
After filtering:
Length after filter: 3266
PHI: NULL count: 753
Other labels count: 2513
PHI: NULL to Other labels ratio: 0.299641862315957


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 11913
pahse2 data: 6105
data sample: {'fid': '124', 'idx': 685, 'content': 'B.  Specimen labelled "Right breast lesion; long lateral, medium medial, short superior" consists of a piece of orientated breast tissue with three sutures in situ and a hookwire is seen to insert from the medial/superficial aspect.  The specimen measures ', 'label': 'PHI:Null'}
data length after cancatenation: 18018
Total 18018, PHI null 16121, others 1897 
target_phi_null_count 569
After filtering:
Length after filter: 2466
PHI: NULL count: 569
Other labels count: 1897
PHI: NULL to Other labels ratio: 0.2999472851871376


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 1188
pahse2 data: 633
data sample: {'fid': '170', 'idx': 1744, 'content': 'enchyma appears unremarkable.', 'label': 'PHI:Null'}
data length after cancatenation: 1821
Total 1821, PHI null 1726, others 95 
target_phi_null_count 28
After filtering:
Length after filter: 123
PHI: NULL count: 28
Other labels count: 95
PHI: NULL to Other labels ratio: 0.29473684210526313


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 33637
pahse2 data: 18878
data sample: {'fid': '109', 'idx': 165, 'content': 'Sex:  F Collected: 4/2/2063 at : ', 'label': 'DATE:4/2/2063'}
data length after cancatenation: 52515
Total 52515, PHI null 24595, others 27920 
target_phi_null_count 8376
After filtering:
Length after filter: 36296
PHI: NULL count: 8376
Other labels count: 27920
PHI: NULL to Other labels ratio: 0.3


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 3759
pahse2 data: 1576
data sample: {'fid': 'file4000', 'idx': 4449, 'content': '-NEGATIVE FOR MALIGNANCY. B.  LEFT GROIN NODES &amp;#8211; -NEGATIVE FOR MALIGNANCY. C.  VULVA &amp;#8211; ', 'label': 'PHI:Null'}
data length after cancatenation: 5335
Total 5335, PHI null 2802, others 2533 
target_phi_null_count 759
After filtering:
Length after filter: 3292
PHI: NULL count: 759
Other labels count: 2533
PHI: NULL to Other labels ratio: 0.2996446900908014


In [8]:
# Merge the three training variants and shuffle them with a fixed seed.
train_data = concatenate_datasets([data_org_train, data_slc_train, data_spl_train])
print("total train data:", len(train_data))
train_data = train_data.shuffle(seed=42)
print("sampled train data:", len(train_data))
# Bound the preview indices so a small dataset does not raise IndexError.
for i in range(100, min(103, len(train_data))):
  print("train data samples:", train_data[i])

# Same for the held-out split, then cap it at 3000 rows to keep per-epoch evaluation short.
test_data = concatenate_datasets([data_org_test, data_slc_test, data_spl_test])
test_data = test_data.shuffle(seed=42)
print("total test data:", len(test_data))
# min() keeps select() in range when fewer than 3000 rows are available.
test_data = test_data.select(range(min(3000, len(test_data))))
print("sampled test data:", len(test_data))
for i in range(100, min(103, len(test_data))):
  print("test data samples:", test_data[i])

total train data: 70116
sampled train data: 70116
train data samples: {'fid': 'file1709', 'idx': 157, 'content': '8890759.PSH', 'label': 'MEDICALRECORD:8890759.PSH'}
train data samples: {'fid': 'file12595', 'idx': 19, 'content': 'MRN no: 1499379 Site_name: WILCANNIA HEALTH SERVICE Facility_id: 016 Specimen_type: Fresh Tissue ', 'label': 'MEDICALRECORD:1499379++HOSPITAL:WILCANNIA HEALTH SERVICE'}
train data samples: {'fid': '1275', 'idx': 1879, 'content': 'Blocks: 1 - common hepatic duct (CHD) surgical margin; 2 - next to the CHD surgical margin; 3 - CHD next section; 4 - most distal section of cystic duct; 5 & 6 - common bile duct (CBD) serial sections from distal to proximal; 7 - proximal duodenum; 8 - distal duodenum; 9 - shave of pancreatic neck surgical margin; 10 to 15 - serial sections of the pancreas from below the ampulla and proximal  [see attached diagram); 16 to 19 - transverse sections of the neck (false margin),; 20 - gallbladder neck TS; 21 - gallbladder body TS; 22 - gal

# Model

In [9]:
# Special-token strings; the same literals appear in the prompt templates used for training and generation.
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
# Separator placed between the document content and the label part of a prompt.
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

# Load the tokenizer from the same "step3000" revision that the model weights are loaded from.
tokenizer = AutoTokenizer.from_pretrained(PretrainedModel, revision="step3000")
# Pad on the left so that, in a padded batch, every prompt ends at the last position.
tokenizer.padding_side = 'left'
# Registers the tokens above; the model's embedding matrix is resized to len(tokenizer) in the next cell.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [10]:
"""model config"""
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the pretrained config and override the special-token ids with the tokenizer's values.
# NOTE(review): attention_dropout=0.7 is a very high dropout rate — confirm it is intentional.
config = AutoConfig.from_pretrained(PretrainedModel,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False,
                                    attention_dropout=0.7)

model = AutoModelForCausalLM.from_pretrained(PretrainedModel, revision="step3000", config=config)
# Grow the embedding matrix to cover the special tokens added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))
print(model)

# Freeze transformer layers 2..17 (slice end is exclusive); layers 0-1 and 18 onward stay trainable.
for layer in model.gpt_neox.layers[2:18]:
    for param in layer.parameters():
        param.requires_grad = False

# List every parameter that will not receive gradient updates.
for name, param in model.named_parameters():
    if not param.requires_grad:
        print(f"Layer frozen: {name}")

# As the cell's last expression this also displays the model repr a second time.
model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/911M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.7, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.7, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

# Optimizer

In [11]:
# AdamW over all model parameters; frozen parameters never receive gradients.
optimizer = AdamW(model.parameters(), lr=LearningRate, weight_decay=WeightDecay)

# Total optimizer steps the linear schedule is laid out for.
# NOTE(review): the floor division assumes the sampler drops the final partial batch. If it
# keeps it, each epoch runs one extra step and the schedule reaches zero slightly early — confirm.
num_training_steps = len(train_data) // BatchSize * Epochs
# Interval (in batches) between running-loss reports in the training loop. Clamped to at
# least 1 because the formula yields 0 for small datasets, which would make the loop's
# `count % CountSteps` raise ZeroDivisionError.
CountSteps = max(1, int(num_training_steps*0.1/Epochs) // 10 * 10 // 4)
# Warm up over the first 1% of the steps.
WarmUpSteps = int(num_training_steps*0.01)
print(f"training steps {num_training_steps}, count each {CountSteps} steps, warm up in {WarmUpSteps} ")

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WarmUpSteps,
    num_training_steps=num_training_steps
)

training steps 52584, count each 217 steps, warm up in 525 


# Train Dataloader

In [12]:
def collate_batch_with_prompt_template(batch, tokenizer, template = "<|endoftext|> __CONTENT__\n\n####\n\n__LABEL__ <|END|>", IGNORED_PAD_IDX = -100):
    """Render each sample through ``template`` and tokenize the batch for causal-LM training.

    ``__LABEL__`` is substituted first, then ``__CONTENT__``, using the sample's
    'label' and 'content' fields. Sequences are padded to the longest one in the
    batch and truncated to ``MaxLen`` tokens.

    Returns:
        Tuple ``(input_ids, labels, attention_mask)`` of tensors, where ``labels``
        equals ``input_ids`` with every pad-token position set to ``IGNORED_PAD_IDX``.
    """
    prompts = []
    for sample in list(batch):
        with_label = template.replace("__LABEL__", sample['label'])
        prompts.append(with_label.replace("__CONTENT__", sample['content']))

    encoding = tokenizer(prompts, padding=True, truncation=True, max_length=MaxLen)

    input_ids = torch.tensor(encoding['input_ids'])
    mask = torch.tensor(encoding['attention_mask'])

    # Work on a copy so replacing the pad ids in the targets leaves the model inputs untouched.
    targets = input_ids.clone()
    targets[targets == tokenizer.pad_token_id] = IGNORED_PAD_IDX

    return input_ids, targets, mask

In [13]:
# Materialise both datasets as plain lists of row dicts before handing them to the samplers.
train_data = list(train_data)
test_data = list(test_data)


def _collate(batch):
    """Collate a batch with the prompt template, bound to the notebook's tokenizer."""
    return collate_batch_with_prompt_template(batch, tokenizer)


def _make_loader(rows, batch_size):
    """Build a DataLoader whose batches are chosen by an OpenDeidBatchSampler over ``rows``."""
    sampler = OpenDeidBatchSampler(rows, batch_size)
    return DataLoader(rows, batch_sampler=sampler, collate_fn=_collate, pin_memory=True)


train_dataloader = _make_loader(train_data, BatchSize)
# Evaluation runs one sample per batch.
test_dataloader = _make_loader(test_data, 1)

dataloaders = {"train": train_dataloader, "test": test_dataloader}

# Training

In [14]:
def write_losses(loss_path, train_losses, dev_losses):
  """Write both loss histories to ``loss_path`` as JSON with keys "train" and "test"."""
  payload = {"train": train_losses, "test": dev_losses}
  with open(loss_path, "w") as handle:
    json.dump(payload, handle, indent=4)

In [None]:
"""Train Model"""
# Each run gets its own directory under model_path, named after the start timestamp.
name = str(int(time.time()))
save_path = os.path.join(model_path, name)
model_name = save_path + f"/best_{name}.pt"

# NOTE: despite the .csv extension, write_losses stores JSON in this file.
loss_path = os.path.join(save_path, "loss.csv")

# makedirs also creates a missing model_path and tolerates an already existing directory.
os.makedirs(save_path, exist_ok=True)

best_loss = float('inf')
train_losses = []
test_losses = []
# Reporting interval in batches; never 0, so the modulo and division below are always defined.
log_every = max(1, CountSteps)

for epoch in range(Epochs):
    print("[Training] Epoch {}/{}".format(epoch, Epochs - 1))
    print("-" * 10)

    running_loss_train = 0.0
    running_loss_test = 0.0
    intermediate_loss = 0.0

    # Training phase
    model.train()
    for count, (seqs, labels, masks) in enumerate(tqdm(dataloaders["train"])):
        seqs, labels, masks = seqs.to(device), labels.to(device), masks.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=seqs, labels=labels, attention_mask=masks)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        running_loss_train += step_loss
        intermediate_loss += step_loss
        # `count` is zero-based, so test count + 1: every report then averages exactly
        # `log_every` batches and the divisor matches the number of losses summed.
        if (count + 1) % log_every == 0:
            average_loss = intermediate_loss / log_every
            print(f"Loss in epoch{epoch}-step{count + 1}: {average_loss:.4f}")
            train_losses.append(average_loss)
            intermediate_loss = 0

    # Print epoch train losses
    epoch_train_loss = running_loss_train / len(dataloaders["train"])
    print(f"[Training] Train Loss: {epoch_train_loss:.4f}")

    # Testing phase: same loss computation, without gradients or dropout.
    model.eval()
    with torch.no_grad():
        for seqs_test, labels_test, masks_test in tqdm(dataloaders["test"]):
            seqs_test, labels_test, masks_test = seqs_test.to(device), labels_test.to(device), masks_test.to(device)
            outputs = model(input_ids=seqs_test, labels=labels_test, attention_mask=masks_test)
            loss = outputs.loss
            running_loss_test += loss.item()

    # print epoch test loss
    epoch_test_loss = running_loss_test / len(dataloaders["test"])
    print(f"[Training] Test Loss: {epoch_test_loss:.4f}")

    test_losses.append(epoch_test_loss)

    # Persist the loss history after every epoch, so the file stays complete even when
    # the test loss stops improving or the run is interrupted.
    write_losses(loss_path, train_losses, test_losses)

    # save the best model
    if epoch_test_loss < best_loss:
        best_loss = epoch_test_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), model_name)
        print(f"[INFO] Updated best model on dev checkpoint: {model_name}")

[Training] Epoch 0/5
----------


  0%|          | 0/8765 [00:00<?, ?it/s]

Loss in epoch0-step217: 5.3912
Loss in epoch0-step434: 3.3467
Loss in epoch0-step651: 2.5899
Loss in epoch0-step868: 2.2531
Loss in epoch0-step1085: 2.0542
Loss in epoch0-step1302: 1.9396
Loss in epoch0-step1519: 1.8289
Loss in epoch0-step1736: 1.7076
Loss in epoch0-step1953: 1.6696
Loss in epoch0-step2170: 1.6238
Loss in epoch0-step2387: 1.5701
Loss in epoch0-step2604: 1.5607
Loss in epoch0-step2821: 1.4839
Loss in epoch0-step3038: 1.4585
Loss in epoch0-step3255: 1.4141
Loss in epoch0-step3472: 1.3929
Loss in epoch0-step3689: 1.3523
Loss in epoch0-step3906: 1.3661
Loss in epoch0-step4123: 1.3450
Loss in epoch0-step4340: 1.3118
Loss in epoch0-step4557: 1.2588
Loss in epoch0-step4774: 1.2871
Loss in epoch0-step4991: 1.2982
Loss in epoch0-step5208: 1.2435
Loss in epoch0-step5425: 1.2343
Loss in epoch0-step5642: 1.2064
Loss in epoch0-step5859: 1.2422
Loss in epoch0-step6076: 1.2407
Loss in epoch0-step6293: 1.2204
Loss in epoch0-step6510: 1.1843
Loss in epoch0-step6727: 1.1716
Loss in epoc

  0%|          | 0/3000 [00:00<?, ?it/s]

[Training] Test Loss: 1.6118
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1701420946/best_1701420946.pt
[Training] Epoch 1/5
----------


  0%|          | 0/8765 [00:00<?, ?it/s]

# DataloaderVal

In [None]:
def reading_validation_data(val_path):
  """Load one headerless, tab-separated validation file as a ``datasets.Dataset``.

  Args:
    val_path: Path to a TSV file whose columns are read as fid, idx, content, label.

  Returns:
    The loaded dataset ("train" split of the CSV loader). Its length and one
    sample row are printed.
  """
  # keep_default_na=False matches data_reading: text such as "NA" stays a string instead of
  # becoming None, which would break the str.replace() used to build generation prompts.
  val_data = load_dataset("csv", data_files=val_path, delimiter='\t',
                        features = Features({
                            'fid': Value('string'), 'idx': Value('int64'),
                            'content': Value('string'), 'label': Value('string')}),
                        column_names=['fid', 'idx', 'content', 'label'],
                        keep_default_na=False)["train"]

  print(len(val_data))
  # Show row 20 as a sample, falling back to the last row for shorter files.
  if len(val_data) > 0:
    print("data sample:", val_data[min(20, len(val_data) - 1)])
  return val_data

In [None]:
"""Get validation datasets"""
# One validation file per data variant (original / sliced / spliced).
val_path_org, val_path_slc, val_path_spl = [
    os.path.join(data_path, filename)
    for filename in ("valid_phase1_v8_original.tsv",
                     "valid_phase1_v8_sliced.tsv",
                     "valid_phase1_v8_spliced.tsv")
]

# Read them in the same order: original, sliced, spliced.
val_data_org, val_data_slc, val_data_spl = [
    reading_validation_data(path)
    for path in (val_path_org, val_path_slc, val_path_spl)
]

# Merge the variants; data_val is the plain-list copy that the generation loop slices into batches.
val_data = concatenate_datasets([val_data_org, val_data_slc, val_data_spl])
data_val = list(val_data)
print("validation length:", len(val_data))
for i in (100, 101, 102):
  print(val_data[i])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

25739
data sample: {'fid': '1001', 'idx': 554, 'content': 'MACROSCOPIC: ', 'label': None}


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

6155
data sample: {'fid': '1003', 'idx': 2094, 'content': 'There are no mucinous material present, however there is a white nodule 10mm in diameter.  Also in the bucket are numerous pieces of irregularly shaped fatty tissue with an aggregate of 100x100x20-30mm.  Mucin is not present.', 'label': None}


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

21079
data sample: {'fid': '1001', 'idx': 537, 'content': 'PR, ER and HER2. MACROSCOPIC: ', 'label': None}
validation length: 52973
{'fid': '1003', 'idx': 738, 'content': "A.  'OMENTUM, RIGHT COLON, DIAPHRAGM, PELVIC STRIP, UMBILICUS + PERITONEAL DISEASE'.  Multiple pieces of tissue including omentum, a piece of bowel within anastomosis, a further piece of bowel, strips or peritoneum or diaphragm.  Multiple pieces of fatty tissue and the umbilicus with underlying fatty tissue.  The peritoneum is approximately 300x190x20-30mm.  There is mucinous material on the surface forming numerous patches.  There are no solid nodular areas.  There are two pieces of bowel, the first is 50mm in length and 50mm in circumference. The serosal surface is unremarkable. The wall contains an area of haemorrhage 5mm in thickness and the mucosa is unremarkable. The second piece of bowel is V shaped with a central anastomosis. The two pieces of bowel are 60 and 65mm in length respectively and has a circumferen

# Generation

In [None]:
# Run directory (training start timestamp) whose best checkpoint should be evaluated.
name = "1701181213"
# Deliberately not named `time`: rebinding that name would shadow the imported `time`
# module and break `time.time()` if the training cell is run again afterwards.
run_stamp = name[:10]
# Build the paths from model_path (where training saves) rather than relative to the working directory.
run_dir = os.path.join(model_path, name)
model_name = os.path.join(run_dir, f"best_{run_stamp}.pt")
answer_path = os.path.join(run_dir, f"answer_{run_stamp}.txt")
prediction_path = os.path.join(run_dir, f"prediction_{run_stamp}.txt")
# map_location lets a checkpoint saved on GPU load on whichever device is active now.
model.load_state_dict(torch.load(model_name, map_location=device))
model = model.to(device)

In [None]:
def aicup_predict(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
    """Generate PHI annotations for a batch of rows with a trained causal LM.

    Args:
        model: Causal LM exposing ``generate``, ``eval`` and ``device``.
        tokenizer: Tokenizer with ``sep_token``, ``eos_token`` and ``pad_token`` set.
        input: Sequence of row dicts with 'fid', 'idx' and 'content' keys.
        template: Prompt template; ``__CONTENT__`` is replaced by the row's content.

    Returns:
        List of ``"fid\\tidx\\tcontent\\tphi"`` strings, one per row whose generated
        text contains the separator and is not the null label. Other rows are omitted.
    """
    # A missing content value (None) is treated as empty text so str.replace cannot fail.
    seeds = [template.replace("__CONTENT__", data['content'] or "") for data in input]
    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True, truncation=True, max_length=MaxLen).to(device)
    predictions = []

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=MaxLen, pad_token_id = pad_idx,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
    preds = tokenizer.batch_decode(output_tokens)

    for idx, pred in enumerate(preds):
        if sep not in pred:
            continue
        # Keep only the generated part after the separator, stripped of pad/eos markers.
        phi_infos = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
        # Skip rows predicted as "no PHI". The check is case- and space-insensitive because
        # the training label is spelled 'PHI:Null', and it looks only at the generated part
        # so that a document containing the word "NULL" is not dropped by mistake.
        if "PHI:NULL" in phi_infos.upper().replace(" ", ""):
            continue
        predictions.append(f'{input[idx]["fid"]}\t{input[idx]["idx"]}\t{input[idx]["content"]}\t{phi_infos}')
    return predictions

In [None]:
ValBatchSize = 16
# Stream predictions to disk batch by batch; gradients are not needed for generation.
with open(prediction_path, 'w', encoding='utf8') as f_predictions, torch.no_grad():
    for i in tqdm(range(0, len(data_val), ValBatchSize)):
        seeds = data_val[i:i+ValBatchSize]
        predictions = aicup_predict(model, tokenizer, input=seeds)
        f_predictions.writelines(p + '\n' for p in predictions)

  0%|          | 0/3311 [00:00<?, ?it/s]