# Environment

In [None]:
# Mount Google Drive inside the Colab runtime; the data and model folders
# configured in the constants cell live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install datasets
!pip install islab-opendeid

In [None]:
import copy
import io
import math
import os
import time
import re
import json
import random
import numpy as np
import pandas as pd
from datasets import load_dataset, Features, Value, concatenate_datasets, Dataset
from sklearn.model_selection import train_test_split
import torch
from torch.optim import lr_scheduler, AdamW
from torch.utils.data import DataLoader
from torch.nn import functional as F
from tqdm import tqdm, trange
from tqdm.notebook import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import get_linear_schedule_with_warmup
from islab.aicup import collate_batch_with_prompt_template, OpenDeidBatchSampler

# Constants

In [None]:
"""Basic Setting"""
# Google Drive folders holding the input TSV files and the saved checkpoints.
data_path = "/content/drive/MyDrive/AIcup/data_hpw"
model_path = "/content/drive/MyDrive/AIcup/model_hpw"

# ---- Data settings ----
# TaskPrefix = "Private information extraction from patients' records: "
# TaskPrefix = ""
IgnoredPadIdx = -100
# Passed to filter_phi_null_ratio as the cap on 'PHI:Null' rows per labelled row.
PHINullRatio = 0.5
PhiCategory = [
    # people
    'PATIENT', 'DOCTOR', 'USERNAME', 'PROFESSION',
    # places and organisations
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION',
    'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
    # ages, dates and contact details
    'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL',
    # identifiers
    'URL', 'IPADDR', 'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE',
    'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM',
]

# ---- Model / optimisation settings ----
BatchSize = 16
# The epoch count is derived from the batch size (12 when BatchSize is 16).
Epochs = int(BatchSize * 0.75)
LearningRate = 3e-5
WeightDecay = 0.01
PretrainedModel = "EleutherAI/pythia-70m"

# Data Reading for Training

In [None]:
"""Data Reading"""
# Locations of the phase-1 and phase-2 training files.
data_path1 = os.path.join(data_path, "train_phase1_128.tsv")
data_path2 = os.path.join(data_path, "train_phase2_128.tsv")

# Both files share one column layout, so the schema and reader options are
# declared once and reused for each load_dataset call.
tsv_columns = ['fid', 'idx', 'content', 'label']
tsv_features = Features({
    'fid': Value('string'),
    'idx': Value('int64'),
    'content': Value('string'),
    'label': Value('string'),
})
tsv_reader_options = dict(
    delimiter='\t',
    features=tsv_features,
    column_names=tsv_columns,
    keep_default_na=False,
)

data1 = load_dataset("csv", data_files=data_path1, **tsv_reader_options)
data2 = load_dataset("csv", data_files=data_path2, **tsv_reader_options)

# Quick sanity report: sizes of both splits plus one sample row.
print("pahse1 data:", len(data1["train"]))
print("pahse2 data:", len(data2["train"]))
print()
print("data sample:", data1["train"][200])
print(type(data1["train"]))

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 data: 50469
pahse2 data: 27820

data sample: {'fid': '104', 'idx': 5803, 'content': 'F.  Sections show essentially normal breast tissue.    ', 'label': 'PHI:Null'}
<class 'datasets.arrow_dataset.Dataset'>


In [None]:
"""Data Concatenation"""
# Merge the phase-1 and phase-2 'train' splits into one dataset.
data = concatenate_datasets([loaded['train'] for loaded in (data1, data2)])
print("data length:", len(data))
print(data[1025])

# # small scale for test
# data = data.select(range(5000))
# print("test data length:", len(data))

data length: 78289
{'fid': '122', 'idx': 3653, 'content': 'C.  The sections of solar damaged skin show no evidence of malignancy.  There is a small focus of chronic inflammation and fat necrosis in the subcutis, possibly relating to previous biopsy of surgery.  A seborrhoeic keratosis is noted on the skin surface as well as an area of actinic keratosis.', 'label': 'PHI:Null'}


In [None]:
# filter null data
def filter_phi_null_ratio(dataset, ratio, seed=None):
    """Down-sample 'PHI:Null' rows relative to the rows carrying a PHI label.

    At most ``len(labelled rows) * ratio`` rows whose ``'label'`` equals
    ``'PHI:Null'`` are kept (chosen at random); every other row is kept. The
    kept rows are returned in shuffled order.

    Args:
        dataset: Sized iterable of rows (mappings with a ``'label'`` key) that
            also provides ``select(indices)``.
        ratio: Maximum number of 'PHI:Null' rows kept per labelled row.
            Must not be negative.
        seed: Optional seed for a private random generator, making the
            sampling and shuffling reproducible. When ``None`` (default) the
            module-level ``random`` state is used, as before.

    Returns:
        The result of ``dataset.select`` on the kept, shuffled indices.

    Raises:
        ValueError: If ``ratio`` is negative.
    """
    if ratio < 0:
        raise ValueError(f"ratio must be non-negative, got {ratio}")

    # The random module and a Random instance expose the same sample/shuffle API.
    rng = random if seed is None else random.Random(seed)

    # Split the indices in a single pass; iterating the dataset is the costly part.
    phi_null_indices, other_indices = [], []
    for i, item in enumerate(dataset):
        if item['label'] == 'PHI:Null':
            phi_null_indices.append(i)
        else:
            other_indices.append(i)
    print(f"Total {len(dataset)}, PHI null {len(phi_null_indices)}, others {len(other_indices)} ")

    # target null count
    target_phi_null_count = int(min(len(phi_null_indices), len(other_indices) * ratio))
    print("target_phi_null_count", target_phi_null_count)

    if len(phi_null_indices) > target_phi_null_count:
        phi_null_indices = rng.sample(phi_null_indices, target_phi_null_count)

    # Recombine both groups and shuffle so null and labelled rows are interleaved.
    final_indices = phi_null_indices + other_indices
    rng.shuffle(final_indices)

    return dataset.select(final_indices)

filtered_data = filter_phi_null_ratio(data, PHINullRatio)

# Summarise the class balance after down-sampling.
phi_null_count_after = sum(1 for item in filtered_data if item['label'] == 'PHI:Null')
other_count_after = len(filtered_data) - phi_null_count_after
phi_null_ratio = phi_null_count_after / other_count_after

print("After filtering:")
for _caption, _value in (
    ("PHI: NULL count:", phi_null_count_after),
    ("Other labels count:", other_count_after),
    ("PHI: NULL to Other labels ratio:", phi_null_ratio),
):
    print(_caption, _value)

# From here on `data` refers to the down-sampled dataset.
data = filtered_data

Total 78289, PHI null 52001, others 26288 
target_phi_null_count 13144
After filtering:
PHI: NULL count: 13144
Other labels count: 26288
PHI: NULL to Other labels ratio: 0.5


In [None]:
# Hold out 10% of the filtered rows; the fixed seed keeps the split repeatable.
split_data = data.train_test_split(test_size=0.10, seed=25)
train_data = split_data["train"]
test_data = split_data["test"]
print("train data size:", len(train_data))
print("test data size:", len(test_data))

train data size: 35488
test data size: 3944


# Model

In [None]:
# Special tokens used by the prompt template: begin/end markers, padding, and
# the separator placed between the record text and the expected answer.
bos, eos, pad = '<|endoftext|>', '<|END|>', '<|pad|>'
sep = '\n\n####\n\n'

# Key order is kept as eos, bos, pad, sep so the tokens are registered in the
# same order as before.
special_tokens_dict = dict(eos_token=eos, bos_token=bos, pad_token=pad, sep_token=sep)

tokenizer = AutoTokenizer.from_pretrained(PretrainedModel, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

<|pad|>: 50278


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Propagate the tokenizer's special-token ids into the model configuration.
special_token_ids = {
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "sep_token_id": tokenizer.sep_token_id,
}
config = AutoConfig.from_pretrained(
    PretrainedModel,
    output_hidden_states=False,
    **special_token_ids,
)

model = AutoModelForCausalLM.from_pretrained(PretrainedModel, revision="step3000", config=config)
# Special tokens were added to the tokenizer, so the embedding table is
# resized to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.to(device)
print(model)

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [None]:
optimizer = AdamW(model.parameters(), lr=LearningRate, weight_decay=WeightDecay)

# steps calculation
# Round up so a final, partially filled batch is counted as an optimiser step;
# flooring would let the linear schedule reach zero before training ends.
# NOTE(review): assumes the batch sampler keeps the last partial batch — confirm.
steps_per_epoch = math.ceil(len(train_data) / BatchSize)
num_training_steps = steps_per_epoch * Epochs

# Log roughly ten times per epoch, rounded down to a multiple of 10. Clamp to
# at least 1 so small datasets do not yield 0 (the training loop uses this
# value as a modulus and a divisor).
CountSteps = max(int(num_training_steps*0.1/Epochs) // 10 * 10, 1)
# Warm up over the first 1% of all optimiser steps.
WarmUpSteps = int(num_training_steps*0.01)
print(f"training steps {num_training_steps}, count each {CountSteps} steps, warm up in {WarmUpSteps} ")

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WarmUpSteps,
    num_training_steps=num_training_steps
)

count each 220 steps


# Train Dataloader

In [None]:
# Materialise both splits as plain Python lists before handing them to the
# sampler and the DataLoader.
train_data = list(train_data)
test_data = list(test_data)


def _collate_with_prompt(batch):
    """Collate one batch through the prompt template with the notebook's tokenizer."""
    return collate_batch_with_prompt_template(batch, tokenizer)


# Training uses BatchSize rows per batch; evaluation runs one row at a time.
dataloaders = {
    "train": DataLoader(
        train_data,
        batch_sampler=OpenDeidBatchSampler(train_data, BatchSize),
        collate_fn=_collate_with_prompt,
        pin_memory=True,
    ),
    "test": DataLoader(
        test_data,
        batch_sampler=OpenDeidBatchSampler(test_data, 1),
        collate_fn=_collate_with_prompt,
        pin_memory=True,
    ),
}
train_dataloader = dataloaders["train"]
test_dataloader = dataloaders["test"]

# Training

In [None]:
def write_losses(loss_path, train_losses, dev_losses):
    """Save both loss histories to ``loss_path`` as JSON indented by 4 spaces.

    Args:
        loss_path: Destination file path; an existing file is overwritten.
        train_losses: Values stored under the ``"train"`` key.
        dev_losses: Values stored under the ``"test"`` key.
    """
    payload = {"train": train_losses, "test": dev_losses}
    with open(loss_path, "w") as sink:
        json.dump(payload, sink, indent=4)

In [None]:
"""Train Model"""
# model path: each run gets its own folder named after the start timestamp
name = str(int(time.time()))
save_path = os.path.join(model_path, name)
model_name = save_path + f"/best_{name}.pt"

# NOTE: despite the .csv extension, write_losses stores this file as JSON.
loss_path = os.path.join(save_path, "loss.csv")

# makedirs also creates a missing parent folder and tolerates re-running the cell.
os.makedirs(save_path, exist_ok=True)

best_loss = float('inf')
train_losses = []
test_losses = []

# Guard against a zero logging interval (used below as modulus and divisor).
log_every = max(CountSteps, 1)

for epoch in range(Epochs):
    print("[Training] Epoch {}/{}".format(epoch, Epochs - 1))
    print("-" * 10)

    running_loss_train = 0.0
    running_loss_test = 0.0
    intermediate_loss = 0.0

    # Training phase
    model.train()
    for count, (seqs, labels, masks) in enumerate(tqdm(dataloaders["train"])):
        seqs, labels, masks = seqs.to(device), labels.to(device), masks.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=seqs, labels=labels, attention_mask=masks)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        # Accumulate the batch loss as-is (no BatchSize scaling) so the windowed
        # average is on the same scale as the epoch train loss and the test loss.
        batch_loss = loss.item()
        running_loss_train += batch_loss
        intermediate_loss += batch_loss

        # Report after every `log_every` completed steps. Counting from 1 keeps
        # each window at exactly `log_every` batches, matching the divisor.
        step = count + 1
        if step % log_every == 0:
            average_loss = intermediate_loss / log_every
            print(f"Loss in epoch{epoch}-step{step}: {average_loss:.4f}")
            train_losses.append(average_loss)
            intermediate_loss = 0.0

    # Print epoch train losses
    epoch_train_loss = running_loss_train / len(dataloaders["train"])
    print(f"[Training] Train Loss: {epoch_train_loss:.4f}")

    # Testing phase
    model.eval()
    with torch.no_grad():
        for seqs_test, labels_test, masks_test in tqdm(dataloaders["test"]):
            seqs_test, labels_test, masks_test = seqs_test.to(device), labels_test.to(device), masks_test.to(device)
            outputs = model(input_ids=seqs_test, labels=labels_test, attention_mask=masks_test)
            loss = outputs.loss
            running_loss_test += loss.item()

    # print epoch test loss
    epoch_test_loss = running_loss_test / len(dataloaders["test"])
    print(f"[Training] Test Loss: {epoch_test_loss:.4f}")

    test_losses.append(epoch_test_loss)

    # Write the loss history after every epoch, not only on improvement, so the
    # file on disk also covers epochs that did not beat the best test loss.
    write_losses(loss_path, train_losses, test_losses)

    # save the best model
    if epoch_test_loss < best_loss:
        best_loss = epoch_test_loss
        # In-memory copy of the best weights, kept alongside the saved checkpoint.
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), model_name)
        print(f"[INFO] Updated best model on dev checkpoint: {model_name}")

[Training] Epoch 0/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch0-step220: 54.1094
Loss in epoch0-step440: 29.9959
Loss in epoch0-step660: 25.1150
Loss in epoch0-step880: 23.4577
Loss in epoch0-step1100: 22.0634
Loss in epoch0-step1320: 21.7139
Loss in epoch0-step1540: 20.3861
Loss in epoch0-step1760: 19.7554
Loss in epoch0-step1980: 19.6118
Loss in epoch0-step2200: 19.0577
[Training] Train Loss: 25.4645


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 1.3470
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 1/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch1-step220: 18.2867
Loss in epoch1-step440: 16.8971
Loss in epoch1-step660: 16.9343
Loss in epoch1-step880: 16.9136
Loss in epoch1-step1100: 15.9735
Loss in epoch1-step1320: 17.2954
Loss in epoch1-step1540: 16.0205
Loss in epoch1-step1760: 15.7192
Loss in epoch1-step1980: 16.1958
Loss in epoch1-step2200: 15.3078
[Training] Train Loss: 16.5318


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 1.1622
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 2/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch2-step220: 14.8842
Loss in epoch2-step440: 14.0554
Loss in epoch2-step660: 13.5974
Loss in epoch2-step880: 13.7868
Loss in epoch2-step1100: 13.6011
Loss in epoch2-step1320: 14.3729
Loss in epoch2-step1540: 13.7353
Loss in epoch2-step1760: 13.5144
Loss in epoch2-step1980: 13.4888
Loss in epoch2-step2200: 13.2786
[Training] Train Loss: 13.8157


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 1.0724
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 3/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch3-step220: 12.8097
Loss in epoch3-step440: 11.9836
Loss in epoch3-step660: 11.8800
Loss in epoch3-step880: 11.8425
Loss in epoch3-step1100: 11.8176
Loss in epoch3-step1320: 12.3051
Loss in epoch3-step1540: 11.7384
Loss in epoch3-step1760: 11.5365
Loss in epoch3-step1980: 11.7076
Loss in epoch3-step2200: 11.7914
[Training] Train Loss: 11.9265


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 1.0168
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 4/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch4-step220: 11.3492
Loss in epoch4-step440: 10.3994
Loss in epoch4-step660: 10.2906
Loss in epoch4-step880: 10.3172
Loss in epoch4-step1100: 10.4733
Loss in epoch4-step1320: 11.0000
Loss in epoch4-step1540: 10.3317
Loss in epoch4-step1760: 10.4207
Loss in epoch4-step1980: 10.2600
Loss in epoch4-step2200: 10.4129
[Training] Train Loss: 10.5141


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9748
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 5/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch5-step220: 9.8071
Loss in epoch5-step440: 9.3419
Loss in epoch5-step660: 9.2259
Loss in epoch5-step880: 9.3194
Loss in epoch5-step1100: 9.4191
Loss in epoch5-step1320: 9.9270
Loss in epoch5-step1540: 9.2320
Loss in epoch5-step1760: 9.1885
Loss in epoch5-step1980: 9.4010
Loss in epoch5-step2200: 9.4361
[Training] Train Loss: 9.4137


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9492
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 6/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch6-step220: 8.9394
Loss in epoch6-step440: 8.4593
Loss in epoch6-step660: 8.3932
Loss in epoch6-step880: 8.4927
Loss in epoch6-step1100: 8.5140
Loss in epoch6-step1320: 8.8384
Loss in epoch6-step1540: 8.4433
Loss in epoch6-step1760: 8.3559
Loss in epoch6-step1980: 8.4213
Loss in epoch6-step2200: 8.4981
[Training] Train Loss: 8.5252


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9297
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 7/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch7-step220: 8.1937
Loss in epoch7-step440: 7.4391
Loss in epoch7-step660: 7.6364
Loss in epoch7-step880: 7.7463
Loss in epoch7-step1100: 8.0507
Loss in epoch7-step1320: 8.1880
Loss in epoch7-step1540: 7.5715
Loss in epoch7-step1760: 7.8468
Loss in epoch7-step1980: 7.6240
Loss in epoch7-step2200: 7.8191
[Training] Train Loss: 7.8031


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9186
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 8/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch8-step220: 7.6764
Loss in epoch8-step440: 6.9973
Loss in epoch8-step660: 7.0972
Loss in epoch8-step880: 7.1354
Loss in epoch8-step1100: 7.3266
Loss in epoch8-step1320: 7.4654
Loss in epoch8-step1540: 7.2290
Loss in epoch8-step1760: 6.9953
Loss in epoch8-step1980: 7.0189
Loss in epoch8-step2200: 7.3321
[Training] Train Loss: 7.2191


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9099
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 9/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch9-step220: 7.0091
Loss in epoch9-step440: 6.5173
Loss in epoch9-step660: 6.6800
Loss in epoch9-step880: 6.5954
Loss in epoch9-step1100: 6.8514
Loss in epoch9-step1320: 7.0788
Loss in epoch9-step1540: 6.7521
Loss in epoch9-step1760: 6.7035
Loss in epoch9-step1980: 6.6986
Loss in epoch9-step2200: 6.7564
[Training] Train Loss: 6.7559


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9045
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 10/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch10-step220: 6.6474
Loss in epoch10-step440: 6.3232
Loss in epoch10-step660: 6.3016
Loss in epoch10-step880: 6.2997
Loss in epoch10-step1100: 6.4520
Loss in epoch10-step1320: 6.5725
Loss in epoch10-step1540: 6.3500
Loss in epoch10-step1760: 6.2892
Loss in epoch10-step1980: 6.3009
Loss in epoch10-step2200: 6.5166
[Training] Train Loss: 6.3985


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9012
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt
[Training] Epoch 11/11
----------


  0%|          | 0/2218 [00:00<?, ?it/s]

Loss in epoch11-step220: 6.6162
Loss in epoch11-step440: 5.8818
Loss in epoch11-step660: 5.9633
Loss in epoch11-step880: 5.9504
Loss in epoch11-step1100: 6.4069
Loss in epoch11-step1320: 6.2845
Loss in epoch11-step1540: 6.1314
Loss in epoch11-step1760: 5.9877
Loss in epoch11-step1980: 6.0035
Loss in epoch11-step2200: 6.2756
[Training] Train Loss: 6.1405


  0%|          | 0/3944 [00:00<?, ?it/s]

[Training] Test Loss: 0.9006
[INFO] Updated best model on dev checkpoint: /content/drive/MyDrive/AIcup/model_hpw/1700311585/best_1700311585.pt


# Validation Data

In [None]:
val_path = os.path.join(data_path, "valid_phase1_128_repeat.tsv")
# keep_default_na=False matches the training loaders: without it, contents such
# as "NA" or an empty field are read as missing values (None), which later
# breaks prompt construction because the template expects a string.
data_val = load_dataset("csv", data_files=val_path, delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'],
                              keep_default_na=False)
# Materialise as a list of row dicts so batches can be taken by slicing.
data_val= list(data_val['train'])
# data_val = data_val[:1000]
print("pahse1 validation:", len(data_val))
print("validation sample:", data_val[1])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

pahse1 validation: 30233
validation sample: {'fid': '1001', 'idx': 24, 'content': '8892062.BPL  Vatterott, Jerrie CLARENCE  Lab No:  88Y20620,88Y20620 Exeter DECEPTION BAY  Northern Territory  6845 ', 'label': None}


# Generation

In [None]:
# Identifier (start timestamp) of the training run whose checkpoint is reloaded.
name = "1700311585"
# Named run_stamp rather than `time`: rebinding `time` would shadow the imported
# module and break `time.time()` if the training cell were run again.
run_stamp = name[:10]
# Build the paths from model_path so they do not depend on the working directory.
run_dir = os.path.join(model_path, name)
model_name = os.path.join(run_dir, f"best_{run_stamp}.pt")
answer_path = os.path.join(run_dir, f"answer_{run_stamp}.txt")
prediction_path = os.path.join(run_dir, f"prediction_{run_stamp}.txt")
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(model_name, map_location=device))
model = model.to(device)

In [None]:
def aicup_predict(model, tokenizer, input, template = "<|endoftext|> __CONTENT__\n\n####\n\n",
                  max_new_tokens=400):
    """Generate PHI annotations for a batch of records with a trained model.

    Each record's ``'content'`` is inserted into ``template``, the prompts are
    tokenized as one padded batch, and the model generates a continuation. The
    text after the tokenizer's separator token is taken as the annotation.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: Tokenizer providing ``sep_token``, ``eos_token``,
            ``pad_token``, ``convert_tokens_to_ids`` and ``batch_decode``.
        input: Sequence of rows with ``'fid'``, ``'idx'`` and ``'content'``
            keys. (The name shadows the builtin; kept for compatibility.)
        template: Prompt template; ``__CONTENT__`` is replaced by the content.
        max_new_tokens: Upper bound on generated tokens per record.

    Returns:
        A list of ``"fid\\tidx\\tcontent\\tannotation"`` strings, one per record
        with a non-empty, non-null annotation. Records are skipped when the
        output lacks the separator, the annotation is empty, or it starts with
        ``PHI:Null`` (case- and space-insensitive).
    """
    if not input:
        return []

    sep = tokenizer.sep_token
    eos = tokenizer.eos_token
    pad = tokenizer.pad_token
    pad_idx = tokenizer.convert_tokens_to_ids(pad)
    eos_idx = tokenizer.convert_tokens_to_ids(eos)

    # A missing content (None) is treated as an empty string in the prompt.
    seeds = [template.replace("__CONTENT__", data['content'] or "") for data in input]

    model.eval()
    device = model.device
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    predictions = []

    # Mixed precision is only enabled when the model actually lives on a GPU.
    with torch.autocast(device_type="cuda", enabled=(device.type == "cuda")):
        output_tokens = model.generate(**texts, max_new_tokens=max_new_tokens,
                                       pad_token_id=pad_idx, eos_token_id=eos_idx)
        preds = tokenizer.batch_decode(output_tokens)

    for row, pred in zip(input, preds):
        # Only the text after the first separator is model output; the part
        # before it is the prompt and must not influence the null check.
        _, found, generated = pred.partition(sep)
        if not found:
            continue
        phi_infos = generated.replace(pad, "").replace(eos, "").strip()
        # The training data marks PHI-free rows as 'PHI:Null'; compare
        # case-insensitively and ignore spaces so variants like 'PHI: NULL' match.
        if not phi_infos or phi_infos.replace(" ", "").upper().startswith("PHI:NULL"):
            continue
        predictions.append(f'{row["fid"]}\t{row["idx"]}\t{row["content"]}\t{phi_infos}')
    return predictions

In [None]:
ValBatchSize = 32
# Generate predictions batch by batch and append them to the prediction file,
# one tab-separated line per prediction. Gradients are never needed here, so a
# single no_grad context wraps the whole loop.
with open(prediction_path, 'w', encoding='utf8') as f_predictions, torch.no_grad():
    for i in tqdm(range(0, len(data_val), ValBatchSize)):
        seeds = data_val[i:i+ValBatchSize]
        predictions = aicup_predict(model, tokenizer, input=seeds)
        f_predictions.writelines(p + '\n' for p in predictions)

  0%|          | 0/945 [00:00<?, ?it/s]