## Paralell FineTune PUCPR/GPT2-Bio-PT with Brateca v1.1

Total of Clinical Notes: 2,855,819

based on: https://huggingface.co/docs/transformers/accelerate

In [4]:
!pip install datasets accelerate transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 KB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Coll

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_scheduler
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

In [58]:
dataset = load_dataset('csv', data_files='B1_ClinicalNote.csv', split='train[:10%]+train[-80%:]')
dataset



Dataset({
    features: ['Hospital_ID', 'Patient_ID', 'Admission_ID', 'Note_Date', 'Note_Text', 'Notetaker_Position'],
    num_rows: 899
})

In [32]:
tokenizer = AutoTokenizer.from_pretrained("pucpr/gpt2-bio-pt")

Downloading (…)okenizer_config.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/850k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/508k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [59]:
def tokenize_function(examples):
    return tokenizer(examples["Note_Text"], add_special_tokens=True, truncation=True, max_length=256, padding="max_length")

filter_dataset = dataset.filter(lambda example: not example["Note_Text"] == None)
tokenized_datasets = filter_dataset.map(tokenize_function, batched=True, remove_columns=dataset.features.keys())
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1)

tokenized_datasets.set_format("torch")

small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["test"]

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

tokenized_datasets

Filter:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/895 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 805
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 90
    })
})

In [60]:
model = AutoModelForCausalLM.from_pretrained("pucpr/gpt2-bio-pt")

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [61]:
accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=3e-5)

train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
     train_dataloader, eval_dataloader, model, optimizer
)



In [62]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [63]:
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/303 [00:00<?, ?it/s]

In [64]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch.pop('input_ids').to(device)
        attention_mask = batch.pop('attention_mask').to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 303/303 [02:53<00:00,  1.98it/s]

In [65]:
prompt = 'O paciente chegou no hospital'
input_tokenized = tokenizer(prompt, return_tensors="pt")
output = model.generate(input_tokenized["input_ids"].to(0))
output_text = tokenizer.decode(output[0].tolist())
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


O paciente chegou no hospital, sendo tratado com sucesso.  A partir de um estudo de caso,
