In [None]:
# Install packages
# Put all 'pip install' commands here..

!pip install torch transformers pandas

In [None]:
%%script false --no-raise-error

import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Tokenizer and question-answering model are loaded from the same checkpoint.
_qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(_qa_checkpoint)
model = BertForQuestionAnswering.from_pretrained(_qa_checkpoint)

# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"device: {device}")

import numpy as np

question = "What is the capital of France?"
context = "Paris is the capital of France."

# Encode the (question, context) pair as a single sequence of tensors.
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors='pt')
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)
# Segment ids mark which tokens belong to the question and which to the context;
# without them the model sees the whole pair as one segment.
token_type_ids = inputs["token_type_ids"].to(device)

# Pure inference: no gradients are needed.
with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

# Most likely start and end positions of the answer span (token indices).
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end + 1].tolist())
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
else:
    # An end position before the start position is not a valid span; keep
    # `answer` defined so the prints below cannot raise NameError.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))

device: cuda


In [None]:
# Create output paths
import os

import pandas as pd
import torch
from torch.utils.data import random_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# Directory that receives the training outputs; created if it does not exist yet.
OUTPUT_DIR: str = './results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Model checkpoint and the special tokens registered with the tokenizer
MODEL_NAME = 'EleutherAI/gpt-neo-125M'
BOS_TOKEN = '<|startoftext|>'
EOS_TOKEN = '<|endoftext|>'
PAD_TOKEN = '<|pad|>'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, bos_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN)

# Use the GPU when torch can see one; fall back to the CPU instead of failing in `.cuda()`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
# The tokenizer was given extra special tokens, so size the embedding matrix
# to the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

DATA_PATH = '/kaggle/input/diagnoise-me/diagnose_en_dataset.feather'
SEQ_LEN = 1024

# Read the feather file and keep only the values of its 'Patient' column.
data = pd.read_feather(DATA_PATH)['Patient'].values

# Work on the first 1% of the rows, each entry cut to at most SEQ_LEN elements
# (presumably characters of a text string; verify against the dataset).
SAMPLE_SIZE = int(data.shape[0] * 0.01)
_data = [entry[:SEQ_LEN] for entry in data[:SAMPLE_SIZE]]

# NOTE(review): PatientDiagnozeDataset is not defined in the visible cells; it
# has to be defined or imported before this cell runs.
dataset = PatientDiagnozeDataset(txt_list=_data, tokenizer=tokenizer, max_length=1024)

# 80/20 train/validation split. The explicit generator makes the split
# identical on every run instead of depending on the global RNG state.
TRAIN_SIZE = int(len(dataset) * 0.8)
train_dataset, val_dataset = random_split(
    dataset,
    [TRAIN_SIZE, len(dataset) - TRAIN_SIZE],
    generator=torch.Generator().manual_seed(42),
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    logging_steps=5000,
    logging_dir='./logs',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    # Checkpoints are saved and evaluated on the same (per-epoch) schedule.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; confirm against the installed version.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)


def collate_batch(batch):
    """Stack dataset items into the tensors the Trainer passes to the model.

    Each item is indexed as ``item[0]`` (input ids) and ``item[1]`` (attention
    mask). Labels are a copy of the input ids in which every position with
    attention mask 0 is set to -100, so those positions are ignored by the loss.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_batch,
)
_trainer.train()

In [None]:
# Encode the BOS token as the generation prompt and place it on whatever
# device the model currently lives on (works with or without a GPU).
generated = tokenizer(BOS_TOKEN, return_tensors="pt").input_ids.to(model.device)

In [None]:
# Sample 20 continuations of the prompt. generate() takes the special tokens
# as integer ids (`*_token_id`), so they are read from the tokenizer rather
# than passed as strings.
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.9,
    max_length=300,
    num_return_sequences=20,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))