In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import random_split
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelWithLMHead, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import pipeline
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import datetime
import sys 
import seaborn as sns
import os



## where checkpoints are written, and where the raw context files live
save_path = r'C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr'
contexts_path = r'C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\contexts'

## prefer the GPU whenever CUDA is usable, otherwise run on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device in use: {} \n'.format(device))

## Hugging Face identifier of the pretrained model being fine-tuned
MODEL_NAME = 'gpt2'

## fraction of the full dataset that goes into the training split
TRAIN_PERCENTAGE = 0.9

BATCH_SIZE = 1
N_EPOCHS = 100
LOGGING_STEPS = 500
WARMUP_STEPS = 3000  ## roughly 3 epochs of warmup

Device in use: cuda 



In [2]:
def join_contexts(path=None):
    ''' Read every context file in a directory and return the texts
    as one python list.

    Parameters
    ----------
    path : str, optional
        Directory holding the context files. When omitted, the
        notebook-level ``contexts_path`` is used.

    Returns
    -------
    list of str
        The full text of each regular file, ordered by file name.
    '''
    directory = contexts_path if path is None else path

    contexts = []
    # os.listdir gives no ordering guarantee; sorting pins the order, which
    # matters because the train/test split downstream is taken sequentially.
    for file_name in sorted(os.listdir(directory)):
        current_path = os.path.join(directory, file_name)
        # skip sub-directories (e.g. notebook checkpoint folders); open() would fail on them
        if not os.path.isfile(current_path):
            continue
        # the context manager closes the handle even if read() raises
        with open(current_path, 'r', encoding='utf8') as handle:
            contexts.append(handle.read())

    return contexts

## load every context document into memory (one string per file)
contexts = join_contexts()

In [2]:
## load the pretrained tokenizer and the GPT-2 model with its language-modeling head,
## then move the model onto the selected device
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
transformer = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
transformer = transformer.to(device)

## context length used later when tokenizing
max_length = tokenizer.model_max_length

## tokenization below pads to max_length, so register a padding token when none is defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

## keep the embedding matrix in step with the (possibly enlarged) vocabulary
transformer.resize_token_embeddings(len(tokenizer))

Using pad_token, but it is not set yet.


Embedding(50258, 768)

In [4]:
## collator that batches tokenized examples for the Trainer; mlm=False turns masked
## language modeling off (presumably giving the plain causal-LM objective — confirm in the HF docs)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [6]:
def tokenize(batch):
    ''' Tokenize the raw texts stored under batch['input_ids'],
    padding / truncating every example to max_length tokens.
    '''
    texts = batch['input_ids']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    return encoded

In [7]:
def return_HF_datasets(contexts_array):
    ''' Build the tokenized train and test Hugging Face datasets.

    contexts_array is a python list containing all the contexts. The
    rows are tokenized, then split in their given order: the first
    TRAIN_PERCENTAGE of them form the train set, the rest the test set.
    '''
    frame = pd.DataFrame(contexts_array)
    frame.columns = ['input_ids']   # tokenize() reads the raw text from this column
    raw_dataset = Dataset.from_pandas(frame)
    tokenized = raw_dataset.map(tokenize, batched=True, batch_size=BATCH_SIZE)

    ## sequential (unshuffled) split at the TRAIN_PERCENTAGE boundary
    n_rows = len(raw_dataset)
    n_train = int(TRAIN_PERCENTAGE*n_rows)
    train_split = tokenized.select(range(n_train))
    test_split = tokenized.select(range(n_train, n_rows))

    return train_split, test_split

train_dataset, test_dataset = return_HF_datasets(contexts)

Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

In [8]:
## hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    output_dir=os.path.join(save_path, 'gpt2_finetune_ckpts'),  # built with os.path.join instead of a hard-coded separator
    overwrite_output_dir=True,
    num_train_epochs=N_EPOCHS,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    disable_tqdm=False,
    # log, evaluate and checkpoint on the same step interval
    logging_steps=LOGGING_STEPS,
    eval_steps=LOGGING_STEPS,
    save_steps=LOGGING_STEPS,
    warmup_steps=WARMUP_STEPS, # no. of warmup steps till it reaches set value for learning rate (default strategy: linear)
    # mixed precision only when a CUDA device exists; the notebook falls back
    # to CPU in its first cell, where requesting fp16 would be rejected
    fp16=torch.cuda.is_available(),
    report_to='tensorboard')


## Trainer wiring: model, collator, both splits, and early stopping with a patience of 5 evaluations
trainer = Trainer(
    model=transformer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)])

In [9]:
## run the fine-tuning loop with the arguments and early-stopping callback given to `trainer`
trainer.train()

***** Running training *****
  Num examples = 936
  Num Epochs = 100
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 93600
  Number of trainable parameters = 124440576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,23.7002,5.026304
1000,3.5752,3.431267
1500,3.2332,3.10559
2000,3.035,2.922549
2500,2.8473,2.822305
3000,2.7189,2.753305
3500,2.6072,2.687626
4000,2.5283,2.656788
4500,2.421,2.592282
5000,2.3435,2.574138


***** Running Evaluation *****
  Num examples = 104
  Batch size = 1
Saving model checkpoint to C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-500
Configuration saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-500\config.json
Model weights saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 104
  Batch size = 1
Saving model checkpoint to C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-1000
Configuration saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-1000\config.json
Model weights saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-1000

  Num examples = 104
  Batch size = 1
Saving model checkpoint to C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-8500
Configuration saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-8500\config.json
Model weights saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-8500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 104
  Batch size = 1
Saving model checkpoint to C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-9000
Configuration saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-9000\config.json
Model weights saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-9000\pytorch_model.bin
***** Run

Loading best model from C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\checkpoint-13500 (score: 2.3664803504943848).


TrainOutput(global_step=16000, training_loss=2.7510974502563474, metrics={'train_runtime': 3706.4032, 'train_samples_per_second': 25.254, 'train_steps_per_second': 25.254, 'total_flos': 8361345024000000.0, 'train_loss': 2.7510974502563474, 'epoch': 17.09})

In [10]:
## write the trainer's current model into the training output directory
## NOTE(review): the log below lists only the config and the weights; presumably the
## tokenizer is not saved by this call — confirm before reloading in a fresh session
trainer.save_model()

Saving model checkpoint to C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts
Configuration saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\config.json
Model weights saved in C:\Users\M\OneDrive - Carleton University\Documents\my_stuff\Projects\chatDr\gpt2_finetune_ckpts\pytorch_model.bin


In [17]:
## text-generation pipeline built from the fine-tuned weights written by trainer.save_model().
## Bound to `chatDr`, the name the generation cell calls; the model path is the trainer's
## output directory rather than a path relative to the current working directory.
chatDr = pipeline('text-generation',
                  model=os.path.join(save_path, 'gpt2_finetune_ckpts'),
                  tokenizer=tokenizer,
                  # GPU ordinal when CUDA is available, otherwise -1 (the pipeline's CPU setting)
                  device=torch.cuda.current_device() if torch.cuda.is_available() else -1)

In [35]:
## seed text for generation; max_length=250 is forwarded to the pipeline call
## (presumably a cap on the total sequence length, prompt included — confirm in the pipeline docs)
prompt = "Inguinal Hernia Repair"
outputs = chatDr(prompt, max_length = 250)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [36]:
## show the generated text of the first returned sequence
print (outputs[0]['generated_text'])

Inguinal Hernia Repair, PREOPERATIVE DIAGNOSES,1. Left inguinal hernia.,2. Right inguinal hernia.,POSTOPERATIVE DIAGNOSES,1. Left inguinal hernia.,2. Right inguinal hernia.,OPERATION,2. Injection of Tegaderm.,PROCEDURE,1. Left inguinal hernia.,2. Irrigation and debridement of right inguinal hernia.,ANESTHESIA:,Local MAC.,ESTIMATED BLOOD LOSS:, Less than 50 mL.,GROSS OPERATIVE FINDINGS,Following the administration of sedation and local MAC anesthesia, a longitudinal incision was made in the left inguinal pillar below the level of the hernia sac. The hernia sac was opened up and the incision was dissected out laterally. The luminescent probe was easily able to be inserted into the hernia sac and the hernias were noted to be inflamed. The intraocular lens was inspected. The hernia sac was well perfused. The hernia sac was infiltrated with 0.25% Marcaine without epinephrine. We
