# Fine-tuning GPT-2 on a Longevity Publication abstracts dataset in PyTorch

In [6]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
import plotly.express as px
import plotly.io as pio
import pandas as pd
import math
import os
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pio.renderers.default = 'notebook_connected'



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [4]:
model = model.to(device)

Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-02-04 23:23:36.468627: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-04 23:23:36.469084: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Function to first select topN tokens from the probability list and then based on the selected N word distribution
# get random token ID

In [5]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv



In [46]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv
import transformers

class PubMedLoader(Dataset):
    def __init__(self, dataframe):
        super().__init__()
        self.dataframe = dataframe


        self.abstract_list = []
        self.end_of_text_token = "<|endoftext|>"
        dataset = dataframe[['abstract']]
        for row in dataset:
            abstract_str = f"ABSTRACT:{row[1]}{self.end_of_text_token}"
            self.abstract_list.append(abstract_str)
        
    def __len__(self):
        return len(self.abstract_list)

    def __getitem__(self, item):
        return self.abstract_list[item]

In [38]:
from torch.utils.data import Dataset

class PandasDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        return self.dataframe.iloc[index]

In [47]:
import pandas as pd

df= pd.read_excel('/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/final_database_of_papers.xlsx',index_col=0)
df = df[['abstract']]
dataset = PubMedLoader(df)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# Hyperparameters
I tested many(more than 5) hyperparameter sets till I found one that works the best. I mostly tuned *BATCH_SIZE* (in this case, it's the number of forward-backward passes between each optimization step), *EOPOCHS, and LEARNING_RATE*.

For a parameter value starting point for fine-tuning, I inspired from this and this huggingface fine-tuning code.

In [48]:
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400
from transformers import AdamW
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [49]:
import transformers as transformers
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS,num_training_steps=5, last_epoch = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0


In [50]:
tmp_jokes_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)


In [52]:
for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,abstract in enumerate(dataloader):
        
        #################### "Fit as many joke sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        joke_tens = torch.tensor(tokenizer.encode(abstract[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if joke_tens.size()[1] > MAX_SEQ_LEN:
            continue
        
        #The first joke sequence in the sequence
        if not torch.is_tensor(tmp_jokes_tens):
            tmp_jokes_tens = joke_tens
            continue
        else:
            #The next joke does not fit in so we process the sequence and leave the last joke 
            #as the start for next sequence 
            if tmp_jokes_tens.size()[1] + joke_tens.size()[1] > MAX_SEQ_LEN:
                work_jokes_tens = tmp_jokes_tens
                tmp_jokes_tens = joke_tens
            else:
                #Add the joke to sequence, continue and try to add more
                tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################
            
        outputs = model(work_jokes_tens, labels=work_jokes_tens)
        loss, logits = outputs[:2]                        
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
    
    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_joker_{epoch}.pt"))



In [55]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [65]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [61]:
import logging
logging.getLogger().setLevel(logging.CRITICAL)

import torch
import numpy as np

from transformers import GPT2Tokenizer, GPT2LMHeadModel

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [63]:
# Function to first select topN tokens from the probability list and then based on the selected N word distribution
# get random token ID
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [8]:
def generate_some_text(input_str, text_len = 250):

    cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to(device)

    model.eval()
    with torch.no_grad():

        for i in range(text_len):
            outputs = model(cur_ids, labels=cur_ids)
            loss, logits = outputs[:2]
            softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(only one) batch and the last predicted embedding
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=10) #Randomly(from the given probability distribution) choose the next word from the top n words
            cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word

        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)
        print(output_text)

In [72]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

In [67]:
generate_some_text("The Matrix is everywhere. It is all around us. Even now, in this very room. You can see it when you look out your window or when you turn on your television. You can feel it when you go to work… when you go to church… when you pay your taxes. It is the world that has been pulled over your eyes to blind you from the truth…*")

The Matrix is everywhere. It is all around us. Even now, in this very room. You can see it when you look out your window or when you turn on your television. You can feel it when you go to work… when you go to church… when you pay your taxes. It is the world that has been pulled over your eyes to blind you from the truth…*cringe* It's so real. But what is real? What can you see that will give you insight into it? The Matrix is like a computerized version of the Matrix that is used by the elite to manipulate and deceive the masses into accepting and accepting their false religion. You can read about this in detail in my recent book, The Matrix Conspiracy.

What I have been trying to convey in this article is that the Matrix has a very real, very violent, very powerful, but also very mysterious purpose.

The purpose of the Matrix, as I have outlined in this article, is to create a totalitarian state that can be used to enslave, destroy, subjugate and destroy any individual, group or nati

In [73]:
generate_some_text(" The Godfather: \"I'm going to make him an offer he can't refuse.\" ")

 The Godfather: "I'm going to make him an offer he can't refuse."  He said "I don't care. I want you to go home and be a good husband to your wife."  "Oh?"  he asked.  "No, no, no,  you have to do it."  And he said  "I don't know what you want me to do, but I'll do it,"  so that's the last time I'm gonna say that to him and I can't believe it.  So, I'm not going to talk about that one.  I don't have any of those.  You don't have to talk about that one, either.  The thing that I think you've all been wondering about is why is that?  It's because, in the end, you're going to do it, so that was the thing that really kept coming up for me.  But I've had to put my own spin and spin on it, because I think I'm the best person to explain to you why it is that you're going to do it and why it's so great.  It's all about the relationship.  It's all about that.  It's not about the


# alternative: use the huggingface library

In [5]:
import collections
import logging
import math
import os
import warnings
from timeit import default_timer as timer

import wandb
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

warnings.filterwarnings("ignore")


start = "<|startoftext|>"
sep = "<|sep|>"


def dict2obj(d):
    """Convert a dictionary to a class"""
    if isinstance(d, list):
        d = [dict2obj(x) for x in d]
    if not isinstance(d, dict):
        return d

    class Class:
        pass

    obj = Class()
    for k in d:
        obj.__dict__[k] = dict2obj(d[k])
    return obj


def get_dataset(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size
        )
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )


# Logging
logger = logging.getLogger(__name__)
# Model classes
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# These arguments could have been handled by CLI, but I put them in this
# way to make the code simpler.

# Model arguments
model_args = collections.defaultdict(
    config_name="gpt2",
    model_name_or_path="gpt2-large",
    model_type="gpt2",
    tokenizer_name="gpt2",
    cache_dir=None,
)
# Data arguments
data_args = collections.defaultdict(
    train_data_file="/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/train.txt",
    eval_data_file="/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/valid.txt",
    line_by_line=False,
    mlm=False,
    mlm_probability=0.15,
    block_size=512,
    overwrite_cache=False,
)
# Training arguments
training_args = TrainingArguments(
    output_dir="/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/model_new_large/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=False,    
    per_gpu_train_batch_size=1,
    per_gpu_eval_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    weight_decay=0.0,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    num_train_epochs=5.0,
    max_steps=-1,
    warmup_steps=0,
    logging_dir=None,
    logging_first_step=False,
    logging_steps=1000,
    save_steps=10000,
    save_total_limit=100000,
    no_cuda=False,
    seed=42,
    fp16=False,
    fp16_opt_level="O1",
    local_rank=-1,
)
# Convert dict to objects
model_args = dict2obj(model_args)
data_args = dict2obj(data_args)

# Logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Seed
set_seed(training_args.seed)

# Load tokenizer and model
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path, cache_dir=model_args.cache_dir
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, cache_dir=model_args.cache_dir
)
model = AutoModelWithLMHead.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
)

# Add special tokens
tokenizer.add_special_tokens({"sep_token": sep})
tokenizer.add_special_tokens({"bos_token": start})
model.resize_token_embeddings(len(tokenizer))

# Load dataset
train_dataset = (
    get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
)

eval_dataset = (
    get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
    if training_args.do_eval
    else None
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset)

# Define model path
model_path = (
    model_args.model_name_or_path
    if model_args.model_name_or_path is not None
    and os.path.isdir(model_args.model_name_or_path)
    else None
)

# Train the model
start = timer()
train_results = trainer.train(model_path=model_path)
end = timer()
trainer.save_model()
if trainer.is_world_master():
    tokenizer.save_pretrained(training_args.output_dir)

# Calculate training time
logger.info(f"Training took {(end - start) / 3600} hours.")


# Evaluation on validation set
logger.info("*** Valid Evaluate ***")
valid_eval_output = trainer.evaluate()
valid_perplexity = math.exp(valid_eval_output["eval_loss"])
valid_result = {"valid_perplexity": valid_perplexity}
output_eval_file = os.path.join(training_args.output_dir, "valid_eval_results_lm.txt")

with open(output_eval_file, "w") as writer:
    logger.info("***** Valid Eval results *****")
    for key in sorted(valid_result.keys()):
        logger.info("  %s = %s", key, str(valid_result[key]))
        writer.write("%s = %s\n" % (key, str(valid_result[key])))


# Evaluation on test set
training_args.do_eval = True
data_args.eval_data_file = "/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/test.txt"
test_dataset = (
    get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
    if training_args.do_eval
    else None
)
trainer.eval_dataset = test_dataset

logger.info("*** Test Evaluate ***")
test_eval_output = trainer.evaluate()
test_perplexity = math.exp(test_eval_output["eval_loss"])
test_result = {"test_perplexity": test_perplexity}
output_eval_file = os.path.join(training_args.output_dir, "test_eval_results_lm.txt")

with open(output_eval_file, "w") as writer:
    logger.info("***** Test Eval results *****")
    for key in sorted(test_result.keys()):
        logger.info("  %s = %s", key, str(test_result[key]))
        writer.write("%s = %s\n" % (key, str(test_result[key])))


# Evaluation on training set
data_args.eval_data_file = "/Users/paritoshmacmini/Desktop/personal_projects/antiaging_app/fine_tuneGPT_forsummary_qna/train.txt"
test_dataset = (
    get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
    if training_args.do_eval
    else None
)
trainer.eval_dataset = test_dataset

logger.info("*** Train Evaluate ***")
train_eval_output = trainer.evaluate()
train_perplexity = math.exp(train_eval_output["eval_loss"])
train_result = {"train_perplexity": train_perplexity}
output_eval_file = os.path.join(training_args.output_dir, "train_eval_results_lm.txt")

with open(output_eval_file, "w") as writer:
    logger.info("***** Train Eval results *****")
    for key in sorted(train_result.keys()):
        logger.info("  %s = %s", key, str(train_result[key]))
        writer.write("%s = %s\n" % (key, str(train_result[key])))


print(f"Train loss: {train_eval_output['eval_loss']}")
print(f"Valid loss: {valid_eval_output['eval_loss']}")
print(f"Test loss: {test_eval_output['eval_loss']}")
print(f"Train PPL: {train_perplexity}")
print(f"Valid PPL: {valid_perplexity}")
print(f"Test PPL: {test_perplexity}")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
02/07/2023 11:55:02 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transform

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1105, in init
    wi.setup(kwargs)
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 276, in setup
    wandb_login._login(
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 298, in _login
    wlogin.prompt_api_key()
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 221, in prompt_api_key
    key, status = self._prompt_api_key()
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 198, in _prompt_api_key
    api = Api(self._settings)
  File "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 161, in __init__
    self._settings = Settings(
  File "/opt/homebrew/Caskroom/mi

Exception: problem