# Small GPT

In [None]:
!pwd

/content/gdrive/MyDrive/BERT_scale/gpt


### 0. Prepare

In [None]:
# Modify this
MODEL_NAME = 'small_gpt/BEST_110710' # model save path will be '/results/{MODEL_NAME}'
ENABLE_TRAIN = True
ENABLE_TEST = True
TRAIN_SIZE = 60000
VAL_SIZE = 6000
TEST_SIZE = 1000

In [None]:
import os
try:

    from google.colab import drive
    drive.mount('/content/gdrive')

    DRIVE_PATH = '/content/gdrive/MyDrive/BERT_scale/gpt'
    DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
    if not os.path.exists(DRIVE_PYTHON_PATH):
      %mkdir $DRIVE_PATH

    ## the space in `My Drive` causes some issues,
    ## make a symlink to avoid this
    SYM_PATH = '/content/BERT_scale/gpt'
    print(os.path.exists(SYM_PATH))
    if not os.path.exists(SYM_PATH):
      !ln -s $DRIVE_PATH $SYM_PATH

    running_in_colab = True

    # We already mounted in our google drive.
    # Enter the foler where you put files in:
    %cd '/content/gdrive/MyDrive/BERT_scale/gpt'

    # Current working directory:
    !pwd

    # What files are there:
    !ls


except ModuleNotFoundError:
    running_in_colab = False
    print(
        "I guess you are running locally. If you get this message in Colab, check the files."
    )

Mounted at /content/gdrive
False
ln: failed to create symbolic link '/content/BERT_scale/gpt': No such file or directory
/content/gdrive/MyDrive/BERT_scale/gpt
/content/gdrive/MyDrive/BERT_scale/gpt
GPT.ipynb  logs  results  wandb


In [None]:
!pip install transformers
!pip install datasets peft trl
!pip install evaluate
!pip install rouge_score

In [None]:
import torch
from transformers import (
    OpenAIGPTTokenizer,
    OpenAIGPTLMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType


### 1. Load Model

In [None]:
# Load GPT model
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

In [None]:
# Calculate the number of parameters
total_params = sum(p.numel() for p in model.parameters())

# Print the number of parameters
print(f"Total number of parameters: {total_params}")

Total number of parameters: 116534784


### 2. Load Fine-tuning Dataset

In [None]:
# Load the cnn_dailymail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')
if ENABLE_TRAIN:
  if TRAIN_SIZE > 0:
    dataset['train'] = dataset['train'].shuffle(seed=0).select(range(TRAIN_SIZE))
  if VAL_SIZE > 0:
    dataset['validation'] = dataset['validation'].shuffle(seed=0).select(range(VAL_SIZE))
if ENABLE_TEST and TEST_SIZE > 0:
  dataset['test'] = dataset['test'].shuffle(seed=0).select(range(TEST_SIZE))

In [None]:
def print_data(target_data):
  for k, v in target_data[0].items():
    print(f"'{k}': ", end = "")
    if type(v) == list:
      print(len(v))
    else:
      print(v)
print_data(train_data)

In [None]:
# Load the tokenizer and the model
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# tokenizer.pad_token = tokenizer.eos_token # doesn't work
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['article'], return_special_tokens_mask=True)

# Tokenize the dataset
tokenized_datasets_train = dataset['train'].map(
    tokenize_function,
    batched=True,
    remove_columns=['article', 'highlights', 'id']
)
tokenized_datasets_val = dataset['validation'].map(
    tokenize_function,
    batched=True,
    remove_columns=['article', 'highlights', 'id']
)

In [None]:
# Set block size for grouping texts
block_size = 128

# Group texts into chunks of block_size
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples['input_ids'])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [concatenated_examples[k][i:i + block_size]
            for i in range(0, total_length, block_size)]
        for k in concatenated_examples.keys()
    }
    result['labels'] = result['input_ids'].copy()
    return result

# Apply the grouping function to the tokenized dataset
lm_datasets_train = tokenized_datasets_train.map(group_texts, batched=True)
lm_datasets_val = tokenized_datasets_val.map(group_texts, batched=True)


ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (956 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

### 3. Fine-tune the Model

In [None]:
# Configure PEFT with LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['c_attn']
)

In [None]:
if ENABLE_TRAIN:
    # Load the tokenizer
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    # Initialize the encoder-decoder model
    # model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    #     'bert-base-uncased', 'bert-base-uncased'
    # )
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

    if MODEL_NAME:
        model = PeftModel.from_pretrained(model, f'./results/{MODEL_NAME}')
        # Verify that only the LoRA parameters are trainable
        for name, param in model.named_parameters():
            if "lora" in name:
                param.requires_grad = True
            else:
                param.requires_grad = False
    else:
        model = get_peft_model(model, peft_config)

# Optionally, print trainable parameters
model.print_trainable_parameters()

In [None]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=500,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets_train,
    eval_dataset=lm_datasets_val,
    data_collator=data_collator,
)

# Start training
checkpoint_path = './results/checkpoint-96473'
trainer.train(resume_from_checkpoint=checkpoint_path)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=96473, training_loss=0.0, metrics={'train_runtime': 0.1346, 'train_samples_per_second': 2866438.037, 'train_steps_per_second': 716613.223, 'total_flos': 2.529444150116352e+16, 'train_loss': 0.0, 'epoch': 1.0})

### 4. Test: Summarization

In [None]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from peft import PeftModel
import torch
# Load the base model
base_model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

# Load the fine-tuned PEFT model
fintuned_model = PeftModel.from_pretrained(base_model, './results/checkpoint-96473')

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
fintuned_model.to(device)
fintuned_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OpenAIGPTLMHeadModel(
      (transformer): OpenAIGPTModel(
        (tokens_embed): Embedding(40478, 768)
        (positions_embed): Embedding(512, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x Block(
            (attn): Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
         

In [None]:
def generate_summary(article_text, eval_model=fintuned_model):
    # Define maximum lengths
    max_model_length = 512
    max_new_tokens = 150
    max_input_length = max_model_length - max_new_tokens  # 362 tokens

    # Truncate the article text to fit within the max_input_length
    article_tokens = tokenizer.tokenize(article_text)
    if len(article_tokens) > max_input_length - 50:  # Reserve space for prompt text
        article_tokens = article_tokens[:max_input_length - 50]
        article_text = tokenizer.convert_tokens_to_string(article_tokens)

    # Create a summarization prompt
    prompt = f"Summarize the following article:\n\n{article_text}\n\nSummary:"

    # Tokenize the prompt with truncation
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
        padding=True
    ).to(device)

    # Generate the summary using max_new_tokens
    summary_ids = eval_model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,  # Generate up to 150 new tokens
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [None]:
# Load the tokenizer and the model
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# tokenizer.pad_token = tokenizer.eos_token # doesn't work
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


1

In [None]:
from datasets import load_dataset

# Load the test dataset
test_dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')

# Select a subset for testing (e.g., first 20 examples)
test_dataset = test_dataset.shuffle(seed=0).select(range(1000))

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Lists to store the generated summaries and reference summaries
generated_summaries = []
reference_summaries = []

# Iterate over the test dataset
for example in test_dataset:
    article = example['article']
    reference = example['highlights']

    # Generate summary
    generated_summary = generate_summary(article, eval_model=fintuned_model)

    # Append to lists
    generated_summaries.append(generated_summary)
    reference_summaries.append(reference)

In [None]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Compute ROUGE scores
results = rouge.compute(
    predictions=generated_summaries,
    references=reference_summaries,
    use_stemmer=True
)

# Print the results
print("ROUGE scores:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

ROUGE scores:
rouge1: 0.1936
rouge2: 0.0983
rougeL: 0.1321
rougeLsum: 0.1698
