**Author:** Dr. Shahriar Hossain <br>
**Topic of the code:** Fine-tuning GPT2 and retrieving embedding vectors from it <br>
**Video explaining this code:**  <br>
Part 1: https://youtu.be/2bqjzUX9ssE <br>
Part 2: https://youtu.be/ZogRxshfWOQ <br>
**My YT Channel:** https://www.youtube.com/@C4A <br>
**Web:** https://courses.computing4all.com/

In [1]:
import torch

# Check if a GPU is available and if not, use a CPU
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [2]:
import os

## Keep your training documents in a folder named 'data'
data_dir = "data"
output_file = "all_data.txt"

def is_hidden(filepath):
    return os.path.basename(filepath).startswith('.')

with open(output_file, "w") as outfile:
    for filename in os.listdir(data_dir):
        filepath = os.path.join(data_dir, filename)
        if not is_hidden(filepath):
            with open(filepath) as infile:
                for line in infile:
                    # only write the line if it's not empty
                    # (and, not just whitespace)
                    if line.strip():
                        outfile.write(line)

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, \
    TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import Dataset

## GPT-2 Small ('gpt2'): 124 million parameters.
## GPT-2 Medium ('gpt2-medium'): 345 million parameters.
## GPT-2 Large ('gpt2-large'): 774 million parameters.
## GPT-2 XL ('gpt2-xl'): 1.5 billion parameters.


# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Your custom dataset
class CustomDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        with open(file_path, "r") as f:
            self.text = f.read().splitlines()
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        tokenized_inputs = self.tokenizer(
            self.text[idx],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt")
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"]
        return tokenized_inputs

# Load data
data = CustomDataset(tokenizer, "all_data.txt", 128)

# Create a data collator that will dynamically pad the sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments and Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=3, # Increse for more training from the fine-tuning data
    learning_rate=1e-4,  # Decrease the learning rate for smaller fine-tuning data
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,
    evaluation_strategy="no",
    remove_unused_columns=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=None,  # You can specify an evaluation dataset here
    data_collator=data_collator,  # Add the data collator here
)

trainer.train()


Using pad_token, but it is not set yet.


Step,Training Loss
10,2.1337
20,0.1843
30,0.1402
40,0.1125
50,0.0795


TrainOutput(global_step=54, training_loss=0.4986020045148002, metrics={'train_runtime': 4.2672, 'train_samples_per_second': 25.309, 'train_steps_per_second': 12.655, 'total_flos': 7054884864000.0, 'train_loss': 0.4986020045148002, 'epoch': 3.0})

In [13]:
# Ensure your model is in evaluation mode
# to disable dropout layers
model.eval()

# Create a prompt text for the model to complete
prompt_text = "Where is the cat"

# Tokenize the prompt text and convert to tensor
input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids
attention_mask = tokenizer(
    prompt_text, return_tensors="pt").attention_mask

# Move input_ids and attention_mask tensor to GPU
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Generate text from the model
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    do_sample=True  # Enable sampling to consider temperature setting
)

# Decode the generated text back to string
generated_text = tokenizer.decode(output[0],
                                  skip_special_tokens=True)

print(generated_text)


Where is the cat and the dog play together.


In [5]:
## Retireve embeddings
input_text= "cat on the mat"
input_tokens = tokenizer(input_text, return_tensors='pt')

# Ensure tokens are on the same device as the model
input_tokens = {k: v.to(device) for k, v in input_tokens.items()}

# Forward pass, get hidden states
with torch.no_grad():
    outputs = model(**input_tokens, output_hidden_states=True)

# Only take the hidden states (ignore other outputs)
hidden_states = outputs.hidden_states

## If you want the embeddings from the last layer of the model:
last_layer_embeddings = hidden_states[-1]

## the last_layer_embeddings tensor obtained from the
# GPT-2 model's forward method is 3D

# Mean pool the last_layer_embeddings (across the sequence length dimension)
mean_pooled = last_layer_embeddings.mean(dim=1)

mean_pooled_embedding =  mean_pooled.squeeze(dim=0)

In [6]:
print(mean_pooled_embedding)

tensor([ 1.3135e-01,  2.1256e-02, -1.0330e+00, -2.7195e-02, -2.2209e-01,
        -1.6953e-01,  1.4524e-01,  1.7413e-01, -2.0055e-01, -1.3647e-01,
         8.0561e-01, -1.6939e-01,  1.7410e-01, -1.0186e-01, -3.7826e-01,
         4.0437e-01,  6.4153e-01,  1.8059e-01,  1.6409e-01,  3.8082e-01,
         1.6841e-01,  1.2776e-01, -4.6799e-01,  5.0627e-01,  1.2089e-01,
         3.4201e-01,  4.3682e-01, -7.3357e-02, -4.7435e-01,  3.7694e-01,
         1.2478e-01, -2.8324e-01, -1.7686e-01, -6.3647e-03, -4.0675e-01,
         7.4542e-01,  5.4965e+01, -8.5667e-02, -4.0177e-01,  4.9649e-01,
        -3.1704e-01,  1.4997e-01, -1.0412e-02, -3.2140e-01,  1.4524e-01,
        -6.7912e-02,  2.8797e-01,  5.7490e-01, -7.2651e-01,  1.2695e-01,
        -1.1686e-01, -1.6036e-01,  2.2108e-01,  1.0030e-01,  7.0536e-01,
         1.5738e+00,  1.1705e-01, -3.0475e-01,  2.9797e-02,  2.5755e-01,
         1.7395e-01, -2.6267e-02,  3.3774e-01, -4.3090e-02, -1.0916e+00,
        -9.5211e-02, -4.2127e-02,  1.3542e-01, -8.9

In [7]:
print(len(mean_pooled_embedding))

768
