<a href="https://colab.research.google.com/github/nimaicode/GPT2-Medium-Fine-Tuning/blob/main/gpt-tune-main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [None]:
import os

## Keep your training documents in the folder that `data_dir` points to
## (currently 'sample_data'; change it to match your own corpus folder).
data_dir = "sample_data"
# Path of the single merged text file that the cell below writes.
output_file = "all_data.txt"

def is_hidden(filepath):
    """Return True when the final component of *filepath* starts with a dot."""
    _, leaf = os.path.split(filepath)
    return leaf.startswith('.')

# Merge every visible regular file in data_dir into one corpus file,
# keeping only non-blank lines and writing exactly one line per row.
with open(output_file, "w") as outfile:
    # Sort so the corpus is built in the same order on every run;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(data_dir)):
        filepath = os.path.join(data_dir, filename)
        # Skip dot-files and anything that is not a regular file
        # (e.g. sub-directories), which open() cannot read as text.
        if is_hidden(filepath) or not os.path.isfile(filepath):
            continue
        with open(filepath) as infile:
            for line in infile:
                # only write the line if it's not empty
                # (and, not just whitespace)
                if not line.strip():
                    continue
                # A file whose last line lacks a trailing newline would
                # otherwise be glued to the first line of the next file.
                if not line.endswith("\n"):
                    line += "\n"
                outfile.write(line)

# New Section

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, \
    TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import Dataset

## GPT-2 Small ('gpt2'): 124 million parameters.
## GPT-2 Medium ('gpt2-medium'): 345 million parameters.
## GPT-2 Large ('gpt2-large'): 774 million parameters.
## GPT-2 XL ('gpt2-xl'): 1.5 billion parameters.

# Single source of truth for the checkpoint, so the tokenizer and the
# model weights can never be loaded from two different model sizes.
model_name = "gpt2-medium"

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set padding token: when the tokenizer defines none, reuse the
# end-of-sequence token so that fixed-length padding is possible.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Your custom dataset
class CustomDataset(Dataset):
    """Line-per-example language-modelling dataset read from a text file.

    Each line of ``file_path`` becomes one training example, tokenized on
    access and padded/truncated to ``block_size`` tokens.

    Args:
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords and
            must return a mapping of tensors containing ``"input_ids"``.
        file_path: Path of the text file holding one example per line.
        block_size: Fixed token length every example is padded or
            truncated to.
    """

    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        # Honour the caller's block size; it used to be accepted but
        # ignored in favour of a hard-coded length of 128.
        self.block_size = block_size
        with open(file_path, "r") as f:
            self.text = f.read().splitlines()

    def __len__(self):
        """Return the number of lines (examples) read from the file."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize line ``idx`` and return it with a ``"labels"`` entry.

        The labels are a copy of ``input_ids`` (padding positions
        included), which is the target layout used for causal-LM training.
        """
        encoded = self.tokenizer(
            self.text[idx],
            truncation=True,
            padding="max_length",
            max_length=self.block_size,
            return_tensors="pt")
        # return_tensors="pt" yields tensors with a leading batch axis of
        # size 1; drop it so a collator stacks examples into
        # (batch, seq_len) rather than (batch, 1, seq_len).
        for key in list(encoded.keys()):
            encoded[key] = encoded[key].squeeze(0)
        # Copy instead of aliasing so later in-place edits to one tensor
        # cannot silently change the other.
        encoded["labels"] = encoded["input_ids"].clone()
        return encoded

# Load data
# Build the training set from the merged corpus file; 128 is passed as the
# dataset's block_size argument.
data = CustomDataset(tokenizer, "all_data.txt", 128)

# Create a data collator that turns a list of examples into one batch.
# NOTE(review): CustomDataset already tokenizes with padding="max_length",
# so every example presumably arrives at the same length and the collator
# has no extra padding to add — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments and Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=400,  # Increase for more training from the fine-tuning data
    learning_rate=1e-4,  # Decrease the learning rate for smaller fine-tuning data
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,
    # No evaluation is run during training. The strategy keyword is left
    # out on purpose: its default is already "no", and the keyword was
    # renamed from `evaluation_strategy` to `eval_strategy` in newer
    # transformers releases, so omitting it works on both.
    remove_unused_columns=False,
    push_to_hub=False,
)

# NOTE(review): checkpoint-saving options are left at the library defaults;
# with this many epochs, confirm the disk budget for ./results.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=None,  # You can specify an evaluation dataset here
    data_collator=data_collator,  # Add the data collator here
)

trainer.train()


In [None]:
# Ensure your model is in evaluation mode
# to disable dropout layers
model.eval()

# Create a prompt text for the model to complete
prompt_text = "Who is Igor Kolokov?"

# Tokenize the prompt once; the result carries both the token ids and
# the matching attention mask.
encoded_prompt = tokenizer(prompt_text, return_tensors="pt")

# Move the inputs to whichever device the model currently lives on, so
# generation also works when the model was never moved to the GPU
# (e.g. the training cell was skipped).
input_ids = encoded_prompt.input_ids.to(model.device)
attention_mask = encoded_prompt.attention_mask.to(model.device)

# Generate text from the model
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    do_sample=True  # Enable sampling to consider temperature setting
)

# Decode the generated text back to string
generated_text = tokenizer.decode(output[0],
                                  skip_special_tokens=True)

print(generated_text)


In [None]:
# NOTE(review): `mean_pooled_embedding` is not assigned in any cell visible
# in this notebook export — presumably left over from another session;
# confirm it is defined before running this cell.
print(mean_pooled_embedding)

In [None]:
# NOTE(review): `mean_pooled_embedding` is not assigned in any cell visible
# in this notebook export — confirm it exists before running this cell.
print(len(mean_pooled_embedding))