In [8]:
import os
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

data_limit_ratio = 0.01
train_ratio = 0.9

# Set the GPU device (e.g., use the first GPU)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Romanian dataset with trust_remote_code=True
dataset = load_dataset("oscar", "unshuffled_deduplicated_ro", split="train[:1%]", trust_remote_code=True)

# Load the pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Limit the dataset
dataset = dataset.select(range(int(data_limit_ratio * len(dataset))))

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split the dataset into train and validation sets
train_size = int(train_ratio * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Set the format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Add labels to the datasets
def add_labels(examples):
    examples["labels"] = examples["input_ids"].clone()
    return examples

train_dataset = train_dataset.map(add_labels, batched=True)
eval_dataset = eval_dataset.map(add_labels, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-romanian",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-romanian")
tokenizer.save_pretrained("./gpt2-romanian")

Epoch,Training Loss,Validation Loss
1,No log,2.467347
2,No log,2.341413
3,No log,2.306812


('./gpt2-romanian\\tokenizer_config.json',
 './gpt2-romanian\\special_tokens_map.json',
 './gpt2-romanian\\vocab.json',
 './gpt2-romanian\\merges.txt',
 './gpt2-romanian\\added_tokens.json')

In [16]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 model and tokenizer from the web
pretrained_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
pretrained_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the fine-tuned model and tokenizer
model_path = "./gpt2-romanian"
finetuned_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
finetuned_model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the pad token to eos token
pretrained_tokenizer.pad_token = pretrained_tokenizer.eos_token
finetuned_tokenizer.pad_token = finetuned_tokenizer.eos_token

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model.to(device)
finetuned_model.to(device)

# Function to generate text with top-k sampling and repetition penalty
def generate_text(model, tokenizer, prompt, max_length=50, top_k=50, top_p=0.95, repetition_penalty=1.2):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def compare_generation(pretrained_model, finetuned_model, tokenizer, prompt, max_length=50):
    print("Prompt:", prompt)
    print("Pre-trained model:", generate_text(pretrained_model, tokenizer, prompt, max_length))
    print("Fine-tuned model:", generate_text(finetuned_model, tokenizer, prompt, max_length))
    print()

# Example prompt
prompt1 = "România este o țară frumoasă"
prompt2 = "It's a beautiful day"

# Generate text using the pre-trained and fine-tuned models
compare_generation(pretrained_model, finetuned_model, pretrained_tokenizer, prompt1)
compare_generation(pretrained_model, finetuned_model, pretrained_tokenizer, prompt2)

Prompt: România este o țară frumoasă




Pre-trained model: România este o țară frumoasă, quia esta ei sunt.
The first of these is the one that was called "the Lord's" (cf., Rom 1:16).
Fine-tuned model: România este o țară frumoasă, în cetate de la voritatea sunt aproximat.
Într-o mai nu pe care au

Prompt: It's a beautiful day
Pre-trained model: It's a beautiful day, and I'm so happy to be here. It was such an amazing experience."
- The New York Times
Fine-tuned model: It's a beautiful day, I'm so happy to be here. It was an amazing experience!

