In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # GPT-2 model is small, you can switch to a larger version like 'gpt2-medium' or 'gpt2-large'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input text
input_text = "I am large language model "

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt")

In [3]:
# Generate text with sampling enabled
output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,  # You can adjust this to control the length of the output
    num_return_sequences=1,  # Number of output sequences you want
    no_repeat_ngram_size=2,  # This prevents repeating n-grams
    top_p=0.92,  # Top-p sampling for diversity
    temperature=0.85,  # Adjust the randomness of the output
    do_sample=True,  # Enable sampling
    pad_token_id=50256
)

# Decode output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text: ", generated_text)


Generated Text:  I am large language model  (in particular, I use the F# standard) and as such I am very happy that it is available. It is not yet perfect, but it will be very useful to those of you who want to learn


In [4]:
!pip install accelerate -U



In [5]:
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = inputs["input_ids"].copy()  # Set labels to input_ids
    return inputs

# Use a smaller subset of the dataset (first 100 samples)
small_dataset = dataset["train"].select(range(50))

# Tokenize the smaller dataset
tokenized_datasets = small_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    overwrite_output_dir=True,       # Overwrite existing files
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=4,   # Batch size per device during training
    save_steps=10_000,               # Save checkpoint every 10,000 steps
    logging_steps=500,               # Log every 500 steps
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir="./logs",            # Directory for storing logs
    report_to="none",                # Disable reporting to third-party services like TensorBoard
    remove_unused_columns=False,     # Prevent error for unmatched columns
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # Pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=tokenized_datasets,    # Preprocessed smaller training dataset
)

# Fine-tune the model
trainer.train()

Map: 100%|██████████| 50/50 [00:00<00:00, 251.35 examples/s]
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=39, training_loss=6.769061748798077, metrics={'train_runtime': 633.5036, 'train_samples_per_second': 0.237, 'train_steps_per_second': 0.062, 'total_flos': 39193804800000.0, 'train_loss': 6.769061748798077, 'epoch': 3.0})

In [6]:
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

('fine_tuned_model\\tokenizer_config.json',
 'fine_tuned_model\\special_tokens_map.json',
 'fine_tuned_model\\vocab.json',
 'fine_tuned_model\\merges.txt',
 'fine_tuned_model\\added_tokens.json')

In [7]:
model = GPT2LMHeadModel.from_pretrained("fine_tuned_model")
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_model")

In [9]:
def generate_response(input_text):
    inputs = tokenizer(input_text, return_tensors="pt")
    output = model.generate(
        inputs["input_ids"], 
        attention_mask=inputs["attention_mask"],  # Pass attention_mask
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,  # Enable sampling
        pad_token_id=50256 
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

while True:
    user_input = input("You: ")
    if user_input.lower() == "quit":
        break
    response = generate_response(user_input)
    print("Bot: ", response)

Bot:  It met with positive sales in Japan,  and Japan was a huge win for the company's new line of premium ebooks.  
As of January 2013, Amazon Canada was the largest e-reader market in the world, and the
Bot:  The game began development in 2002,  (before it even started on Venn diagram). Initially, the game was supposed to be about 1.5/5.
In the spring of 2002 there was also quite a lot of news about the
