In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch
import os

In [None]:
!pip install wget


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=8f79ce350a81feca68686c78279c3fae85bb0defd7582e2d24bbf691e6b65530
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
# Step 1: Load Pretrained Tokenizer
print("Loading tokenizer...")
tokenizer_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_checkpoint)

# Reuse the end-of-sequence token as the padding token so that batches can be
# padded by the data collator.
tokenizer.pad_token = tokenizer.eos_token


Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
def load_dataset(file_path, tokenizer, block_size=512):
    """Build a ``TextDataset`` over a plain-text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Value forwarded as ``TextDataset``'s ``block_size``
            (defaults to 512).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)

import wget

# Fetch the training corpus from Project Gutenberg only when no local copy is
# present, so re-running the cell does not download it again.
corpus_already_present = os.path.exists("sample_data.txt")
if not corpus_already_present:
    wget.download(
        "https://www.gutenberg.org/files/11/11-0.txt",
        "sample_data.txt",
    )


print("Loading dataset...")
train_dataset = load_dataset(file_path="sample_data.txt", tokenizer=tokenizer)

# GPT-2 is a causal language model, so masked language modeling is turned off.
collator_options = {"tokenizer": tokenizer, "mlm": False}
data_collator = DataCollatorForLanguageModeling(**collator_options)


Loading dataset...




In [None]:
# Step 3: Load Pretrained GPT-2 Model
print("Loading model...")
model_checkpoint = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

Loading model...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Step 4: Training Arguments
# NOTE(review): 40 epochs over a single short text file is a lot of passes;
# confirm this amount of training is intended.
training_args = TrainingArguments(
    output_dir="./gpt2_model",      # checkpoints written during training
    overwrite_output_dir=True,
    num_train_epochs=40,
    per_device_train_batch_size=2,
    save_steps=500,                 # checkpoint every 500 optimizer steps
    save_total_limit=2,             # keep only the two most recent checkpoints
    logging_dir="./logs",
    logging_steps=100,              # report training loss every 100 steps
    # Without an explicit report_to, the Trainer picks up the installed wandb
    # integration and stops at an interactive API-key prompt, which breaks an
    # unattended "Restart & Run All". Set this to "wandb" (after logging in)
    # if experiment tracking is wanted.
    report_to="none",
)

In [None]:
# Step 5: Train the Model
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_options)

print("Starting training...")
trainer.train()

# Save the trained model and its tokenizer side by side in one directory
finetuned_dir = "./gpt2_finetuned"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)
print("Training completed. Model saved.")



Starting training...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhari052002[0m ([33mhari052002-object-automation-software-solutions[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.7054
200,2.2097
300,1.9534
400,1.6922
500,1.4668
600,1.2698
700,1.0851
800,0.9265
900,0.794
1000,0.6814


Training completed. Model saved.


In [None]:
def generate_text(prompt, max_length=200, temperature=0.2, top_p=0.95, repetition_penalty=1.8):
    """Sample a continuation of ``prompt`` from the notebook-level ``model``.

    Uses the module-level ``model`` and ``tokenizer`` objects. The model is
    moved to CUDA when available (CPU otherwise) and switched to eval mode.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        repetition_penalty: Forwarded to ``model.generate`` as
            ``repetition_penalty``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect device
    model.to(device)  # Move model to the correct device
    model.eval()

    # Calling the tokenizer (rather than tokenizer.encode) also returns the
    # attention mask. Because the pad token is the same as the EOS token,
    # generate() cannot infer the mask on its own and warns about unreliable
    # results unless it is passed explicitly.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)  # Move inputs to device

    with torch.no_grad():
        output = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test Output
prompt = "What did Alice see when she opened the door, and why did she long to go outside?"
print("\nSample Generation:")
sample_text = generate_text(prompt)
print(sample_text)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Sample Generation:
What did Alice see when she opened the door, and why did she long to go outside? She heard a little pattering of feet in at her window as soon _she_ came out—and then another knock on Mrs. White’s front porch!
She looked all round but there was nothing else marked upon the house; nevertheless, it seemed quite likely that something had broken into Itself after wandering about like an old goose for some time.

 APTER XI.—A Caucus-Race And A Long Tale CHORUS 3.0 After dinner they sat down again very melancholyly (it occurred exactly once every three or four hours,) eating cake and drinking wine, while talking
about their gardeners, which puzzled poor William so much that he went bawling away without speaking till his eyes were full
full of tears: meanwhile Mary Ann told them how brave Heracles really are, saying to one of Them, “I am glad you invented such fancy glassware last


In [None]:
# List the files written to the fine-tuned model directory, with human-readable sizes.
!ls -lh ./gpt2_finetuned


total 477M
-rw-r--r-- 1 root root  874 Oct 23 10:38 config.json
-rw-r--r-- 1 root root  119 Oct 23 10:38 generation_config.json
-rw-r--r-- 1 root root 446K Oct 23 10:39 merges.txt
-rw-r--r-- 1 root root 475M Oct 23 10:39 model.safetensors
-rw-r--r-- 1 root root  470 Oct 23 10:39 special_tokens_map.json
-rw-r--r-- 1 root root  556 Oct 23 10:39 tokenizer_config.json
-rw-r--r-- 1 root root 5.7K Oct 23 10:39 training_args.bin
-rw-r--r-- 1 root root 976K Oct 23 10:39 vocab.json


In [None]:
import shutil
from google.colab import files

# Bundle the fine-tuned model directory into gpt2_finetuned.zip
shutil.make_archive(
    base_name='gpt2_finetuned',
    format='zip',
    root_dir='./gpt2_finetuned',
)

# Hand the archive to the browser as a download
files.download('gpt2_finetuned.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>