# Creating the model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Load GPT-2 tokenizer and model
model_name = "gpt2"  # Change to "gpt2-medium" or "gpt2-large" if needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token # or '[PAD]' if you prefer
model = GPT2LMHeadModel.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/gpt2-business /content/drive/MyDrive/gpt2-business_backup
print("✅ Checkpoints copied to Google Drive!")


Mounted at /content/drive
cp: cannot stat '/content/gpt2-business': No such file or directory
✅ Checkpoints copied to Google Drive!


## Tokenizer function


In [None]:
def tokenize_function(examples):
    # Tokenize the text
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # Create labels (targets for the model) - same as input_ids for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

## fetching datasets

In [None]:
# Load datasets using streaming mode to prevent storage issues
openwebtext = load_dataset("openwebtext", split="train", streaming=True)
business_data = load_dataset("allenai/c4", data_files="en/c4-train.00000-of-01024.json.gz", split="train")


# Convert streaming datasets to lists (Select only a subset for faster training)
openwebtext = list(openwebtext.take(30000))  # 30K samples
business_data = list(business_data.take(30000))  # 5K samples

# Combine datasets
combined_data = openwebtext + business_data

README.md:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

openwebtext.py:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

The repository for openwebtext contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/openwebtext.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


README.md:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

c4-train.00000-of-01024.json.gz:   0%|          | 0.00/319M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Tokenization

In [None]:

tokenized_datasets = [tokenize_function(example) for example in combined_data]


In [None]:
import random
random.shuffle(tokenized_datasets)

train_dataset = tokenized_datasets[:int(0.9 * len(tokenized_datasets))]  # 90% for training
eval_dataset = tokenized_datasets[int(0.9 * len(tokenized_datasets)):]  # 10% for evaluation


## Training the model

In [9]:
from transformers import Trainer, TrainingArguments, TrainerCallback

training_args = TrainingArguments(
    output_dir="./gpt2-business",  # Save trained model here
    per_device_train_batch_size=4,  # Adjust for Colab GPU
    num_train_epochs=3,  # More epochs = better results
    logging_dir="./logs",  # Save logs
    fp16=True,  # Mixed precision for speed
    save_total_limit=2,  # Keep only last 2 checkpoints
    save_steps=5000,  # Save every 5000 steps
    evaluation_strategy="steps",
    eval_steps=5000,  # Evaluate every 5000 steps
    load_best_model_at_end=True,  # Auto-load best model after training
    save_strategy="steps",  # Make sure saving happens during training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Pass the training dataset
    eval_dataset=eval_dataset,
)

# 🚀 **Resume Training from Last Checkpoint if Available**
import os

last_checkpoint = None
if os.path.isdir(training_args.output_dir) and any("checkpoint" in x for x in os.listdir(training_args.output_dir)):
    last_checkpoint = training_args.output_dir
    print(f"✅ Found a checkpoint! Resuming training from {last_checkpoint}")

trainer.train(resume_from_checkpoint=last_checkpoint)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfaadamms[0m ([33mfaadamms-kwame-nkrumah-university-of-science-and-technol[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5000,2.3375,2.274096
10000,2.3461,2.266402
15000,2.2557,2.263587


Step,Training Loss,Validation Loss
5000,2.3375,2.274096
10000,2.3461,2.266402
15000,2.2557,2.263587
20000,2.2796,2.261977
25000,2.2406,2.255833
30000,2.1802,2.257559
35000,2.2376,2.255338
40000,2.2304,2.253193


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=40500, training_loss=2.2797941804108794, metrics={'train_runtime': 11525.0024, 'train_samples_per_second': 14.056, 'train_steps_per_second': 3.514, 'total_flos': 4.2329309184e+16, 'train_loss': 2.2797941804108794, 'epoch': 3.0})

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Save the model

In [None]:
# model.save_pretrained("./gpt2-business/trained")
# tokenizer.save_pretrained("./gpt2-business/trained")


('./gpt2-business/trained/tokenizer_config.json',
 './gpt2-business/trained/special_tokens_map.json',
 './gpt2-business/trained/vocab.json',
 './gpt2-business/trained/merges.txt',
 './gpt2-business/trained/added_tokens.json')

## Test model


In [9]:
import torch
print(torch.__version__)  # Check PyTorch version
print(torch.cuda.is_available())  #

2.6.0+cpu
False


In [1]:
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline("text-generation", model="./gpt2-business/trained", framework="pt")

# Generate a business-related text sample
output = generator(
    "An AI-powered chatbot designed for customer support. It provides 24/7 assistance, automates responses, reduces workload, and enhances customer satisfaction. It can instantly answer FAQs, process support tickets, and escalate complex issues to human agents when necessary. Describe its features professionally.",
    max_length=150,
    truncation=True,
    temperature=0.35,  # Even more controlled
    top_k=20,  # Limited choices for better coherence
    top_p=0.75,  # Further restricts randomness
    repetition_penalty=2.5,  # Stronger penalty to avoid weird loops
    do_sample=True,
    num_return_sequences=1
)

# generator(
#     "An AI-powered chatbot designed for customer support, providing 24/7 assistance, automating responses, reducing workload, and improving customer satisfaction. Describe its features in a professional and engaging way.",
#     max_length=150,
#     truncation=True,
#     temperature=0.5,  # Even less randomness
#     top_k=20,  # Tighter word selection
#     top_p=0.8,  # Keeps responses on track
#     repetition_penalty=2.2,  # Stronger repetition control
#     do_sample=True,
#     num_return_sequences=1
# )
# generator(
#     "Describe an AI-powered chatbot designed for customer support. It provides 24/7 assistance, automates responses, reduces workload, and improves customer satisfaction.",
#     max_length=150,
#     truncation=True,
#     temperature=0.6,  # Reduces randomness
#     top_k=30,  # Limits vocabulary for coherence
#     top_p=0.85,  # Ensures relevant words
#     repetition_penalty=2.0,  # Reduces repeated words
#     do_sample=True,
#     num_return_sequences=1
# )


# generator(
#     "Write a clear and professional product description for an AI-powered chatbot that helps businesses automate customer support:",
#     max_length=100,
#     truncation=True,
#     temperature=0.7,
#     top_k=50,
#     top_p=0.9,
#     repetition_penalty=1.8,  # Reduce weird phrasing
#     do_sample=True,
#     num_return_sequences=1
# )




print(output)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


[{'generated_text': 'An AI-powered chatbot designed for customer support. It provides 24/7 assistance, automates responses, reduces workload, and enhances customer satisfaction. It can instantly answer FAQs, process support tickets, and escalate complex issues to human agents when necessary. Describe its features professionally.\nA conversational voice assistant that helps you understand your customers’ needs as well the challenges they face in their daily lives with ease of use (eLearning).'}]
