<a href="https://colab.research.google.com/github/lutfi-haslab/HasGPT/blob/main/HasGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U tqdm torch torchvision transformers

In [None]:
# !pip install transformers datasets torch

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
import torch
import json

# Read the question/answer records from disk.
with open('qa_data.json', 'r', encoding='utf-8') as qa_file:
    custom_dataset = json.load(qa_file)

# Render every record as a single "Question/Answer" training string that is
# closed by the end-of-text marker.
formatted_data = [
    {"text": f"Question: {item['question']}\nAnswer: {item['answer']}<|endoftext|>"}
    for item in custom_dataset
]

# Wrap the records in a Hugging Face dataset and hold out 10% as the test split.
dataset = Dataset.from_list(formatted_data).train_test_split(test_size=0.1)

# Tokenizer setup: the end-of-sequence token doubles as the padding token.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of examples and attach causal-LM labels.

    Args:
        examples: Batch mapping with a "text" entry holding the strings to
            tokenize (this function is used with ``dataset.map(batched=True)``).

    Returns:
        The tokenizer output (PyTorch tensors, every sequence padded or
        truncated to 128 tokens) with an extra "labels" entry.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="pt"
    )

    # For causal LM the labels mirror input_ids, but padding positions must
    # not contribute to the loss. The pad token is the same id as the
    # end-of-text token, so padding is identified through attention_mask
    # (0 = padding) rather than by token id; this keeps the real end-of-text
    # token as a training target. -100 is the label value the loss ignores.
    tokenized["labels"] = tokenized["input_ids"].masked_fill(
        tokenized["attention_mask"] == 0, -100
    )
    return tokenized

# Tokenize both splits in batches; the raw "text" column is dropped so only
# the fields produced by tokenize_function remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# Load pre-trained model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): recent transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch; cap how many are kept on disk so a
    # 50-epoch run does not accumulate 50 checkpoints. The best checkpoint is
    # still retained for load_best_model_at_end.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Mixed precision needs a CUDA device (e.g. a T4 GPU); fall back to full
    # precision on a CPU-only runtime instead of failing.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

# Start training
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the "hasgpt"
# directory can be loaded on its own later.
model.save_pretrained("hasgpt")
tokenizer.save_pretrained("hasgpt")

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.0663,0.050152
2,0.0415,0.037761
3,0.039,0.035226
4,0.0366,0.035532
5,0.0362,0.035285
6,0.0357,0.035207
7,0.0354,0.034776
8,0.0389,0.034443
9,0.035,0.034218
10,0.0359,0.034131


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('hasgpt/tokenizer_config.json',
 'hasgpt/special_tokens_map.json',
 'hasgpt/vocab.json',
 'hasgpt/merges.txt',
 'hasgpt/added_tokens.json',
 'hasgpt/tokenizer.json')

In [None]:
from transformers import pipeline

# Use CUDA device 0 when a GPU is visible; otherwise pass -1 to the pipeline.
device_index = 0 if torch.cuda.is_available() else -1

# Text-generation pipeline built around the in-memory fine-tuned model.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device_index,
)

# The prompt follows the same "Question/Answer" layout as the training text.
prompt = "Question: what is your name?\nAnswer:"

# Decoding options forwarded to the generation call.
generation_options = {
    "max_length": 100,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "no_repeat_ngram_size": 2,
    "pad_token_id": tokenizer.eos_token_id,
}
output = generator(prompt, **generation_options)

print(output[0]['generated_text'])

Device set to use cuda:0


Question: what is your name?
Answer: My Name is HasGPT.
