In [1]:
!pip install transformers datasets accelerate --quiet


In [2]:
# choose instruction_data.jsonl
from google.colab import files
uploaded = files.upload()


Saving instruction_data.jsonl to instruction_data (1).jsonl


In [3]:
# 🧠 Step 3: Load dataset and model
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import pandas as pd

# Load the dataset
df = pd.read_json("instruction_data.jsonl", lines=True)
dataset = Dataset.from_pandas(df)

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

# Tokenize with labels
def tokenize(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/58 [00:00<?, ? examples/s]

In [4]:
# ⚙️ Step 4: Set up training
training_args = TrainingArguments(
    output_dir="./gpt2-tasker",
    per_device_train_batch_size=4,  # Increased batch size
    num_train_epochs=5,  # Increased epochs
    save_steps=100,
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [5]:
# 🚀 Step 5: Train!
trainer.train()
# Save the fine-tuned model for pipeline use
trainer.save_model("./gpt2-tasker")
tokenizer.save_pretrained("./gpt2-tasker")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,0.6171


('./gpt2-tasker/tokenizer_config.json',
 './gpt2-tasker/special_tokens_map.json',
 './gpt2-tasker/vocab.json',
 './gpt2-tasker/merges.txt',
 './gpt2-tasker/added_tokens.json')

In [6]:
# 🧪 Step 6: Try generating a response
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2-tasker", tokenizer=tokenizer)

prompt = "User: create a task called dentist\nAssistant:\n"
output = generator(prompt, max_length=150, do_sample=True, temperature=0.7, top_p=0.9)[0]["generated_text"]

print(output)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


User: create a task called dentist
Assistant:
Dentist task added.
[TASK: Dentist | TIME: 07:00]


In [7]:
from transformers import pipeline

# Load the pipeline with the trained model and tokenizer
generator = pipeline("text-generation", model="./gpt2-tasker", tokenizer="./gpt2-tasker", device=0)  # cuda:0

prompt = "User: create a task called dentist\nAssistant:\n"

# Generate text with adjusted parameters
output = generator(
    prompt,
    max_new_tokens=50, # Reduced max_new_tokens
    temperature=0.8, # Increased temperature for more randomness
    top_p=0.95, # Adjusted top_p
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

generated_text = output[0]["generated_text"]

print(generated_text)

Device set to use cuda:0


User: create a task called dentist
Assistant:
Dentist task added.
[TASK: Dentist | TIME: 11:00]


second try


DistilGPT-2

In [8]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2", device=0)

output = generator(
    "User: create a task called dentist\nAssistant:\n",
    max_new_tokens=50,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

print(output[0]["generated_text"])


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User: create a task called dentist
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:
Assistant:


BART

In [9]:
from transformers import pipeline

# Carrega o modelo BART pré-treinado
bart_generator = pipeline("text2text-generation", model="facebook/bart-large", device=0)

prompt = "User: create a task called dentist\nAssistant:"

output = bart_generator(
    prompt,
    max_length=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

print(output[0]['generated_text'])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


User: create a task called dentistAssistant:


final test w gpt2


In [10]:
# Test the fine-tuned model with various prompts
from transformers import pipeline

# Load the pipeline with the trained model and tokenizer
generator = pipeline("text-generation", model="./gpt2-tasker", tokenizer="./gpt2-tasker", device=0)  # cuda:0

# Define test prompts and expected outputs
test_cases = [
    {
        "prompt": "User: create a task to buy groceries\nAssistant:\n",
        "expected": "[TASK: Buy groceries | TIME: 17:00]"
    },
    {
        "prompt": "User: remind me to call the doctor\nAssistant:\n",
        "expected": "[TASK: Call doctor | TIME: 10:30]"
    },
    {
        "prompt": "User: schedule a meeting with the project team\nAssistant:\n",
        "expected": "[TASK: Project team meeting | TIME: 14:00]"
    },
    {
        "prompt": "User: create a task for laundry\nAssistant:\n",
        "expected": "[TASK: Do laundry | TIME: 13:00]"
    },
    {
        "prompt": "User: remind me to water the plants\nAssistant:\n",
        "expected": "[TASK: Water plants | TIME: 09:00]"
    },
    {
        "prompt": "User: create a task to finish the report\nAssistant:\n",
        "expected": "[TASK: Finish report | TIME: 16:00]"
    },
    {
        "prompt": "User: add a task for a dentist appointment\nAssistant:\n",
        "expected": "[TASK: Dentist appointment | TIME: 11:00]"
    },
    {
        "prompt": "User: schedule a workout session\nAssistant:\n",
        "expected": "[TASK: Workout session | TIME: 06:30]"
    },
]

# Run the tests
for case in test_cases:
    output = generator(case["prompt"], max_new_tokens=50, temperature=0.8, top_p=0.95, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    generated_text = output[0]["generated_text"]

    print(f"Prompt: {case['prompt']}")
    print(f"Generated Output: {generated_text.strip()}")
    print(f"Expected Output: {case['expected']}")
    print("-" * 50)


Device set to use cuda:0


Prompt: User: create a task to buy groceries
Assistant:

Generated Output: User: create a task to buy groceries
Assistant:
Groceries task added.
[TASK: Buy groceries | TIME: 08:00]
Expected Output: [TASK: Buy groceries | TIME: 17:00]
--------------------------------------------------
Prompt: User: remind me to call the doctor
Assistant:

Generated Output: User: remind me to call the doctor
Assistant:
Doctor call.
[TASK: Call doctor | TIME: 19:00]
Expected Output: [TASK: Call doctor | TIME: 10:30]
--------------------------------------------------
Prompt: User: schedule a meeting with the project team
Assistant:

Generated Output: User: schedule a meeting with the project team
Assistant:
Meeting scheduled with the project team.
[TASK: Project team meeting | TIME: 14:00]
Expected Output: [TASK: Project team meeting | TIME: 14:00]
--------------------------------------------------
Prompt: User: create a task for laundry
Assistant:

Generated Output: User: create a task for laundry
Assista

# **download the trained model**

In [None]:
!zip -r gpt2-tasker.zip gpt2-tasker


updating: gpt2-tasker/ (stored 0%)
updating: gpt2-tasker/vocab.json (deflated 68%)
updating: gpt2-tasker/merges.txt (deflated 53%)
updating: gpt2-tasker/checkpoint-75/ (stored 0%)
updating: gpt2-tasker/checkpoint-75/vocab.json (deflated 68%)
updating: gpt2-tasker/checkpoint-75/merges.txt (deflated 53%)
updating: gpt2-tasker/checkpoint-75/rng_state.pth (deflated 25%)
updating: gpt2-tasker/checkpoint-75/special_tokens_map.json (deflated 74%)
updating: gpt2-tasker/checkpoint-75/trainer_state.json (deflated 56%)
updating: gpt2-tasker/checkpoint-75/scheduler.pt (deflated 56%)
updating: gpt2-tasker/checkpoint-75/training_args.bin (deflated 52%)
updating: gpt2-tasker/checkpoint-75/optimizer.pt (deflated 8%)
updating: gpt2-tasker/checkpoint-75/model.safetensors (deflated 7%)
updating: gpt2-tasker/checkpoint-75/tokenizer_config.json (deflated 56%)
updating: gpt2-tasker/checkpoint-75/config.json (deflated 51%)
updating: gpt2-tasker/checkpoint-75/generation_config.json (deflated 24%)
updating: gp