In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
!pip install transformers datasets torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


In [91]:
import os
import json
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [92]:
file_path = "/content/drive/MyDrive/Colab Notebooks/trn.json"
data = []
with open(file_path, "r") as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [94]:
# Reduzindo o tamanho do dataset para viabilizaro o treinamento
df = df.sample(frac=0.005, random_state=42)
df[["title", "content"]].head(10)

Unnamed: 0,title,content
1058838,UGG Women's Bailey Button Boot,
396213,"Naruto, Vol. 21: Pursuit",Masashi Kishimoto made his debut in Weekly Sho...
1522632,URO Parts 61 31 8 361 787 Water Temperature Sw...,This part is compatible with Vehicle:318i- 199...
1410325,,"InThe Velvet Vampire, a couple accepts an invi..."
546471,Aston University 'Branding' Bundle: Creating P...,"'The Logical structure, conceptual clarity and..."
635422,Through the Looking Glass,
541674,Human Growth and Development,"""An excellent text. Engages the reader and pre..."
1556793,Wasgij 1000 Pv Mystery Puzzle - Dog Show,Since 1891 we've been making the finest puzzle...
1220258,Boys Capes Combo Value Set Bat Superhero Pirat...,
835634,StrongArm 4992 Buick Rendezvous Tailgate 2002-05,StrongArm gas charged lift supports are custom...


In [95]:
# Tokenizador
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Definir o pad_token como eos_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["title"], examples["content"], padding="max_length", truncation=True)

In [96]:
# Criar dataset Hugging Face
dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [97]:
from transformers import DataCollatorForLanguageModeling

# DataCollator para treinamento de modelos de linguagem causal
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Modelo para Fine-Tuning
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

In [98]:
# Configuração de treinamento
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/tech-challenge-3/results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/tech-challenge-3/logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Treinamento
trainer.train()

# Salvar modelo
model.save_pretrained("/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama")
tokenizer.save_pretrained("/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama")



Epoch,Training Loss,Validation Loss
1,4.5977,4.178246
2,4.567,3.923922
3,3.8282,3.830549


('/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/tokenizer_config.json',
 '/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/special_tokens_map.json',
 '/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/vocab.json',
 '/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/merges.txt',
 '/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/added_tokens.json',
 '/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama/tokenizer.json')

In [115]:
# Método para testar o modelo
def test_model(model_path, sample_text):
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=100)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

sample_text = "Bulk Pirate Jewels and Gems"

In [116]:
# Teste com o modelo original
original_prediction = test_model("gpt2", sample_text)
print(f"Generated text with original model: {original_prediction}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text with original model: Bulk Pirate Jewels and Gems

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in the game.

The following items are not available in


In [117]:
# Teste com o modelo após fine-tuning
fine_tuned_prediction = test_model("/content/drive/MyDrive/tech-challenge-3/fine_tuned_llama", sample_text)
print(f"Generated text with fine-tuned model: {fine_tuned_prediction}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text with fine-tuned model: Bulk Pirate Jewels and Gems

The Pirate Jewels and Gems are a unique set of jewelry that are made from the finest materials and are made from the finest materials. The Pirate Jewels and Gems are made from the finest materials and are made from the finest materials. The Pirate Jewels and Gems are made from the finest materials and are made from the finest materials. The Pirate Jewels and Gems are made from the finest materials and are made from the finest materials. The Pirate Jewels
