In [None]:
!pip install transformers datasets torch peft

In [None]:
import json
import pandas as pd
from datasets import Dataset

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments

## Carregar o Modelo

In [None]:
# Base checkpoint used for both the model weights and the tokenizer.
model_name = "sharpbai/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization step pads every example, which requires a pad token.
# Fall back to the EOS token when the checkpoint's tokenizer does not define
# one (Llama 2 tokenizers typically do not), otherwise padding raises an error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Preparação dos Dados

In [None]:
nome_base = "base_sintetica.json"

# Parse the whole JSON file up front; the handle is closed as soon as the
# `with` block exits.
with open(nome_base, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the question/answer pair of every item, renamed to the
# 'input' / 'output' columns that the tokenization step reads.
train_data = [
    {'input': item['Pergunta'], 'output': item['Resposta']}
    for item in data
]

# Go through a DataFrame to build the `datasets.Dataset`.
dataset = Dataset.from_pandas(pd.DataFrame(train_data))


In [None]:
# Display the first record to sanity-check the 'input' / 'output' columns.
dataset[0]

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each example becomes one sequence ``question tokens + answer tokens + EOS``.
    ``labels`` has the same length as ``input_ids`` and is set to ``-100``
    (the index ignored by the loss) on question and padding positions, so
    only the answer tokens contribute to the training loss.

    The previous implementation passed the answer through ``text_target=``,
    which yields labels tokenized independently from ``input_ids``; a causal
    LM needs the two to line up position by position.

    Args:
        examples: Batch mapping with parallel lists under ``'input'``
            (questions) and ``'output'`` (answers), as produced by
            ``Dataset.map(..., batched=True)``.
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
        each a list (one entry per example) of ``max_length`` ints.
    """
    # Pad with the tokenizer's pad id, or with EOS when no pad token is set.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for question, answer in zip(examples["input"], examples["output"]):
        # The question keeps the tokenizer's default special tokens; the
        # answer is appended raw and explicitly terminated with EOS so the
        # model learns where the response ends.
        prompt_ids = list(tokenizer(question)["input_ids"])
        answer_ids = list(tokenizer(answer, add_special_tokens=False)["input_ids"])
        answer_ids.append(tokenizer.eos_token_id)

        input_ids = (prompt_ids + answer_ids)[:max_length]
        # Mask the prompt so the loss is computed on the answer only.
        # NOTE: if the question alone fills max_length, every label is -100.
        labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]
        n_pad = max_length - len(input_ids)

        batch["input_ids"].append(input_ids + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        batch["labels"].append(labels + [-100] * n_pad)
    return batch

# Run the tokenizer over the dataset in batches, then expose only the
# model-facing columns, formatted as torch tensors.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

## Configuração do LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA settings for a causal-LM task: adapters with r=16 and lora_alpha=32
# are attached to the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    lora_alpha=32,
    r=16,
)

# Wrap the base model loaded earlier with the adapter configuration above.
lora_model = get_peft_model(model, lora_config)


## Configuração e Treinamento

In [None]:
# Hyperparameters for the run: outputs go to ./results, logs to ./logs
# (one log entry every 10 steps).
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Train the LoRA-wrapped model on the tokenized dataset; no evaluation set
# is provided.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    train_dataset=tokenized_datasets,
    eval_dataset=None,
)

trainer.train()


## Salvando o Modelo FineTuned

In [None]:
# Persist the LoRA-wrapped model and the tokenizer side by side in one folder.
save_dir = "./finetuned-llama"
lora_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Carregar o Modelo FineTuned

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Reload from the directory the fine-tuned model and tokenizer were saved to.
# This rebinds the notebook-level `tokenizer` and `model` names.
model_path = "./finetuned-llama"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

## Gerar Respostas com Prompts

In [None]:
def generate_response(prompt, model, tokenizer, max_length=150, num_beams=5):
    """Generate a beam-search completion for ``prompt`` and return it as text.

    Args:
        prompt: Text fed to the model.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes ``prompt`` into model inputs and
            provides ``decode`` to turn generated ids back into text.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Keep the inputs on the same device as the model when it exposes one,
    # so generation also works for a model that was moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        # Was misspelled `arly_stopping`, so early stopping was never applied.
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Example prompt: ask the reloaded model a question and print its answer.
prompt = "Qual é a Lei que regula o estágio obrigatório e não-obrigatório?"

resposta = generate_response(prompt=prompt, model=model, tokenizer=tokenizer)
print(resposta)