<a href="https://colab.research.google.com/github/nunesjoab/tech_challenge_4/blob/main/TC3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` capture syntax).
gpu_info = !nvidia-smi
# Collapse the captured lines into a single printable string.
gpu_info = '\n'.join(gpu_info)
# `str.find` returns -1 when the substring is absent, so a non-negative result
# means the word 'failed' appears in the output, which is reported as "no GPU".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon May 26 10:19:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   29C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from psutil import virtual_memory

# Read the `total` field reported by psutil and convert bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything below 20 GB is reported as a standard (not high-RAM) runtime.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Install the required dependencies
!pip install transformers datasets torch
# Second pass upgrades transformers (`-U`) after the initial install.
!pip install -U transformers

import json
import html
import random
import torch
import zipfile
import gzip
import time
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Read the .json file that shares its base name with the .zip archive
def load_data_from_zip_2(zip_file):
    """Load a JSON-lines member named after the archive itself.

    For ``foo.zip`` the member ``foo.json`` is read, one JSON document per
    line. Blank or whitespace-only lines are skipped so that stray empty
    lines in the file do not abort the whole load.

    Args:
        zip_file: Path to the ZIP archive.

    Returns:
        A ``(train_data, test_data)`` tuple. Both elements are the *same*
        list object: the single file is used as both train and test set.

    Raises:
        FileNotFoundError: If the expected ``.json`` member is not in the archive.
    """
    json_filename = f"{Path(zip_file).stem}.json"

    with zipfile.ZipFile(zip_file, 'r') as z:
        if json_filename not in z.namelist():
            raise FileNotFoundError(f"O arquivo {json_filename} não foi encontrado dentro do ZIP.")
        with z.open(json_filename) as f:
            # Lines arrive as bytes; json.loads accepts bytes directly.
            data = [json.loads(line) for line in f if line.strip()]
    return data, data  # same content is used as both train and test

def _read_gzipped_json_lines(archive, member):
    """Decode one gzip-compressed JSON-lines member of an open ZipFile into a list."""
    with archive.open(member) as raw:
        with gzip.open(raw, 'rt', encoding='utf-8') as json_file:
            # Skip blank lines so a stray empty line does not abort the load.
            return [json.loads(line) for line in json_file if line.strip()]


def load_data_from_zip(zip_file,
                       train_member='LF-Amazon-1.3M/trn.json.gz',
                       test_member='LF-Amazon-1.3M/tst.json.gz'):
    """Load the train and test splits stored as gzipped JSON-lines inside a ZIP.

    Args:
        zip_file: Path to the ZIP archive.
        train_member: Archive path of the gzipped training split.
        test_member: Archive path of the gzipped test split.

    Returns:
        A ``(train_data, test_data)`` tuple of lists, one decoded JSON
        document per non-blank line of each member.

    Raises:
        KeyError: If either member is missing from the archive
            (raised by ``ZipFile.open``).
    """
    with zipfile.ZipFile(zip_file, 'r') as z:
        train_data = _read_gzipped_json_lines(z, train_member)
        test_data = _read_gzipped_json_lines(z, test_member)

    return train_data, test_data

# Show the raw data
def display_raw_train_data(train_data):
    """Print a header followed by the first five raw training records."""
    print("\nPrimeiras 5 linhas do conjunto de treinamento (raw):")
    preview = train_data[:5]
    # Guard the empty case: an unconditional print would emit a blank line.
    if preview:
        print("\n".join(str(row) for row in preview))

# Show examples of the loaded data
def display_loaded_examples(train_data, test_data):
    """Print the first five records of the freshly loaded train and test sets.

    The training preview is delegated to ``display_raw_train_data``; the
    test preview is printed here.
    """
    print("\nExemplos dos dados carregados:")
    display_raw_train_data(train_data)
    print("\nPrimeiras 5 linhas do conjunto de teste:")
    test_preview = test_data[:5]
    # Only print when there is something to show, so no stray blank line appears.
    if test_preview:
        print(*test_preview, sep="\n")

# Pre-process the data
def preprocess_data(data):
    """Clean titles/contents and keep only records where both are non-empty.

    For each record the ``title`` and ``content`` fields are HTML-unescaped
    and then stripped. Unescaping happens first so that entity-encoded
    whitespace (e.g. ``&nbsp;``) is trimmed as well and cannot make an
    otherwise empty field look non-empty. Missing, ``None`` or non-string
    fields are treated as empty.

    Args:
        data: Iterable of dict records.

    Returns:
        A list with the records whose cleaned title and content are both
        non-empty. Kept records are updated in place (the returned list
        holds the same dict objects as ``data``); dropped records are left
        untouched.
    """
    def clean(value):
        if not isinstance(value, str):
            return ''
        text = html.unescape(value).strip()
        # Replace characters that cannot be encoded as UTF-8 (e.g. lone
        # surrogates produced by JSON escapes) instead of raising.
        return text.encode('utf-8', errors='replace').decode('utf-8')

    filtered_data = []
    for item in data:
        title = clean(item.get('title'))
        content = clean(item.get('content'))
        if title and content:
            item['title'] = title
            item['content'] = content
            filtered_data.append(item)
    return filtered_data

# Show the processed data
def display_examples(train_data, test_data):
    """Print the first five records of the train set and of the test set."""
    def show_head(rows):
        for row in rows[:5]:
            print(row)

    print("\nExemplos dos dados tratados:")
    show_head(train_data)
    print("\nPrimeiras 5 linhas do conjunto de teste:")
    show_head(test_data)

# Initialise the BERT model
def initialize_model():
    """Load the ``bert-base-uncased`` checkpoint with a two-label classification head."""
    checkpoint = 'bert-base-uncased'
    classifier = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return classifier

# Build the labelled text pairs used for fine-tuning
def _build_pair_examples(records):
    """Create positive and negative "question + context" texts from *records*.

    Positive pairs (label 1) join a record's title-question with its own
    content. Negative pairs (label 0) join a record's title-question with
    the content of a randomly drawn different record (compared by ``uid``).

    Returns:
        ``(texts, labels)`` — two parallel lists.
    """
    texts, labels = [], []
    for item in records:
        context = item['content'].strip()
        if context:
            texts.append(f"What is {item['title']}? {context}")
            labels.append(1)

    # Draw one random partner per positive pair; a partner that turns out to
    # be the same record is skipped rather than re-drawn.
    partners = random.sample(records, len(texts))
    for item, partner in zip(records, partners):
        if item['uid'] == partner['uid']:
            continue
        context = partner['content'].strip()
        if context:
            texts.append(f"What is {item['title']}? {context}")
            labels.append(0)

    return texts, labels


# Fine-tune the model
def fine_tune_model(model, train_data, sample_size=100_000, epochs=1, batch_size=8):
    """Fine-tune *model* as a binary title/content matching classifier.

    A random sample of ``train_data`` is turned into matching (label 1) and
    mismatching (label 0) "What is <title>? <content>" texts, tokenized with
    the ``bert-base-uncased`` tokenizer and trained with ``Trainer``.

    Sequences are tokenized without padding and padded per batch by
    ``DataCollatorWithPadding``, so memory and compute scale with the
    longest sequence of each batch instead of the longest sequence of the
    whole sample.

    Args:
        model: Sequence-classification model to train (modified in place).
        train_data: Sequence of dict records with ``uid``, ``title`` and
            ``content`` keys.
        sample_size: Maximum number of records sampled from ``train_data``.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The same ``model`` object after training.

    Raises:
        ValueError: If ``train_data`` is empty or yields no usable examples.
    """
    if not train_data:
        raise ValueError("train_data está vazio; não há dados para o fine-tuning.")

    # Imported locally: only this function needs the collator.
    from transformers import DataCollatorWithPadding

    start = time.perf_counter()  # monotonic clock for elapsed-time measurement

    # bert-base-uncased already defines a [PAD] token, so no fallback is needed.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Sampling
    sampled = random.sample(train_data, min(sample_size, len(train_data)))
    inputs, labels = _build_pair_examples(sampled)
    if not inputs:
        raise ValueError("Nenhum exemplo de treinamento válido foi gerado.")

    # Tokenization (no padding here; the collator pads each batch)
    encodings = tokenizer(inputs, truncation=True)

    # Custom dataset yielding one un-padded example plus its label
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, enc, targets):
            self.enc = enc
            self.targets = targets

        def __getitem__(self, idx):
            example = {k: v[idx] for k, v in self.enc.items()}
            example['labels'] = self.targets[idx]
            return example

        def __len__(self):
            return len(self.targets)

    dataset = CustomDataset(encodings, labels)

    # Custom metrics (only invoked if an evaluation is run on the trainer)
    def compute_metrics(pred):
        y_true = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
        acc = accuracy_score(y_true, y_pred)
        return {
            "accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        logging_dir="./logs",
        logging_steps=1500,
        save_strategy="no",
        disable_tqdm=False,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()
    end = time.perf_counter()
    print(f"\n✅ Fine-tuning concluído em {(end - start)/60:.2f} minutos.", flush=True)

    return model

# Evaluation with real questions
def execute_prompt(model):
    """Classify a fixed set of question/context prompts and print the verdicts.

    Each prompt is sent to the model as a single "question context" string.
    A predicted class of 1 is printed as "Correta"; any other class as
    "Incorreta". Inference runs under ``torch.no_grad()`` so no autograd
    graph is built.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.eval()

    # Inputs must live on the same device as the model.
    device = model.device

    prompts = [
        {
            "question": "What is Girls Ballet Tutu Neon Pink?",
            "context": "High quality 3 layer ballet tutu. 12 inches in length"
        },
        {
            "question": "What is Adult Ballet Tutu Yellow?",
            "context": "Elegant adult-sized yellow tutu suitable for performances."
        },
        {
            "question": "What is The Way Things Work: An Illustrated Encyclopedia of Technology?",
            "context": "A visual encyclopedia that explains how modern machines function."
        },
        {
            "question": "What is Mog's Kittens?",
            "context": "Judith Kerr's board book for toddlers about the cat Mog."
        },
        {
            "question": "What is Misty of Chincoteague?",
            "context": "A children's novel by Marguerite Henry about a wild pony."
        }
    ]

    print("\nResultados do prompt:")
    # Gradients are not needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        for item in prompts:
            input_text = item['question'] + " " + item['context']
            inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)

            # Move the input tensors to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            prediction = torch.argmax(outputs.logits, dim=-1).item()
            print(f"Pergunta: {item['question']}")
            print(f"Resposta prevista: {'✅ Correta' if prediction == 1 else '❌ Incorreta'}\n")

# Rest of the code remains the same...

# Evaluation on the test data
def refined_predictions(model, test_data, sample_size=100):
    """Classify a random sample of test records and print accuracy and F1.

    Every sampled record is turned into "What is <title>? <content>" and is
    assumed to be a correct pair (expected label 1), so the reported
    metrics only measure how often the model accepts true pairs.

    Records whose ``content`` is missing, ``None`` or blank are ignored. If
    no usable record remains, a message is printed and the function returns
    before the tokenizer or model is touched.

    Args:
        model: Sequence-classification model; it is switched to eval mode.
        test_data: Iterable of dict records with ``title`` and ``content``.
        sample_size: Maximum number of records to evaluate.
    """
    print("\nPredições refinadas (amostra):\n")

    candidates = [item for item in test_data if (item.get('content') or '').strip()]
    sampled = random.sample(candidates, min(sample_size, len(candidates)))
    if not sampled:
        # Metrics are undefined on an empty sample; stop before loading anything.
        print("Nenhum exemplo de teste com conteúdo disponível.")
        return

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.eval()

    # Inputs must live on the same device as the model.
    device = model.device

    predictions = []

    # Gradients are not needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        for item in sampled:
            content = item['content'].strip()
            question = f"What is {item.get('title', '')}?"

            input_text = question + " " + content
            inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)

            # Move the input tensors to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            prediction = torch.argmax(outputs.logits, dim=-1).item()
            predictions.append(prediction)

            print(f"Pergunta: {question}")
            print(f"Resposta prevista: {'👍 Correta' if prediction == 1 else '👎 Incorreta'}")

    # Simulated labels: a title paired with its own content is always "correct".
    references = [1] * len(predictions)

    from sklearn.metrics import accuracy_score, f1_score
    acc = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions)

    print(f"\n📊 Acurácia: {acc:.2f} | F1-score: {f1:.2f}")


# Interactive menu
def main_menu():
    """Run the interactive text menu that drives the whole pipeline.

    Options: 1 load data, 2 clean data, 3 initialise the model, 4 run the
    fixed prompts, 5 fine-tune (then re-run the prompts), 6 evaluate on
    test data, 0 exit. Options that depend on an earlier step print a
    message instead of failing when that step has not been run.

    The loaded ``train_data`` and ``test_data`` are stored as module-level
    globals; the model and the cleaned data are local to this call.
    """
    # Declared up front: the assignment in option 1 writes module-level names.
    global train_data, test_data
    model = None
    filtered_train_data = None
    filtered_test_data = None
    while True:
        print("\nMenu:")
        print("1. Carregar dados")
        print("2. Filtrar e tratar dados")
        print("3. Inicializar modelo BERT")
        print("4. Executar previsões iniciais")
        print("5. Fine-tuning do modelo")
        print("6. Executar previsões refinadas")
        print("0. Sair")

        choice = input("Escolha uma opção: ").strip()

        if choice == '1':
            zip_file = '/content/drive/MyDrive/Colab Notebooks/LF-Amazon-1.3M.raw.zip'
            train_data, test_data = load_data_from_zip(zip_file)
            display_loaded_examples(train_data, test_data)

        elif choice == '2':
            # The globals only exist once option 1 has run (now or in an
            # earlier call); reading them before that raises NameError.
            try:
                raw_train, raw_test = train_data, test_data
            except NameError:
                print("Carregue os dados primeiro (opção 1).")
                continue
            filtered_train_data = preprocess_data(raw_train)
            # Clean the test split too, so evaluation sees the same text
            # normalisation as training.
            filtered_test_data = preprocess_data(raw_test)
            display_examples(filtered_train_data, filtered_test_data)

        elif choice == '3':
            model = initialize_model()
            print("Modelo BERT inicializado com sucesso.")

        elif choice == '4':
            if model is not None:
                execute_prompt(model)
            else:
                print("Modelo não inicializado.")

        elif choice == '5':
            if model is not None and filtered_train_data:
                model = fine_tune_model(model, filtered_train_data)
                print("Fine-tuning concluído.")
                execute_prompt(model)  # show the fixed prompts again after training
            else:
                print("Faltam dados ou modelo.")

        elif choice == '6':
            if model is None:
                print("Treine o modelo antes.")
                continue
            # Prefer the cleaned test split; fall back to the raw one.
            eval_data = filtered_test_data
            if eval_data is None:
                try:
                    eval_data = test_data
                except NameError:
                    print("Carregue os dados primeiro (opção 1).")
                    continue
            refined_predictions(model, eval_data)

        elif choice == '0':
            print("Saindo.")
            break

        else:
            print("Opção inválida.")

# Run the menu, but only when this code is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main_menu()



Menu:
1. Carregar dados
2. Filtrar e tratar dados
3. Inicializar modelo BERT
4. Executar previsões iniciais
5. Fine-tuning do modelo
6. Executar previsões refinadas
0. Sair

Exemplos dos dados carregados:

Primeiras 5 linhas do conjunto de treinamento (raw):
{'uid': '0000031909', 'title': 'Girls Ballet Tutu Neon Pink', 'content': 'High quality 3 layer ballet tutu. 12 inches in length', 'target_ind': [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 111], 'target_rel': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}
{'uid': '0000032034', 'title': 'Adult Ballet Tutu Yellow', 'content': '', 'target_ind': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 33, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'target_rel': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modelo BERT inicializado com sucesso.

Menu:
1. Carregar dados
2. Filtrar e tratar dados
3. Inicializar modelo BERT
4. Executar previsões iniciais
5. Fine-tuning do modelo
6. Executar previsões refinadas
0. Sair

Resultados do prompt:
Pergunta: What is Girls Ballet Tutu Neon Pink?
Resposta prevista: ❌ Incorreta

Pergunta: What is Adult Ballet Tutu Yellow?
Resposta prevista: ❌ Incorreta

Pergunta: What is The Way Things Work: An Illustrated Encyclopedia of Technology?
Resposta prevista: ❌ Incorreta

Pergunta: What is Mog's Kittens?
Resposta prevista: ❌ Incorreta

Pergunta: What is Misty of Chincoteague?
Resposta prevista: ❌ Incorreta


Menu:
1. Carregar dados
2. Filtrar e tratar dados
3. Inicializar modelo BERT
4. Executar previsões iniciais
5. Fine-tuning do modelo
6. Executar previsões refinadas
0. Sair


Step,Training Loss
1500,0.7071
3000,0.7047
4500,0.704
6000,0.7036
7500,0.7003
9000,0.6997
10500,0.6976
12000,0.6959
13500,0.6957
15000,0.6955



✅ Fine-tuning concluído em 77.61 minutos.
Fine-tuning concluído.

Resultados do prompt:
Pergunta: What is Girls Ballet Tutu Neon Pink?
Resposta prevista: ✅ Correta

Pergunta: What is Adult Ballet Tutu Yellow?
Resposta prevista: ✅ Correta

Pergunta: What is The Way Things Work: An Illustrated Encyclopedia of Technology?
Resposta prevista: ✅ Correta

Pergunta: What is Mog's Kittens?
Resposta prevista: ✅ Correta

Pergunta: What is Misty of Chincoteague?
Resposta prevista: ✅ Correta


Menu:
1. Carregar dados
2. Filtrar e tratar dados
3. Inicializar modelo BERT
4. Executar previsões iniciais
5. Fine-tuning do modelo
6. Executar previsões refinadas
0. Sair

Predições refinadas (amostra):

Pergunta: What is Cars, Energy, Nuclear Diplomacy and the Law: A Reflective Memoir of Three Generations?
Resposta prevista: 👍 Correta
Pergunta: What is Comprehension &amp; Collaboration: Inquiry Circles in Action?
Resposta prevista: 👍 Correta
Pergunta: What is Cressi Playa 2.5mm Men's Front Zip Shorty Wets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
