In [1]:
!pip install transformers torch tensorflow pandas tqdm



In [2]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import re
import pickle
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

In [3]:
BATCH_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Device: cuda


### **Limpeza**

In [4]:
def clean_text_light(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\(?reuters\)?\s*-?", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


### **Truncamento**

In [5]:
def smart_truncate(text, tokenizer, max_len):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    if len(tokens) > max_len:
        tokens = tokens[:max_len]
    return tokens

### **Carregar Configuração e Modelo**

In [6]:
# Arquivos
MODEL_FILE = "bilstm_classifier.keras"
CONFIG_FILE = "model_metadata.pkl"
TOKENIZER_FOLDER = "tokenizer.json"

# Carregar metadados
with open(CONFIG_FILE, "rb") as f:
    configuracoes = pickle.load(f)

best_thresh = configuracoes["best_threshold"]
MAX_LEN = configuracoes["max_len"]

print("Threshold:", best_thresh)
print("Max Len:", MAX_LEN)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_FOLDER)

# Classificador BiLSTM
model = tf.keras.models.load_model(MODEL_FILE)

# BERT
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
bert_model.eval()

Threshold: 0.1
Max Len: 512


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSelfAttention(
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

### **Carregar Test.csv**

In [7]:
df_test = pd.read_csv("test.csv")

df_test["title"] = df_test["title"].fillna("")
df_test["text"] = df_test["text"].fillna("")

df_test["text_light"] = (
    df_test["title"] + " " + df_test["text"]
).apply(clean_text_light)

X_test_text = df_test["text_light"].values

### **Inferência**

In [8]:
todas_probabilidades = []

for i in tqdm(range(0, len(X_test_text), BATCH_SIZE)):
    batch_texts = X_test_text[i:i+BATCH_SIZE]

    input_ids, attention_masks = [], []

    for text in batch_texts:
        ids = smart_truncate(text, tokenizer, MAX_LEN)

        if len(ids) < MAX_LEN:
            ids += [tokenizer.pad_token_id] * (MAX_LEN - len(ids))

        mask = [1 if token != tokenizer.pad_token_id else 0 for token in ids]

        input_ids.append(ids)
        attention_masks.append(mask)

    input_ids_pt = torch.tensor(input_ids).to(device)
    attention_masks_pt = torch.tensor(attention_masks).to(device)

    with torch.no_grad():
        outputs = bert_model(
            input_ids_pt,
            attention_mask=attention_masks_pt
        )

        embeddings = outputs.last_hidden_state

        mask_expanded = attention_masks_pt.unsqueeze(-1).expand(embeddings.size()).float()
        embeddings = embeddings * mask_expanded

    preds = model.predict(embeddings.cpu().numpy(), verbose=0)

    todas_probabilidades.extend(preds.flatten())

100%|██████████| 179/179 [02:35<00:00,  1.15it/s]


In [9]:
y_pred_test = (
    np.array(todas_probabilidades) > best_thresh
).astype(int)

submission = pd.DataFrame({
    "id": df_test["id"],
    "label": y_pred_test
})

submission.to_csv("submission.csv", index=False)