In [1]:
from pathlib import Path

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# 3. Load the tokenizer and the BERT model for Portuguese.
# The tokenizer is fetched by the checkpoint name in `model_name`, while the
# sequence-classification weights are read from the local directory
# "modelo/mod02_bert_final/".
# NOTE(review): presumably that directory holds a fine-tune of `model_name`, so the
# tokenizer vocabulary matches the weights — confirm.
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained("modelo/mod02_bert_final/")

In [5]:
# 1. Custom Dataset for training/validation
class TextDataset(Dataset):
    """Tokenize the ``ds_bem_candidato_2`` column of a DataFrame one row at a time.

    Args:
        dataframe: pandas DataFrame with a ``ds_bem_candidato_2`` text column and,
            when ``is_train`` is true, a ``y`` label column.
        tokenizer: callable tokenizer; it is invoked with the text plus the
            padding/truncation keyword arguments used in ``__getitem__`` and must
            return a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: length every text is padded or truncated to.
        is_train: when true, each item also carries a ``labels`` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, is_train=True):
        # Re-index to 0..n-1 so the integer passed to __getitem__ is a valid .loc label.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the encoded row ``index`` as a dict of 1-D tensors.

        Keys are ``input_ids`` and ``attention_mask``, plus ``labels`` (a
        ``torch.long`` scalar built from column ``y``) when ``is_train`` is true.
        """
        text = self.dataframe.loc[index, 'ds_bem_candidato_2']
        if not isinstance(text, str):
            # Missing values (None/NaN) become an empty string and any other
            # non-string value is stringified, instead of failing in the tokenizer.
            text = "" if pd.isna(text) else str(text)

        # Calling the tokenizer directly replaces the older `encode_plus` entry point.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            # squeeze(0) drops only the leading batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        if self.is_train:
            label = self.dataframe.loc[index, 'y']
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Dataset for application/inference (no labels)
class AppDataset(Dataset):
    """Tokenize the ``ds_bem_candidato_2`` column of a DataFrame for inference.

    Args:
        dataframe: pandas DataFrame with a ``ds_bem_candidato_2`` text column.
        tokenizer: callable tokenizer; it is invoked with the text plus the
            padding/truncation keyword arguments used in ``__getitem__`` and must
            return a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: length every text is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # Re-index to 0..n-1 so the integer passed to __getitem__ is a valid .loc label.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` for row ``index`` as 1-D tensors."""
        text = self.dataframe.loc[index, 'ds_bem_candidato_2']
        if not isinstance(text, str):
            # Missing values (None/NaN) become an empty string and any other
            # non-string value is stringified. Every row still yields an item, so
            # the number of predictions keeps matching the number of rows.
            text = "" if pd.isna(text) else str(text)

        # Calling the tokenizer directly replaces the older `encode_plus` entry point.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops only the leading batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

In [2]:
# 8. Load the application dataset that the model will be applied to
file_parquet = 'bases/bd02_final.parquet'
df = pd.read_parquet(file_parquet)
# Optional: uncomment the next line to run on a fixed-seed 10,000-row sample instead of the full table.
#df = df.sample(n=10000, random_state=42)

In [3]:
# 4. Pick the compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [6]:
# 5. Build the application dataset and its DataLoader
max_len = 128  # number of tokens each text is padded/truncated to
batch_size = 128

app_dataset = AppDataset(dataframe=df, tokenizer=tokenizer, max_len=max_len)
# shuffle=False (the default, spelled out here): batches must follow the row order of `df`.
app_loader = DataLoader(dataset=app_dataset, batch_size=batch_size, shuffle=False)

In [7]:
# Run batched inference over the application set and persist the predictions.
output_path = Path("output/base_final_2010_2022.parquet")
# Create the destination folder before the long prediction loop, so a missing
# directory cannot make the final write fail after all batches were processed.
output_path.parent.mkdir(parents=True, exist_ok=True)

model.to(device)  # already moved in an earlier cell; repeating the call is harmless
model.eval()      # evaluation mode for inference
all_preds = []
with torch.no_grad():  # gradients are not needed for prediction
    for batch in tqdm(app_loader, desc="Predizendo na base de aplicação"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit along dim 1.
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())

# Predictions are attached positionally: one value per row of `df`, in loader order.
df['y_pred'] = all_preds
df.to_parquet(output_path, index=False)
# Stray trailing apostrophe removed from the confirmation message.
print("Predições salvas")


Predizendo na base de aplicação: 100%|█████████████████████████████████████████| 25304/25304 [7:03:18<00:00,  1.00s/it]


Predições salvas'
