# Title Layer

## Carregando os dados e modelos

### Modelo de word embeddings do FastText

In [2]:
import numpy as np

# Path (relative to this notebook) of the FastText vector file read by load_fasttext_vec.
vec_path = '../../models/fast-text/cc.pt.300.vec'

def load_fasttext_vec(path):
    """Load word vectors from a FastText text-format ``.vec`` file.

    The file may start with an optional ``<count> <dim>`` header line. Every
    other non-blank line has the form ``<word> <v1> <v2> ...`` with fields
    separated by single spaces.

    Args:
        path: Path to the UTF-8 encoded vector file.

    Returns:
        A ``(vecs, embed_dim)`` tuple. ``vecs`` maps each word to a ``float32``
        NumPy array. ``embed_dim`` is taken from the header when there is one,
        otherwise from the length of the first vector.

    Raises:
        ValueError: If the first line is neither a header nor a vector line
            (for example, the file is empty).
    """
    vecs = {}
    with open(path, encoding="utf8") as f:
        # Split on a single space, exactly like the data lines below, so the
        # first line is parsed consistently whether or not it is a header.
        first = f.readline().rstrip().split(" ")

        if len(first) == 2 and first[0].isdigit():
            embed_dim = int(first[1])
        elif len(first) > 1:
            # No header: the first line is already a vector, so the width has
            # to be derived from it (it used to be left undefined here).
            word, *vals = first
            vecs[word] = np.array(vals, dtype="float32")
            embed_dim = len(vals)
        else:
            raise ValueError(f"no header or vector found on the first line of {path!r}")

        for line in f:
            word, *vals = line.rstrip().split(" ")
            if not vals:
                # Blank line: nothing to store.
                continue
            vecs[word] = np.array(vals, dtype="float32")
    return vecs, embed_dim

# Read every vector into memory; the second value is the embedding width.
ft_vecs, word_embed_dim = load_fasttext_vec(vec_path)

### Dataset de tasks

In [3]:
import pandas as pd

# Load the task dataset from CSV.
df = pd.read_csv(
    '../../data/tasks_summary.csv',
    encoding='utf-8',
    sep=',',
)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4061 entries, 0 to 4060
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SUMMARY    4061 non-null   object 
 1   DTSTART    4061 non-null   object 
 2   DTEND      4061 non-null   object 
 3   CALENDAR   4061 non-null   object 
 4   DURATION   4061 non-null   float64
 5   CREATED    4061 non-null   object 
 6   TASK       4034 non-null   object 
 7   TIME_SLOT  4061 non-null   int64  
 8   YEAR_WEEK  4061 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 285.7+ KB
None


## Words

### Tokenizando as palavras dos titles

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
import re

# Portuguese function words that are dropped from the task titles.
STOP_WORDS = {
    "a", "o", "as", "os", "um", "uma", "uns", "umas",
    "de", "do", "da", "dos", "das",
    "em", "no", "na", "nos", "nas",
    "para", "por", "que", "e", "com", "sem",
    "ao", "aos", "à", "às",
}


def remove_stopwords(text: str) -> str:
    """Lower-case ``text`` and return its remaining words joined by single spaces.

    Any run of characters that is neither a word character nor one of the
    listed accented letters is treated as a separator. A token is dropped when
    it is in ``STOP_WORDS``, is a single character, or consists only of digits.
    """
    tokens = re.sub(r"[^\wçáéíóúâêîôûãõü]+", " ", text.lower()).split()

    kept = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        if len(token) <= 1 or token.isdigit():
            continue
        kept.append(token)
    return " ".join(kept)


# NOTE(review): df.info() reports 4034 non-null TASK values out of 4061, and
# astype(str) turns each missing value into the literal string "nan", which is
# then cleaned and tokenized like a real word.
task_titles = df["TASK"].astype(str).tolist()

clean_texts = [remove_stopwords(t) for t in task_titles]

word_tokenizer = Tokenizer(
    num_words=None,         # or a vocabulary size limit
    lower=True,             # convert everything to lower case
    oov_token="<unk>",      # fixed index for out-of-vocabulary words
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789'
)

word_tokenizer.fit_on_texts(clean_texts)

# word -> integer index mapping learned from the cleaned titles
tokenized_words = word_tokenizer.word_index
print(f'Vocab: {tokenized_words}')

# Each title becomes a list of word indices.
word_seqs = word_tokenizer.texts_to_sequences(clean_texts)
print(f'Words:\n{clean_texts[:5]}')
print(word_seqs[:5])


Vocab: {'<unk>': 1, 'ir': 2, 'aula': 3, 'academia': 4, 'trabalhar': 5, 'inf': 6, 'manha': 7, 'tomar': 8, 'lanchar': 9, 'fazer': 10, 'assistir': 11, 'almocar': 12, 'cafe': 13, 'tarde': 14, 'pre': 15, 'treino': 16, 'jantar': 17, 'disciplina': 18, 'muay': 19, 'thai': 20, 'sistemas': 21, 'dados': 22, 'software': 23, 'programacao': 24, 'analise': 25, 'teste': 26, 'inteligencia': 27, 'artificial': 28, 'estruturas': 29, 'calculo': 30, 'mat': 31, 'consulta': 32, 'introducao': 33, 'analisadores': 34, 'lexicos': 35, 'sintaticos': 36, 'gerencia': 37, 'projetos': 38, 'informatica': 39, 'psicologo': 40, 'numerica': 41, 'adm': 42, 'financas': 43, 'principios': 44, 'engenharia': 45, 'operacionais': 46, 'probabilidade': 47, 'computacional': 48, 'desenvolvimento': 49, 'jogos': 50, 'apple': 51, 'developer': 52, 'academy': 53, 'computadores': 54, 'interacao': 55, 'humano': 56, 'computador': 57, 'medicao': 58, 'reativos': 59, 'algoritmos': 60, 'orientada': 61, 'objetos': 62, 'redes': 63, 'discretas': 64, 

### Embedding do vocabulário de palavras

Se uma palavra do vocabulário dos títulos não existir no FastText, o embedding dela fica zerado. Essa escolha é deliberada: o Char-CNN, aplicado mais adiante, compensa as palavras que não têm vetor pré-treinado.


In [5]:
# +1 leaves room for index 0, which the padding step and padding_idx use.
num_words = len(tokenized_words) + 1
print(f'Number of words: {num_words}')

print(f'Embedding dimensions: {word_embed_dim}')

word_embed_matrix = np.zeros((num_words, word_embed_dim), dtype="float32")
print(f'Embedding matrix shape: {word_embed_matrix.shape}')

# Single pass over the vocabulary: copy the FastText vector when there is one,
# otherwise leave the row at zero and remember the word.
zero_words = []
for word, idx in tokenized_words.items():
    vector = ft_vecs.get(word)
    if vector is None:
        zero_words.append(word)
    else:
        word_embed_matrix[idx] = vector

print(f'Embedding matrix filled: {np.count_nonzero(word_embed_matrix)} non-zero entries')
print(f'Words not found in FastText (zero embeddings): {len(zero_words)}')
print(f'Zero words: {zero_words}')


import torch
import torch.nn as nn

# from_pretrained builds the layer straight from the matrix; freeze=True keeps
# the weights out of training. torch.tensor copies the data, so later changes
# to word_embed_matrix do not leak into the layer.
word_embeddings = nn.Embedding.from_pretrained(
    torch.tensor(word_embed_matrix),
    freeze=True,
    padding_idx=0,
)

Number of words: 269
Embedding dimensions: 300
Embedding matrix shape: (269, 300)
Embedding matrix filled: 76425 non-zero entries
Words not found in FastText (zero embeddings): 13
Zero words: ['<unk>', 'lexicos', 'sintaticos', 'wwdc', 'birusamba', 'probcomp', 'gmtk', 'gamejam', 'intercriar', 'guelt', 'gmkt', 'outsystems', 'benchimol']


## Preparando para o Bi-LSTM

### Colocando padding nas sequências


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length, in word indices, of the longest tokenized title.
max_seq_len = max(len(seq) for seq in word_seqs)
print(f'Max sequence length: {max_seq_len}')

# Pad every sequence at the end with index 0 up to max_seq_len.
padded_word_seqs = pad_sequences(
    word_seqs,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
    value=0
)

print(f'Padded shape: {padded_word_seqs.shape}')
print(f'Padded sequences:\n{padded_word_seqs[:5]}')


Max sequence length: 9
Padded shape: (4061, 9)
Padded sequences:
[[ 8 13  7  0  0  0  0  0  0]
 [ 9  7  0  0  0  0  0  0  0]
 [12  0  0  0  0  0  0  0  0]
 [ 9 14  0  0  0  0  0  0  0]
 [ 8 15 16  0  0  0  0  0  0]]


### Converter para tensor PyTorch

In [7]:
import torch

# Copy the padded index matrix into an int64 tensor.
word_inputs = torch.tensor(padded_word_seqs, dtype=torch.long)
print("Tensor shape:", word_inputs.shape)


Tensor shape: torch.Size([4061, 9])


### Aplicando os embeddings ao tensor

In [8]:
# Look up the embedding vector of every word index, padding positions included.
embedded_word_inputs = word_embeddings(word_inputs)
print('After applying embeddings:', embedded_word_inputs.shape)

After applying embeddings: torch.Size([4061, 9, 300])


## Characters

### Criando o vocabulário tokenizado de caracteres

In [9]:
# Character-level tokenizer fitted on the raw (uncleaned) titles.
char_tokenizer = Tokenizer(
    char_level=True,
    lower=True,
    oov_token="<unk>",
    filters=''
)

char_tokenizer.fit_on_texts(task_titles)

tokenized_chars = char_tokenizer.word_index
print(f'Char vocab: {tokenized_chars}')

# Words of each cleaned title, as a list of strings.
tokenized_titles = [t.split() for t in clean_texts]
print(f'Tokenized titles:\n{tokenized_titles[:5]}')

# One call per title: each word is passed as its own "text", so the result is
# a list with one character-index sequence per word.
char_seqs = [char_tokenizer.texts_to_sequences(words) for words in tokenized_titles]
print(f'Character sequences:\n{char_seqs[:5]}')

Char vocab: {'<unk>': 1, 'a': 2, ' ': 3, 'r': 4, 'i': 5, 'e': 6, 'd': 7, 'o': 8, 't': 9, 'l': 10, 'n': 11, 'c': 12, 'm': 13, 's': 14, 'u': 15, 'h': 16, 'p': 17, 'f': 18, '1': 19, 'b': 20, '-': 21, 'g': 22, '0': 23, 'j': 24, '3': 25, '6': 26, 'v': 27, 'z': 28, '2': 29, '4': 30, 'y': 31, '7': 32, '8': 33, 'w': 34, '9': 35, 'x': 36, '5': 37, 'q': 38, 'k': 39, '.': 40, '!': 41, '&': 42}
Tokenized titles:
[['tomar', 'cafe', 'manha'], ['lanchar', 'manha'], ['almocar'], ['lanchar', 'tarde'], ['tomar', 'pre', 'treino']]
Character sequences:
[[[9, 8, 13, 2, 4], [12, 2, 18, 6], [13, 2, 11, 16, 2]], [[10, 2, 11, 12, 16, 2, 4], [13, 2, 11, 16, 2]], [[2, 10, 13, 8, 12, 2, 4]], [[10, 2, 11, 12, 16, 2, 4], [9, 2, 4, 7, 6]], [[9, 8, 13, 2, 4], [17, 4, 6], [9, 4, 6, 5, 11, 8]]]


### Embedding do vocabulário de caracteres

In [10]:
# +1 leaves room for index 0, which is used as the padding index below.
num_chars = len(tokenized_chars) + 1
print(f'Number of chars: {num_chars}')

# Unlike the word embeddings, this layer is not loaded from pre-trained
# vectors and is not frozen in this cell.
char_embed_dim = 50             # size of each character embedding vector
char_embedding = nn.Embedding(
    num_embeddings=num_chars,
    embedding_dim=char_embed_dim,
    padding_idx=0
)

Number of chars: 43


## Preparando para o Char-CNN

### Colocando padding nos caracteres das palavras dos títulos


In [11]:
# Longest word, in characters, over every title.
max_char_len = max(len(seq) for doc in char_seqs for seq in doc)
print(f"max_char_len = {max_char_len}, max_seq_len = {max_seq_len}")


def _pad_title_chars(doc):
    """Return one title as a (max_seq_len, max_char_len) matrix of character indices."""
    # Pad/truncate every word of the title to max_char_len characters.
    rows = pad_sequences(
        doc,
        maxlen=max_char_len,
        padding='post',
        truncating='post',
        value=0
    )

    missing_rows = max_seq_len - rows.shape[0]
    if missing_rows > 0:
        # Fewer words than max_seq_len: append all-zero rows.
        filler = np.zeros((missing_rows, max_char_len), dtype=int)
        return np.vstack((rows, filler))
    return rows[:max_seq_len]


padded_char_seqs = np.stack([_pad_title_chars(doc) for doc in char_seqs])
print(f'Padded char sequences shape: {padded_char_seqs.shape}')
print(f'Padded char sequences:\n{padded_char_seqs[:5]}')

max_char_len = 15, max_seq_len = 9
Padded char sequences shape: (4061, 9, 15)
Padded char sequences:
[[[ 9  8 13  2  4  0  0  0  0  0  0  0  0  0  0]
  [12  2 18  6  0  0  0  0  0  0  0  0  0  0  0]
  [13  2 11 16  2  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]

 [[10  2 11 12 16  2  4  0  0  0  0  0  0  0  0]
  [13  2 11 16  2  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]

 [[ 2 10 13  

### Converter para tensor PyTorch

In [12]:
import torch

# Copy the padded character-index array into an int64 tensor.
char_inputs = torch.tensor(padded_char_seqs, dtype=torch.long)
print("Char tensor shape:", char_inputs.shape)

Char tensor shape: torch.Size([4061, 9, 15])


### Aplicando os embeddings ao tensor

In [13]:
# Look up the embedding of every character index; this adds a trailing
# embedding dimension to char_inputs.
embedded_char_inputs = char_embedding(char_inputs)
print('After applying char embeddings:', embedded_char_inputs.shape)

After applying char embeddings: torch.Size([4061, 9, 15, 50])


## Aplicando as convoluções

In [16]:
import torch.nn.functional as F

# Filters that capture character n-grams.
filter_widths = [3, 4, 5]            # window sizes, in characters
num_filters   = [100, 100, 100]      # number of filters for each window

# Build the layers from the two lists above, so that the configuration, the
# layers and char_cnn_dim (computed from num_filters below) cannot drift apart.
char_convs = [
    nn.Conv1d(in_channels=char_embed_dim, out_channels=n_out, kernel_size=width)
    for width, n_out in zip(filter_widths, num_filters)
]
# Names kept because later cells print the biases of these three layers.
conv3, conv4, conv5 = char_convs

activation = nn.ReLU()

B, S, L, D = embedded_char_inputs.size()

# Treat every word as one sample and move the embedding dimension to the
# channel axis, which is the layout Conv1d convolves over.
conv_input = (
    embedded_char_inputs
    .view(B * S, L, D)
    .permute(0, 2, 1)
)

conv_outputs = [activation(conv(conv_input)) for conv in char_convs]
print('Conv outputs shapes: ' + ', '.join(str(out.shape) for out in conv_outputs))

# Max over the whole length axis: one value per filter and per word.
pooled_outputs = [
    F.max_pool1d(out, kernel_size=out.size(2)).squeeze(2)
    for out in conv_outputs
]
print('Pooled outputs shapes: ' + ', '.join(str(out.shape) for out in pooled_outputs))

cat = torch.cat(pooled_outputs, dim=1)

char_cnn_dim = sum(num_filters)
char_repr = cat.view(B, S, char_cnn_dim)
print(f'Character CNN representation shape: {char_repr.shape}')

Conv outputs shapes: torch.Size([36549, 100, 13]), torch.Size([36549, 100, 12]), torch.Size([36549, 100, 11])
Pooled outputs shapes: torch.Size([36549, 100]), torch.Size([36549, 100]), torch.Size([36549, 100])
Character CNN representation shape: torch.Size([4061, 9, 300])


## Combinar os outputs de words e characters

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence

# Per-token features: word embedding concatenated with the Char-CNN vector.
combined = torch.cat([embedded_word_inputs, char_repr], dim=2)
print("combined.shape:", combined.shape)

# Real sequence lengths (padding excluded)
lengths = torch.sum(word_inputs != 0, dim=1)
print("lengths:", lengths[:10])  # show the first 10 lengths

# Check that every length is valid (> 0)
assert torch.all(lengths > 0), "Algumas sequências têm comprimento zero"

# Build the packed sequence
# NOTE(review): `packed` is not read by any later cell visible here, and
# `combined` is rebuilt after the padding mask is applied to char_repr.
packed = pack_padded_sequence(
    combined,
    lengths.cpu(),
    batch_first=True,
    enforce_sorted=False
)

combined.shape: torch.Size([4061, 9, 600])


In [18]:
# assuming word_inputs and char_repr already exist:

# 1a) Mask of real words (True where there is a word, False at <pad>)
word_mask = (word_inputs != 0)        # shape: (B, S)

# 1b) Mean values of char_repr at real positions vs padding positions
real_vals = char_repr[word_mask]      # all char_repr rows at valid timesteps
pad_vals  = char_repr[~word_mask]     # char_repr rows at padding timesteps

print("Real positions — mean, std:", real_vals.mean().item(), real_vals.std().item())
print("Pad positions  — mean, std:", pad_vals.mean().item(),  pad_vals.std().item())


Real positions — mean, std: 0.5775669813156128 0.37232932448387146
Pad positions  — mean, std: 0.017469331622123718 0.02378946542739868


In [20]:
# 1) Inspect a few char_repr vectors:
#    title 0, word 0 (real) vs. title 0, word 3 (padding)
print("Real vec [0,0]:", char_repr[0,0,:10].tolist())   # first 10 dims
print("Pad  vec [0,3]:", char_repr[0,3,:10].tolist())

# 2) Largest absolute value found at padding positions:
pad_vals = char_repr[~(word_inputs!=0)]  # every char_repr row where word_inputs == 0
print("Max abs pad‑vals:", pad_vals.abs().max().item())

# 3) Check whether the convolution biases are producing an offset:
print("conv3.bias:", conv3.bias.data.mean().item(), conv3.bias.data.std().item())
print("conv4.bias:", conv4.bias.data.mean().item(), conv4.bias.data.std().item())
print("conv5.bias:", conv5.bias.data.mean().item(), conv5.bias.data.std().item())


Real vec [0,0]: [0.31473273038864136, 0.9037463068962097, 0.30907315015792847, 0.5938639044761658, 0.9759604334831238, 0.7703021168708801, 0.48185187578201294, 0.94762122631073, 0.06664206087589264, 0.3470439016819]
Pad  vec [0,3]: [0.0, 0.0, 0.0, 0.0, 0.04989437758922577, 0.052361056208610535, 0.03593650460243225, 0.0, 0.06664206087589264, 0.0]
Max abs pad‑vals: 0.07796327769756317
conv3.bias: -0.0009105079807341099 0.0469384640455246
conv4.bias: -0.0038692683447152376 0.04040978103876114
conv5.bias: -0.00031745657906867564 0.039121147245168686


In [21]:
# Fix: zero out char_repr at padding positions
# The mask marks real words (True) vs padding (False)
word_mask = (word_inputs != 0)  # shape: (B, S)

# Expand the mask to the shape of char_repr
char_mask = word_mask.unsqueeze(-1).expand_as(char_repr)  # shape: (B, S, char_cnn_dim)

# Apply the mask to zero out padding positions
char_repr_masked = char_repr * char_mask.float()

print("Antes da máscara:")
print("Max abs pad‑vals:", char_repr[~word_mask].abs().max().item())

print("\nDepois da máscara:")
print("Max abs pad‑vals:", char_repr_masked[~word_mask].abs().max().item())

# Overwrite char_repr with the masked version.
# NOTE(review): after this line the unmasked values are gone, so re-running
# this cell prints 0.0 for the "before" value as well.
char_repr = char_repr_masked

Antes da máscara:
Max abs pad‑vals: 0.07796327769756317

Depois da máscara:
Max abs pad‑vals: 0.0


## Implementando o BiLSTM

In [22]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Concatenate again, now using the masked character representations
combined = torch.cat([embedded_word_inputs, char_repr], dim=2)
print("combined.shape após mascaramento:", combined.shape)

# BiLSTM parameters
input_size = combined.size(-1)  # word_embed_dim + char_cnn_dim = 300 + 300 = 600
hidden_size = 256  # LSTM hidden size
num_layers = 2     # number of layers
dropout = 0.3      # dropout between layers

# Create the BiLSTM layer
bilstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=True,
    batch_first=True
)

print(f"BiLSTM configurado:")
print(f"  Input size: {input_size}")
print(f"  Hidden size: {hidden_size}")
print(f"  Num layers: {num_layers}")
print(f"  Bidirectional: True")
print(f"  Output size: {hidden_size * 2} (bidirectional)")

combined.shape após mascaramento: torch.Size([4061, 9, 600])
BiLSTM configurado:
  Input size: 600
  Hidden size: 256
  Num layers: 2
  Bidirectional: True
  Output size: 512 (bidirectional)


In [23]:
# Sequence lengths (without padding)
lengths = torch.sum(word_inputs != 0, dim=1)
print("Distribuição dos comprimentos:", torch.bincount(lengths))

# Pack the sequences so the LSTM does not process padding timesteps
packed_input = pack_padded_sequence(
    combined,
    lengths.cpu(),
    batch_first=True,
    enforce_sorted=False
)

# Run the BiLSTM
# NOTE(review): no .eval() / torch.no_grad() is visible before this call, so
# the inter-layer dropout is presumably active here — confirm this is intended.
packed_output, (hidden, cell) = bilstm(packed_input)

# Unpack the output
lstm_output, output_lengths = pad_packed_sequence(
    packed_output,
    batch_first=True,
    total_length=max_seq_len  # keep the original padded length
)

print(f"LSTM output shape: {lstm_output.shape}")
print(f"Hidden state shape: {hidden.shape}")
print(f"Cell state shape: {cell.shape}")

Distribuição dos comprimentos: tensor([   0, 1052, 1437,  769,  186,  420,  196,    0,    0,    1])
LSTM output shape: torch.Size([4061, 9, 512])
Hidden state shape: torch.Size([4, 4061, 256])
Cell state shape: torch.Size([4, 4061, 256])


## Camadas de Saída e Pooling

In [None]:
import torch.nn.functional as F

# Diferentes estratégias de pooling
# 1. Mean pooling (ignorando padding)
def mean_pooling(sequence_output, lengths):
    """Average the valid (non-padding) timesteps of every sequence.

    Args:
        sequence_output: Tensor of shape (batch, max_len, hidden_dim).
        lengths: Integer tensor of shape (batch,) with the number of valid
            leading timesteps of each sequence. It may live on a different
            device than ``sequence_output``.

    Returns:
        Tensor of shape (batch, hidden_dim). A sequence of length 0 yields a
        zero vector rather than NaN.
    """
    batch_size, max_len, hidden_dim = sequence_output.size()
    device = sequence_output.device
    lengths = lengths.to(device)

    # Build the mask directly on the data's device so the comparison never
    # mixes CPU and GPU tensors.
    positions = torch.arange(max_len, device=device)
    mask = positions.unsqueeze(0) < lengths.unsqueeze(1)      # (B, S)
    mask = mask.float().unsqueeze(-1)                          # (B, S, 1)

    masked_output = sequence_output * mask
    # clamp avoids 0/0 for empty sequences (their masked sum is already zero).
    denom = lengths.clamp(min=1).unsqueeze(-1).float()
    pooled = masked_output.sum(dim=1) / denom

    return pooled

# 2. Max pooling
def max_pooling(sequence_output, lengths):
    """Take the element-wise maximum over the valid timesteps of every sequence.

    Args:
        sequence_output: Tensor of shape (batch, max_len, hidden_dim).
        lengths: Integer tensor of shape (batch,) with the number of valid
            leading timesteps of each sequence. It may live on a different
            device than ``sequence_output``.

    Returns:
        Tensor of shape (batch, hidden_dim). A sequence of length 0 yields a
        zero vector rather than -inf.
    """
    batch_size, max_len, hidden_dim = sequence_output.size()
    device = sequence_output.device
    lengths = lengths.to(device)

    # Build the mask directly on the data's device so the comparison never
    # mixes CPU and GPU tensors.
    positions = torch.arange(max_len, device=device)
    valid = (positions.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(-1)  # (B, S, 1)

    # Padding positions get -inf so they can never win the max.
    masked_output = sequence_output.masked_fill(~valid, float('-inf'))
    pooled, _ = masked_output.max(dim=1)

    # A sequence with no valid timestep would be all -inf; return zeros instead.
    pooled = pooled.masked_fill((lengths == 0).unsqueeze(-1), 0.0)

    return pooled

# 3. Attention pooling (simples)
class AttentionPooling(nn.Module):
    """Pool a sequence with a learned, single-score attention over its timesteps."""

    def __init__(self, hidden_dim):
        """Create the linear layer that scores each timestep.

        Args:
            hidden_dim: Size of the last dimension of the sequences to pool.
        """
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, sequence_output, lengths):
        """Return the attention-weighted sum of the valid timesteps.

        Args:
            sequence_output: Tensor of shape (batch, max_len, hidden_dim).
            lengths: Integer tensor of shape (batch,) with the number of valid
                leading timesteps of each sequence. It may live on a different
                device than ``sequence_output``.

        Returns:
            ``(pooled, attention_weights)`` with shapes (batch, hidden_dim) and
            (batch, max_len). Padding positions always get weight 0; a sequence
            of length 0 gets all-zero weights and a zero pooled vector.
        """
        batch_size, max_len, hidden_dim = sequence_output.size()
        device = sequence_output.device

        # One attention score per timestep.
        attention_scores = self.attention(sequence_output).squeeze(-1)  # (B, S)

        # Build the mask on the data's device so no CPU/GPU tensors are mixed.
        positions = torch.arange(max_len, device=device)
        mask = positions.unsqueeze(0) < lengths.to(device).unsqueeze(1)  # (B, S)
        attention_scores = attention_scores.masked_fill(~mask, float('-inf'))

        # A row that is entirely -inf would make softmax return NaN; give such
        # rows finite scores here and zero their weights after the softmax.
        has_valid = mask.any(dim=1, keepdim=True)                        # (B, 1)
        attention_scores = attention_scores.masked_fill(~has_valid, 0.0)

        attention_weights = F.softmax(attention_scores, dim=1)
        attention_weights = attention_weights.masked_fill(~mask, 0.0)

        # Weighted sum over the sequence axis.
        pooled = (sequence_output * attention_weights.unsqueeze(-1)).sum(dim=1)  # (B, hidden_dim)

        return pooled, attention_weights

# Trying out the different pooling strategies
print("=== TESTANDO POOLINGS ===")

# Mean pooling
mean_pooled = mean_pooling(lstm_output, lengths)
print(f"Mean pooled shape: {mean_pooled.shape}")

# Max pooling
max_pooled = max_pooling(lstm_output, lengths)
print(f"Max pooled shape: {max_pooled.shape}")

# Attention pooling (the scoring layer is freshly initialised in this cell)
attention_pooler = AttentionPooling(lstm_output.size(-1))
att_pooled, att_weights = attention_pooler(lstm_output, lengths)
print(f"Attention pooled shape: {att_pooled.shape}")
print(f"Attention weights shape: {att_weights.shape}")

=== TESTANDO POOLINGS ===
Mean pooled shape: torch.Size([4061, 512])
Max pooled shape: torch.Size([4061, 512])
Attention pooled shape: torch.Size([4061, 512])
Attention weights shape: torch.Size([4061, 9])


## Camada Final de Classificação

In [25]:
# Classification parameters
num_classes = len(df['CALENDAR'].unique())  # number of distinct calendars
pooled_dim = hidden_size * 2  # BiLSTM output dimension
dropout_rate = 0.5

print(f"Número de classes (calendários): {num_classes}")
print(f"Dimensão após pooling: {pooled_dim}")

# Classification head
classifier = nn.Sequential(
    nn.Dropout(dropout_rate),
    nn.Linear(pooled_dim, pooled_dim // 2),
    nn.ReLU(),
    nn.Dropout(dropout_rate),
    nn.Linear(pooled_dim // 2, num_classes)
)

# Try the classifier on the mean-pooled representation
# NOTE(review): no .eval() call is visible before this forward pass, so the
# Dropout layers are presumably active and the values below can change from
# run to run — confirm whether that is intended.
logits = classifier(mean_pooled)
print(f"Logits shape: {logits.shape}")

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=1)
print(f"Probabilities shape: {probabilities.shape}")

# Show a few predictions
print(f"\nPrimeiras 5 predições (índices das classes):")
predictions = torch.argmax(probabilities, dim=1)
print(predictions[:5])

print(f"\nPrimeiras 5 probabilidades máximas:")
max_probs, _ = torch.max(probabilities, dim=1)
print(max_probs[:5])

Número de classes (calendários): 11
Dimensão após pooling: 512
Logits shape: torch.Size([4061, 11])
Probabilities shape: torch.Size([4061, 11])

Primeiras 5 predições (índices das classes):
tensor([6, 6, 6, 6, 6])

Primeiras 5 probabilidades máximas:
tensor([0.0975, 0.0981, 0.0966, 0.0983, 0.0998], grad_fn=<SliceBackward0>)


## Resumo da Arquitetura Completa

A arquitetura implementada consiste em:

### 1. **Word Embeddings (FastText)**
- Embeddings pré-treinados de 300 dimensões
- Vocabulário: 269 índices (palavras dos títulos, mais `<unk>` e o índice 0 de padding)
- Palavras não encontradas no FastText são zeradas (compensadas pelo Char-CNN)

### 2. **Character-CNN**
- Embeddings de caracteres: 50 dimensões
- 3 convoluções 1D com janelas de 3, 4 e 5 caracteres (100 filtros cada)
- Max pooling para capturar n-gramas mais importantes
- Representação final: 300 dimensões (100 + 100 + 100)
- **Correção aplicada**: Mascaramento de posições de padding

### 3. **Concatenação**
- Word embeddings (300) + Char-CNN (300) = 600 dimensões
- Cada token é representado por ambas as informações

### 4. **BiLSTM**
- 2 camadas bidirecionais
- Hidden size: 256 (512 total por ser bidirecional)
- Dropout: 0.3 entre camadas
- Packed sequences para eficiência

### 5. **Pooling**
- Mean pooling: média simples dos timesteps válidos, ignorando padding
- Max pooling: máximo ignorando padding  
- Attention pooling: atenção aprendida

### 6. **Classificação**
- Camadas lineares: 512 → 256 → 11 classes (número de calendários distintos no dataset)
- Dropout: 0.5 para regularização
- Saída: probabilidades para cada calendário

Esta arquitetura é típica para **classificação de sequências** onde você precisa:
- Capturar semântica (word embeddings)
- Capturar morfologia (char-CNN)
- Modelar dependências temporais (BiLSTM)
- Fazer classificação final