In [12]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('FolhaArticles.csv', sep='\t')
df = df.head(100)
df.head()

Unnamed: 0,Title,Content,Url,Published,categories
0,Visões da batalha na Rússia e no Ocidente são ...,Quatro décadas de Guerra Fria e o renovado con...,https://www1.folha.uol.com.br/mundo/2019/06/vi...,2019-06-06 02:00:00.0000000,mundo
1,Bolsonaro é intimado pela PF sobre golpe e pod...,A Polícia Federal intimou Jair Bolsonaro (PL) ...,https://www1.folha.uol.com.br/poder/2024/02/pf...,2024-02-19 12:45:00.0000000,poder
2,Hayao Miyazaki passa a vida a limpo em O Menin...,Como vocês vivem? A pergunta parece ser feita ...,https://www1.folha.uol.com.br/ilustrada/2024/0...,2024-02-19 07:00:00.0000000,ilustrada
3,Série produzida pelo Estúdio Folha ganha Prêmi...,"A série ""Caminhos Proibidos"", produzida pelo E...",https://estudio.folha.uol.com.br/estudio/2024/...,2024-01-18 16:49:00.0000000,estudio
4,"Como é 'Crystal', do Cirque du Soleil, que vem...",Uma mulher expressa sua angústia dançando sobr...,https://www1.folha.uol.com.br/ilustrada/2024/0...,2024-02-19 15:15:00.0000000,ilustrada


In [4]:


tokenizer = get_tokenizer(None, language='pt')

In [5]:
df['tokenized'] = [tokenizer(str(txt)) for txt in df.Content]



In [14]:
# Build the vocabulary
def build_vocab(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tokens = tokenizer(f.read())
    return tokens

vocab = build_vocab_from_iterator(df.tokenized)

# vocab = tokenizer(df.Content.str.cat())

In [15]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

In [16]:
class FolhaContentDataset(Dataset):
    def __init__(self, df, vocab):
        self.data = df.tokenized
        self.vocab = vocab

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data[idx]
        numerical_tokens = [self.vocab[token] for token in example]
        return numerical_tokens



In [17]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

In [18]:
cat = []

for i in range(0, len(df.categories.sort_values().unique().tolist())):
    cat.append(i)

In [19]:
# Training parameters
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
num_classes = 10
batch_size = 32
num_epochs = 10
learning_rate = 0.001


# Create the model
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Create data loaders for the training and validation sets
train_dataset = FolhaContentDataset(df, vocab)
valid_dataset = FolhaContentDataset(df, vocab)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Iterate over the training data for the specified number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0
    for inputs in train_loader:
        optimizer.zero_grad()
        inputs = torch.LongTensor(inputs)
        targets = inputs.clone()
        outputs = model(inputs)
        print(type(outputs))

        # loss = criterion(outputs.view(-1, num_classes), targets.view(-1))
        loss = criterion(outputs, torch.tensor(cat, dtype=torch.long));
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(inputs)
        total_samples += len(inputs)

    # Evaluate on the validation set after every epoch
    model.eval()
    total_val_loss = 0.0
    total_val_samples = 0
    with torch.no_grad():
        for inputs in valid_loader:
            inputs = torch.LongTensor(inputs)
            targets = inputs.clone()
            outputs = model(inputs)
            val_loss = criterion(outputs.view(-1, num_classes), targets.view(-1))

            total_val_loss += val_loss.item() * len(inputs)
            total_val_samples += len(inputs)

    avg_loss = total_loss / total_samples
    avg_val_loss = total_val_loss / total_val_samples

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

RuntimeError: each element in list of batch should be of equal size