In [1]:
import pandas as pd

In [2]:
# Load the article dump; the file is tab-separated despite its .csv extension.
df = pd.read_csv("FolhaArticles.csv", sep='\t')

In [3]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [4]:
# The nine sections used as class labels; a section's position in this list
# is its integer label.
categories = [
    'celebridades',
    'ciencia',
    'cinema',
    'comida',
    'educacao',
    'eleicoes',
    'esporte',
    'mercado',
    'poder',
]
# Keep only the articles that belong to one of those sections.
df = df.loc[df['categories'].isin(categories)]

In [5]:
# Encode each section name as its position in `categories`.
df['Category'] = df['categories'].map(categories.index)

In [6]:
# Sanity check: (rows, columns) of the label/text pair used downstream.
df[['Category', 'Content']].shape

(64400, 2)

In [7]:
# Positional hold-out split in the frame's current row order (no shuffling):
# the first 54,000 rows are for training, the remainder for testing.
n_train = 54000
labelled = df[['Category', 'Content']]
df_train = labelled.iloc[:n_train]
df_test = labelled.iloc[n_train:]

In [8]:
# Turn each split into a plain list of [label, text] rows; this is the shape
# that build_vocab and the DataLoader collate function unpack.
df_train = df_train.values.tolist()
df_test = df_test.values.tolist()

In [9]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in "basic_english" tokenizer.
# NOTE(review): the corpus appears to be Portuguese; presumably this tokenizer
# is kept because the saved model was trained with it — confirm before changing.
tokenizer = get_tokenizer("basic_english")

def build_vocab(datasets):
    """Lazily yield the token list of every text in every dataset.

    Each dataset is an iterable of ``(label, text)`` pairs; the label is
    ignored and only the text is tokenized.
    """
    for rows in datasets:
        yield from (tokenizer(text) for _label, text in rows)

# Index every token seen in both splits. "<UNK>" gets its own entry and is
# also the index returned for any token that is not in the vocabulary.
vocab = build_vocab_from_iterator(build_vocab([df_train, df_test]), specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Bag-of-words counter restricted to the torchtext vocabulary (in its index
# order) and using the same tokenizer as the vocab builder.
# NOTE(review): CountVectorizer lower-cases input by default — confirm this
# matches how the saved model was trained.
vectorizer = CountVectorizer(vocabulary=vocab.get_itos(), tokenizer=tokenizer)
def vectorize_batch(batch):
    """Collate a list of ``(label, text)`` pairs into dense model inputs.

    Args:
        batch: Sequence of ``(label, text)`` pairs, as handed over by the
            ``DataLoader`` when this function is used as ``collate_fn``.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a float32 tensor of
        token counts with one row per text; ``labels`` is a tensor built from
        the labels. Both live on the GPU when CUDA is available and on the
        CPU otherwise.
    """
    # Split the pairs into one tuple of labels and one tuple of texts.
    Y, X = zip(*batch)
    # toarray() yields a plain ndarray; todense() would return np.matrix.
    X = vectorizer.transform(X).toarray()
    # Fall back to the CPU instead of crashing on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return (
        torch.tensor(X, dtype=torch.float32, device=device),
        torch.tensor(Y, device=device),
    )


In [11]:
from torch import nn
from torch.nn import functional as F

class TextClassifier(nn.Module):
    """Feed-forward classifier over bag-of-words count vectors.

    Layer layout: Linear(vocab_size, 128) -> ReLU -> Linear(128, 64) -> ReLU
    -> Linear(64, num_classes). ``forward`` returns the last layer's raw
    output (no softmax is applied).

    Args:
        vocab_size: Width of the input vectors. Defaults to ``len(vocab)``.
        num_classes: Number of output scores. Defaults to ``len(categories)``.
    """

    def __init__(self, vocab_size=None, num_classes=None):
        super().__init__()
        # Only fall back to the notebook globals when a size is not given, so
        # the class can also be built without them.
        if vocab_size is None:
            vocab_size = len(vocab)
        if num_classes is None:
            num_classes = len(categories)

        # NOTE(review): presumably the saved checkpoint was pickled from this
        # class; keep the `seq` attribute and the layer order stable — confirm.
        self.seq = nn.Sequential(
            nn.Linear(vocab_size, 128),
            nn.ReLU(),

            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64, num_classes),
        )

    def forward(self, X_batch):
        """Return the per-class scores for a batch of count vectors."""
        return self.seq(X_batch)

In [12]:

import gc
def MakePredictions(model, loader):
    """Run ``model`` over every batch of ``loader`` and collect its predictions.

    Args:
        model: Callable mapping a batch of features to per-class scores
            (typically an ``nn.Module``).
        loader: Iterable yielding ``(X, Y)`` batches of feature and label
            tensors.

    Returns:
        ``(labels, predictions)`` as NumPy arrays in loader order, where each
        prediction is the index of the highest score. Both arrays are empty
        when ``loader`` yields no batches.
    """
    # Use eval mode for the duration of the call and restore the caller's
    # mode afterwards; plain callables without a `training` flag are left alone.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    Y_shuffled, Y_preds = [], []
    try:
        # Inference needs no autograd graph; without no_grad every batch would
        # keep its graph (and its logits) alive on the device.
        with torch.no_grad():
            for X, Y in loader:
                # softmax is monotonic, so argmax over the raw scores selects
                # the same class; only the class index is kept, on the CPU.
                Y_preds.append(model(X).argmax(dim=-1).cpu())
                Y_shuffled.append(Y.cpu())
    finally:
        if was_training:
            model.train()

    if not Y_preds:
        # torch.cat rejects an empty list, so return empty results directly.
        return (
            torch.empty(0, dtype=torch.long).numpy(),
            torch.empty(0, dtype=torch.long).numpy(),
        )

    return torch.cat(Y_shuffled).detach().numpy(), torch.cat(Y_preds).numpy()


In [13]:
import torch
# Remap the checkpoint onto whichever device is present so it also loads on a
# machine without CUDA.
# NOTE: the file is unpickled into a ready-to-call model object; unpickling
# can execute arbitrary code, so only load checkpoints from a trusted source.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load('TextClassifierModel.pt', map_location=device)

In [14]:
from torch.utils.data import DataLoader

# Smoke test on one hand-labelled article, stored as [label, text];
# label 7 is the index of 'mercado' in `categories`.
sample_text = 'O presidente Luiz Inácio Lula da Silva (PT), o ministro Fernando Haddad (Fazenda) e o vice-presidente Geraldo Alckmin disseram, nesta sexta-feira (1º), que o governo foi surpreendido positivamente com o crescimento da economia em 2023 após o resultado do PIB (Produto Interno Bruto) superar as expectativas do início do mandato.'
sample_loader = DataLoader([[7, sample_text]], batch_size=256, collate_fn=vectorize_batch)
MakePredictions(model, sample_loader)

(array([7], dtype=int64), array([7], dtype=int64))