In [None]:
from utils.tokenizer import CharTokenizer, WordTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import pandas as pd
import torch
import torch.optim as optim


# --- Load data ---------------------------------------------------------------
# The CSV is read without a header row; the two columns are named 'label' and 'title'.
data = pd.read_csv('./data/news_data.csv', encoding='ISO-8859-1', header=None, names=['label', 'title'])

# Train the word-level tokenizer on all titles joined into a single string.
text_paragraph = data['title'].str.cat(sep=' ')
tokenizer = WordTokenizer.train_from_text(text_paragraph)

vocab_size = tokenizer.vocabulary_size()
seq_length = 100  # max. feature vector length for each title
batch_size = 32

# --- Encode the input vectors (titles) ---------------------------------------
data['tokenized'] = data['title'].apply(tokenizer.encode)
# A title that encodes to zero tokens gets a single placeholder id 0. The placeholder
# is a tensor (not a Python list) so every entry can be handled uniformly below.
# NOTE(review): id 0 doubles as placeholder and padding value - confirm the tokenizer
# does not assign id 0 to a real word.
data['tokenized'] = data['tokenized'].apply(
    lambda tokens: tokens if len(tokens) > 0 else torch.zeros(1, dtype=torch.long)
)

# Truncate each title to `seq_length` *before* padding: the resulting tensor is identical
# to padding first and slicing afterwards, but avoids materialising a tensor as wide as
# the longest title. `torch.as_tensor` accepts both tensors and plain lists of ids.
token_tensors = [
    torch.as_tensor(tokens, dtype=torch.long)[:seq_length]
    for tokens in data['tokenized']
]
padded_titles = pad_sequence(token_tensors, batch_first=True, padding_value=0)

# --- Encode the labels -------------------------------------------------------
# Map each distinct label to a consecutive integer id, in order of first appearance.
label_mapping = {label: idx for idx, label in enumerate(data['label'].unique())}
data['label_encoded'] = data['label'].map(label_mapping)
# CrossEntropyLoss expects int64 class indices, so the dtype is pinned explicitly.
labels = torch.tensor(data['label_encoded'].values, dtype=torch.long)

assert len(padded_titles) == len(labels), "titles and labels must stay aligned"

# --- Train / test split and loaders ------------------------------------------
# 80/20 split with a fixed random_state so the partition is reproducible.
train_titles, test_titles, train_labels, test_labels = train_test_split(padded_titles, labels, test_size=0.2, random_state=42)

train_dataset = TensorDataset(train_titles, train_labels)
test_dataset = TensorDataset(test_titles, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Sanity check: pull one batch and show its shapes.
train_iter = iter(train_loader)
batch_data, batch_labels = next(train_iter)

print(f"Data Batch Shape: {batch_data.shape}")
print(f"Label Batch Shape: {batch_labels.shape}")



In [5]:
class VanillaRNN(nn.Module):
    """Sequence classifier: token embedding -> single-layer ``nn.RNN`` -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding vector.
        hidden_size: Size of the RNN hidden state.
        output_size: Number of output logits produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()

        # Embedding layer to convert token indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # batch_first=True: inputs/outputs are laid out as (batch, time, features)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, indexed as (batch, time).
            lengths: Optional per-sequence count of real (non-padding) tokens, as a
                tensor or sequence of ints. When given, the logits are computed from
                the RNN output at each sequence's last real time step instead of the
                final (possibly padded) time step. Values are clamped to
                ``[1, x.size(1)]``. When ``None`` the final time step is used.

        Returns:
            Tensor of logits with one row per input sequence and ``output_size`` columns.
        """
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)

        if lengths is None:
            last_step = output[:, -1, :]
        else:
            # The RNN is unidirectional, so the output at step t depends only on
            # tokens <= t; reading step (length - 1) therefore ignores right padding.
            last_index = torch.as_tensor(lengths, device=output.device).long()
            last_index = last_index.clamp(min=1, max=output.size(1)) - 1
            batch_index = torch.arange(output.size(0), device=output.device)
            last_step = output[batch_index, last_index]

        return self.fc(last_step)


In [None]:
# --- Hyperparameters ---------------------------------------------------------
embedding_dim = 128
hidden_size = 64
# One logit per encoded class: derived from the label mapping instead of being
# hard-coded, so the head always matches the label ids fed to the loss.
output_size = len(label_mapping)
num_epochs = 5
seed = 42

# Seed torch so weight initialisation and DataLoader shuffling are reproducible.
torch.manual_seed(seed)

model = VanillaRNN(vocab_size, embedding_dim, hidden_size, output_size=output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- Training ----------------------------------------------------------------
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch_data, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_data)

        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # `loss` is the mean over the batch; weight it by the batch size so the
        # epoch figure is a true per-sample mean even when the last batch is smaller.
        batch_count = batch_labels.size(0)
        running_loss += loss.item() * batch_count

        predicted = outputs.argmax(dim=1)
        correct_predictions += (predicted == batch_labels).sum().item()
        total_predictions += batch_count

    epoch_loss = running_loss / total_predictions
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

# --- Evaluation --------------------------------------------------------------
model.eval()
correct_predictions = 0
total_predictions = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch_data, batch_labels in test_loader:
        outputs = model(batch_data)
        predicted = outputs.argmax(dim=1)
        correct_predictions += (predicted == batch_labels).sum().item()
        total_predictions += batch_labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {test_accuracy*100:.2f}%")