In [None]:
from utils.tokenizer import CharTokenizer, WordTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import pandas as pd
import torch
import torch.optim as optim


# Data preparation: load the titles, tokenize and pad them, encode the labels,
# then build the train/test DataLoaders.

# header=None: the file is read without a header row, so the two columns are
# named explicitly.
data = pd.read_csv('./data/news_data.csv', encoding='ISO-8859-1', header=None, names=['label', 'title'])

# Fit the tokenizer on every title joined into one space-separated string.
text_paragraph = data['title'].str.cat(sep=' ')
tokenizer = WordTokenizer.train_from_text(text_paragraph)

vocab_size = tokenizer.vocabulary_size()
seq_length = 100  # maximum number of token ids kept per title
batch_size = 32
pad_token_id = 0  # id used for padding and as the placeholder for empty titles

# Encode the input vector (title).
data['tokenized'] = data['title'].apply(tokenizer.encode)
# A title that encodes to nothing becomes a one-element placeholder so that
# every row yields a non-empty sequence.
data['tokenized'] = data['tokenized'].apply(lambda x: x if len(x) > 0 else [pad_token_id])

# torch.as_tensor accepts tensors and plain lists alike, which matters because
# the placeholder above is a list. Each title is truncated *before* padding so
# one very long title does not inflate the whole padded matrix; the result is
# the same as padding first and slicing to seq_length afterwards.
title_tensors = [
    torch.as_tensor(tokens, dtype=torch.long)[:seq_length]
    for tokens in data['tokenized']
]
padded_titles = pad_sequence(title_tensors, batch_first=True, padding_value=pad_token_id)

# Encode the label: each distinct label gets an integer id in order of first
# appearance.
label_mapping = {label: idx for idx, label in enumerate(data['label'].unique())}
data['label_encoded'] = data['label'].map(label_mapping)
labels = torch.tensor(data['label_encoded'].values)

# 80/20 split with a fixed random_state so the partition is repeatable.
train_titles, test_titles, train_labels, test_labels = train_test_split(padded_titles, labels, test_size=0.2, random_state=42)

train_dataset = TensorDataset(train_titles, train_labels)
test_dataset = TensorDataset(test_titles, test_labels)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Sanity check: pull one batch and show its shapes.
train_iter = iter(train_loader)
batch_data, batch_labels = next(train_iter)

print(f"Data Batch Shape: {batch_data.shape}")
print(f"Label Batch Shape: {batch_labels.shape}")



In [10]:
class GRUModel(nn.Module):
    """Sequence classifier: embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (GRU input size).
        hidden_size: Size of the GRU hidden state.
        output_size: Number of values produced per example by the linear head.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths=None):
        """Map a batch of token-id sequences to per-example outputs.

        Args:
            x: Integer tensor of token ids, batch dimension first (the GRU is
                built with ``batch_first=True``).
            lengths: Optional true (unpadded) length of each sequence in the
                batch, as a 1-D tensor or a sequence of ints. When given, the
                batch is packed so the GRU stops at each sequence's last real
                token instead of also consuming trailing padding. Every
                length must be at least 1. When omitted, the full padded
                sequence is processed, as before.

        Returns:
            Tensor with one row per example and ``output_size`` columns.
        """
        embedded = self.embedding(x)
        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU;
            # enforce_sorted=False lets the batch arrive in any order.
            lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        # Only the final hidden state is needed; per-step outputs are discarded.
        _, ht = self.gru(embedded)
        # ht[-1] is the last layer's final hidden state, one row per example.
        return self.fc(ht[-1])

In [None]:
# Model hyperparameters.
embedding_dim = 128
hidden_size = 64
output_size = len(label_mapping)  # one output per distinct label

# Seed torch's global RNG so weight initialisation and the training loader's
# shuffling are repeatable from a fresh kernel (the train/test split above is
# already pinned with random_state=42).
seed = 42
torch.manual_seed(seed)

# Initialize the model, criterion, and optimizer.
model = GRUModel(vocab_size, embedding_dim, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop.
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_data, batch_labels in train_loader:
        optimizer.zero_grad()

        # Forward pass and loss.
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported loss is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

# Test loop: no gradients are needed while scoring the held-out set.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_data, batch_labels in test_loader:
        outputs = model(batch_data)
        # Predicted class is the index of the largest output per example.
        predicted = outputs.argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

# Guard against an empty test loader instead of dividing by zero.
accuracy = 100 * correct / total if total else 0.0
print(f"Accuracy: {accuracy:.2f}%")