In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load dataset from Kaggle train.csv and test.csv
def load_data(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSV files and build text inputs and zero-based labels.

    Each CSV must contain the columns 'Title', 'Description' and 'Class Index'.

    Args:
        train_path: Path to the training CSV (defaults to 'train.csv').
        test_path: Path to the test CSV (defaults to 'test.csv').

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)`` of
        pandas Series. Texts are "<Title> <Description>"; labels are the
        'Class Index' column shifted down by one.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    def _combine_text(frame):
        # An empty CSV cell is parsed as NaN, and NaN + str stays NaN, which
        # would make the whole document missing. Treat a missing field as an
        # empty string so the other field is still used.
        title = frame['Title'].fillna('').astype(str)
        description = frame['Description'].fillna('').astype(str)
        return title + ' ' + description

    # Use both Title and Description as input features
    train_texts = _combine_text(train_data)
    test_texts = _combine_text(test_data)

    # Convert class index (1-based in the file) to zero-based
    train_labels = train_data['Class Index'] - 1
    test_labels = test_data['Class Index'] - 1

    return train_texts, train_labels, test_texts, test_labels

# Read the raw texts and zero-based labels for both splits
train_texts, train_labels, test_texts, test_labels = load_data()

# TF-IDF features: the vocabulary (capped at 5000 terms) is fitted on the
# training texts only and then reused to transform the test texts
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Dense float32 feature tensors and int64 label tensors for PyTorch
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# PyTorch Dataset
class NewsDataset(Dataset):
    """Map-style dataset pairing a feature container with a label container.

    Item ``i`` is the tuple ``(X[i], y[i])``; the dataset length is ``len(y)``.
    """

    def __init__(self, X, y):
        # Keep references to the inputs as given; nothing is copied here
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Batched iterators over both splits; only the training split is shuffled
train_loader = DataLoader(
    dataset=NewsDataset(X_train, y_train),
    batch_size=64,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=NewsDataset(X_test, y_test),
    batch_size=64,
    shuffle=False,
)

# Define MLP model
class MLPClassifier(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    Note that the hidden layer has ``hidden_size // 2`` units, not
    ``hidden_size``. The forward pass returns raw logits (no softmax).
    """

    def __init__(self, input_size, hidden_size=128, num_classes=4):
        super().__init__()
        hidden_units = hidden_size // 2
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return one logit per class for every row of ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Build the classifier; its input width is the number of TF-IDF features
input_size = X_train.size(1)
model = MLPClassifier(input_size=input_size)

# Cross-entropy loss on the logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_model(model, train_loader, epochs=10, loss_fn=None, opt=None):
    """Train `model` on the batches from `train_loader` and print the mean loss per epoch.

    Args:
        model: The network to train; it is put into training mode.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        epochs: Number of full passes over `train_loader`.
        loss_fn: Loss callable ``loss_fn(outputs, y_batch)``. When omitted,
            the module-level `criterion` is used.
        opt: Optimizer that updates `model`'s parameters. When omitted, the
            module-level `optimizer` is used; that optimizer only updates the
            parameters it was constructed with, so pass `opt` explicitly when
            training any other model.

    Returns:
        None. One line is printed per epoch.
    """
    # Fall back to the module-level objects so existing two-argument calls
    # behave exactly as before.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        # Count batches while iterating instead of calling len(train_loader):
        # this also works for iterables without a length and lets an empty
        # loader be reported instead of raising ZeroDivisionError.
        num_batches = 0
        for X_batch, y_batch in train_loader:
            opt.zero_grad()
            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

# Train the model on the training split using train_model's default epoch count
train_model(model, train_loader)

# Evaluate the model
def evaluate_model(model, test_loader):
    """Print accuracy and weighted precision/recall/F1 of `model` on `test_loader`.

    The model is switched to evaluation mode and run without gradient
    tracking. The predicted class for each sample is the index of its
    largest output. Nothing is returned; the four metrics are printed.
    """
    model.eval()
    predictions = []
    targets = []

    with torch.no_grad():
        for features, labels in test_loader:
            logits = model(features)
            batch_pred = torch.max(logits, 1).indices

            predictions.extend(batch_pred.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Overall fraction of correct predictions
    accuracy = accuracy_score(targets, predictions)

    # Per-class scores averaged with weights proportional to class support
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, predictions, average='weighted'
    )
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")


# Report metrics for the trained model on the test split
evaluate_model(model, test_loader)


Epoch 1, Loss: 0.3620
Epoch 2, Loss: 0.2285
Epoch 3, Loss: 0.2028
Epoch 4, Loss: 0.1828
Epoch 5, Loss: 0.1654
Epoch 6, Loss: 0.1487
Epoch 7, Loss: 0.1327
Epoch 8, Loss: 0.1169
Epoch 9, Loss: 0.1010
Epoch 10, Loss: 0.0861
Test Accuracy: 90.09%
Precision: 0.9010
Recall: 0.9009
F1-score: 0.9010
