In [21]:
# Install any necessary dependencies
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [22]:
# 1. Load Data and Features

def load_data():
    train_df = pd.read_csv('/content/FINALtrainUpdated.csv')
    test_df = pd.read_csv('/content/FINALtestUpdated.csv')

    # Combine Title and Description for TF-IDF
    train_text = train_df['Title'] + ' ' + train_df['Description']
    test_text = test_df['Title'] + ' ' + test_df['Description']

    # Labels: convert from 1-4 to 0-3
    train_labels = train_df['Class Index'] - 1
    test_labels = test_df['Class Index'] - 1

    # Select numerical features (excluding TFIDFTopWords)
    feature_cols = [
        'DescNamedEntityCount', 'TitleDescSimilarity', 'AvgTitleWordLength',
        'TitleWordCount', 'TitleCharCount', 'DescAvgWordLength', 'DescWordCount',
        'DescCharCount', 'TitleNegSentiment', 'TitleNeuSentiment', 'TitlePosSentiment',
        'TitleCompoundSentiment', 'DescNegSentiment', 'DescNeuSentiment',
        'DescPosSentiment', 'DescCompoundSentiment', 'UppercaseWordCount',
        'ZeroShotScoreBusiness', 'ZeroShotScoreWorld', 'ZeroShotScoreSci/Tech', 'ZeroShotScoreSports'
    ]

    train_feats = train_df[feature_cols].values
    test_feats = test_df[feature_cols].values

    return train_text, train_feats, train_labels, test_text, test_feats, test_labels

train_texts, train_feats, train_labels, test_texts, test_feats, test_labels = load_data()

In [23]:
# 2. TF-IDF Vectorization

vectorizer = TfidfVectorizer(max_features=1500) #reduced max features
X_train_tfidf = vectorizer.fit_transform(train_texts).toarray()
X_test_tfidf = vectorizer.transform(test_texts).toarray()



In [24]:
# 3. Combine Features

X_train_combined = np.concatenate([X_train_tfidf, train_feats], axis=1)
X_test_combined = np.concatenate([X_test_tfidf, test_feats], axis=1)

# Convert to tensors
X_train_tensor = torch.tensor(X_train_combined, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_combined, dtype=torch.float32)
y_train_tensor = torch.tensor(train_labels.values, dtype=torch.long)
y_test_tensor = torch.tensor(test_labels.values, dtype=torch.long)

In [25]:
# 4. Create PyTorch Dataset

class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(NewsDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
test_loader = DataLoader(NewsDataset(X_test_tensor, y_test_tensor), batch_size=64, shuffle=False)

In [26]:
# 5. Define MLP Model

class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=512, num_classes=4):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.dropout(x)
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x

input_size = X_train_tensor.shape[1]
model = MLPClassifier(input_size)

In [27]:
# 6. Train the Model

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

def train_model(model, train_loader, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

train_model(model, train_loader)

Epoch 1, Loss: 0.5073
Epoch 2, Loss: 0.3255
Epoch 3, Loss: 0.3104
Epoch 4, Loss: 0.2983
Epoch 5, Loss: 0.2895
Epoch 6, Loss: 0.2826
Epoch 7, Loss: 0.2781
Epoch 8, Loss: 0.2750
Epoch 9, Loss: 0.2725
Epoch 10, Loss: 0.2670
Epoch 11, Loss: 0.2642
Epoch 12, Loss: 0.2642
Epoch 13, Loss: 0.2580
Epoch 14, Loss: 0.2573
Epoch 15, Loss: 0.2538
Epoch 16, Loss: 0.2515
Epoch 17, Loss: 0.2535
Epoch 18, Loss: 0.2512
Epoch 19, Loss: 0.2440
Epoch 20, Loss: 0.2430


In [28]:
# 7. Evaluate the Model

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"\nTest Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

evaluate_model(model, test_loader)


Test Accuracy: 90.29%
Precision: 0.9023
Recall: 0.9029
F1-score: 0.9023
