In [1]:
# load dataset data/test.csv and data/train.csv

import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print(train.shape, test.shape)

train.head()

(120000, 2) (7600, 2)


Unnamed: 0,text,label
0,wall st bear claw back black reuters reuters s...,2
1,carlyle look toward commercial aerospace reute...,2
2,oil economy cloud stock outlook reuters reuter...,2
3,iraq halt oil export main southern pipeline re...,2
4,oil price soar alltime record posing new menac...,2


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data
tfidf = TfidfVectorizer(stop_words='english')

# Learn vocabulary from training texts and vectorize training texts.
tfidf.fit(train.text)

# transform the training data
train_tfidf = tfidf.transform(train.text)

# transform the test data
test_tfidf = tfidf.transform(test.text)

print(train_tfidf.shape, test_tfidf.shape)

(120000, 77346) (7600, 77346)


In [3]:
# simply use the vectorized data as input to the model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(train_tfidf, train.label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
#  predict and evaluate the model
from sklearn.metrics import classification_report

# predict on the test set
predictions = classifier.predict(test_tfidf)

# calculate the classification report
print(classification_report(test.label, predictions, digits=4))

              precision    recall  f1-score   support

           0     0.9353    0.9053    0.9200      1900
           1     0.9563    0.9800    0.9680      1900
           2     0.8780    0.8789    0.8785      1900
           3     0.8844    0.8900    0.8872      1900

    accuracy                         0.9136      7600
   macro avg     0.9135    0.9136    0.9134      7600
weighted avg     0.9135    0.9136    0.9134      7600



In [5]:
# svm classifier
from sklearn.svm import LinearSVC

classifier = LinearSVC()
classifier.fit(train_tfidf, train.label)



In [7]:
# predict on the test set
predictions = classifier.predict(test_tfidf)

# calculate the classification report
print(classification_report(test.label, predictions, digits=4))

              precision    recall  f1-score   support

           0     0.9368    0.9047    0.9205      1900
           1     0.9573    0.9800    0.9685      1900
           2     0.8864    0.8874    0.8869      1900
           3     0.8900    0.8984    0.8942      1900

    accuracy                         0.9176      7600
   macro avg     0.9176    0.9176    0.9175      7600
weighted avg     0.9176    0.9176    0.9175      7600



In [8]:
# naive bayes classifier
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(train_tfidf, train.label)

In [9]:
# predict on the test set
predictions = classifier.predict(test_tfidf)

# calculate the classification report
print(classification_report(test.label, predictions, digits=4))

              precision    recall  f1-score   support

           0     0.9150    0.8947    0.9047      1900
           1     0.9508    0.9763    0.9634      1900
           2     0.8590    0.8626    0.8608      1900
           3     0.8800    0.8721    0.8760      1900

    accuracy                         0.9014      7600
   macro avg     0.9012    0.9014    0.9012      7600
weighted avg     0.9012    0.9014    0.9012      7600



In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '4'

# Convert sparse matrix to PyTorch tensor
def sparse_to_tensor(sparse_matrix):
    sparse_matrix = sparse_matrix.tocoo()
    values = sparse_matrix.data
    indices = np.vstack((sparse_matrix.row, sparse_matrix.col))
    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = sparse_matrix.shape
    return torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()

train_features = sparse_to_tensor(train_tfidf)
train_labels = torch.LongTensor(train.label)

test_features = sparse_to_tensor(test_tfidf)
test_labels = torch.LongTensor(test.label)

class MyDataset(Dataset):
    def __init__(self, feature_vectors, labels):
        self.labels = labels
        self.feature_vectors = feature_vectors

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.feature_vectors[idx], self.labels[idx]

# Create dataset objects
train_dataset = MyDataset(train_features, train_labels)
test_dataset = MyDataset(test_features, test_labels) 

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Model definition
class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Hyperparameters
input_dim = len(tfidf.get_feature_names_out())
hidden_dim = 32
output_dim = 4  # Assuming 4 classes
epochs = 10

# Model instantiation
model = Model(input_dim, hidden_dim, output_dim)
# Move model to GPU if CUDA is available
model.cuda()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
for epoch in range(epochs):
    model.train()
    for i, (features, labels) in tqdm(enumerate(train_loader)):
        # move tensors to GPU if CUDA is available
        features = features.cuda()
        labels = labels.cuda()
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # Evaluation
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for features, labels in tqdm(test_loader):
            # move tensors to GPU if CUDA is available
            features = features.cuda()
            labels = labels.cuda()
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{epochs}], Accuxracy: {accuracy:.2f}%')

print("Training complete")

1it [00:01,  1.88s/it]

Epoch [1/10], Step [1/938], Loss: 1.3839


101it [03:02,  1.78s/it]

Epoch [1/10], Step [101/938], Loss: 1.3796


201it [06:06,  1.82s/it]

Epoch [1/10], Step [201/938], Loss: 1.3472


301it [09:29,  1.83s/it]

Epoch [1/10], Step [301/938], Loss: 1.3194


401it [12:30,  1.79s/it]

Epoch [1/10], Step [401/938], Loss: 1.2887


501it [15:33,  1.83s/it]

Epoch [1/10], Step [501/938], Loss: 1.2297


601it [18:41,  1.82s/it]

Epoch [1/10], Step [601/938], Loss: 1.1839


701it [21:50,  1.81s/it]

Epoch [1/10], Step [701/938], Loss: 1.1755


801it [24:52,  1.84s/it]

Epoch [1/10], Step [801/938], Loss: 1.0921


901it [27:55,  1.83s/it]

Epoch [1/10], Step [901/938], Loss: 1.0383


938it [29:01,  1.86s/it]
100%|██████████| 60/60 [01:54<00:00,  1.91s/it]


Epoch [1/10], Accuxracy: 88.80%


1it [00:01,  1.78s/it]

Epoch [2/10], Step [1/938], Loss: 0.9903


101it [03:09,  1.81s/it]

Epoch [2/10], Step [101/938], Loss: 0.9254


201it [06:16,  1.80s/it]

Epoch [2/10], Step [201/938], Loss: 0.8942


301it [09:25,  2.60s/it]

Epoch [2/10], Step [301/938], Loss: 0.8448


401it [12:27,  1.80s/it]

Epoch [2/10], Step [401/938], Loss: 0.7900


501it [15:28,  1.82s/it]

Epoch [2/10], Step [501/938], Loss: 0.7752


601it [18:30,  1.80s/it]

Epoch [2/10], Step [601/938], Loss: 0.7500


701it [21:32,  1.83s/it]

Epoch [2/10], Step [701/938], Loss: 0.6158


801it [24:33,  1.79s/it]

Epoch [2/10], Step [801/938], Loss: 0.6375


901it [27:35,  1.82s/it]

Epoch [2/10], Step [901/938], Loss: 0.5933


938it [28:41,  1.84s/it]
100%|██████████| 60/60 [01:47<00:00,  1.79s/it]


Epoch [2/10], Accuxracy: 89.53%


1it [00:01,  1.98s/it]

Epoch [3/10], Step [1/938], Loss: 0.5598


101it [03:04,  1.83s/it]

Epoch [3/10], Step [101/938], Loss: 0.4997


201it [06:07,  1.80s/it]

Epoch [3/10], Step [201/938], Loss: 0.5862


301it [09:10,  1.82s/it]

Epoch [3/10], Step [301/938], Loss: 0.5458


401it [12:22,  1.92s/it]

Epoch [3/10], Step [401/938], Loss: 0.5148


501it [15:45,  1.84s/it]

Epoch [3/10], Step [501/938], Loss: 0.4616


601it [18:56,  1.89s/it]

Epoch [3/10], Step [601/938], Loss: 0.5621


701it [22:45,  1.83s/it]

Epoch [3/10], Step [701/938], Loss: 0.4363


801it [25:55,  1.86s/it]

Epoch [3/10], Step [801/938], Loss: 0.4275


901it [29:02,  1.83s/it]

Epoch [3/10], Step [901/938], Loss: 0.3953


938it [30:09,  1.93s/it]
100%|██████████| 60/60 [01:48<00:00,  1.81s/it]


Epoch [3/10], Accuxracy: 90.09%


1it [00:01,  1.82s/it]

Epoch [4/10], Step [1/938], Loss: 0.3451


101it [03:05,  1.84s/it]

Epoch [4/10], Step [101/938], Loss: 0.3873


201it [06:10,  1.83s/it]

Epoch [4/10], Step [201/938], Loss: 0.3695


301it [09:13,  1.81s/it]

Epoch [4/10], Step [301/938], Loss: 0.3849


401it [12:16,  1.81s/it]

Epoch [4/10], Step [401/938], Loss: 0.4046


501it [15:20,  1.89s/it]

Epoch [4/10], Step [501/938], Loss: 0.3904


601it [18:31,  1.86s/it]

Epoch [4/10], Step [601/938], Loss: 0.3086


701it [21:53,  1.94s/it]

Epoch [4/10], Step [701/938], Loss: 0.3524


801it [24:55,  1.80s/it]

Epoch [4/10], Step [801/938], Loss: 0.3630


901it [27:58,  1.84s/it]

Epoch [4/10], Step [901/938], Loss: 0.3373


938it [29:05,  1.86s/it]
100%|██████████| 60/60 [01:49<00:00,  1.82s/it]


Epoch [4/10], Accuxracy: 90.38%


1it [00:01,  1.91s/it]

Epoch [5/10], Step [1/938], Loss: 0.3345


101it [03:05,  1.85s/it]

Epoch [5/10], Step [101/938], Loss: 0.3325


201it [06:08,  1.83s/it]

Epoch [5/10], Step [201/938], Loss: 0.2542


301it [09:11,  1.82s/it]

Epoch [5/10], Step [301/938], Loss: 0.2574


401it [12:13,  1.79s/it]

Epoch [5/10], Step [401/938], Loss: 0.2997


501it [15:15,  1.81s/it]

Epoch [5/10], Step [501/938], Loss: 0.2975


601it [18:16,  1.81s/it]

Epoch [5/10], Step [601/938], Loss: 0.2301


701it [21:19,  1.79s/it]

Epoch [5/10], Step [701/938], Loss: 0.3221


801it [24:20,  1.84s/it]

Epoch [5/10], Step [801/938], Loss: 0.3257


901it [27:22,  1.81s/it]

Epoch [5/10], Step [901/938], Loss: 0.4093


938it [28:29,  1.82s/it]
100%|██████████| 60/60 [01:47<00:00,  1.80s/it]


Epoch [5/10], Accuxracy: 90.70%


1it [00:01,  1.87s/it]

Epoch [6/10], Step [1/938], Loss: 0.2549


101it [03:05,  1.88s/it]

Epoch [6/10], Step [101/938], Loss: 0.3088


201it [06:06,  1.80s/it]

Epoch [6/10], Step [201/938], Loss: 0.2980


301it [09:09,  1.81s/it]

Epoch [6/10], Step [301/938], Loss: 0.2812


401it [12:12,  1.83s/it]

Epoch [6/10], Step [401/938], Loss: 0.2483


501it [15:16,  1.88s/it]

Epoch [6/10], Step [501/938], Loss: 0.2566


601it [18:28,  1.86s/it]

Epoch [6/10], Step [601/938], Loss: 0.2941


701it [21:39,  1.94s/it]

Epoch [6/10], Step [701/938], Loss: 0.2785


801it [24:56,  1.82s/it]

Epoch [6/10], Step [801/938], Loss: 0.1927


901it [28:11,  2.03s/it]

Epoch [6/10], Step [901/938], Loss: 0.1549


938it [29:19,  1.88s/it]
100%|██████████| 60/60 [01:52<00:00,  1.87s/it]


Epoch [6/10], Accuxracy: 90.97%


1it [00:02,  2.09s/it]

Epoch [7/10], Step [1/938], Loss: 0.2738


101it [03:13,  1.89s/it]

Epoch [7/10], Step [101/938], Loss: 0.2632


201it [06:32,  1.93s/it]

Epoch [7/10], Step [201/938], Loss: 0.3226


301it [09:41,  1.87s/it]

Epoch [7/10], Step [301/938], Loss: 0.3669


401it [12:53,  2.19s/it]

Epoch [7/10], Step [401/938], Loss: 0.1622


501it [16:13,  1.84s/it]

Epoch [7/10], Step [501/938], Loss: 0.1991


601it [19:24,  1.85s/it]

Epoch [7/10], Step [601/938], Loss: 0.1795


701it [22:34,  1.84s/it]

Epoch [7/10], Step [701/938], Loss: 0.3074


801it [25:41,  1.81s/it]

Epoch [7/10], Step [801/938], Loss: 0.2964


901it [28:53,  1.80s/it]

Epoch [7/10], Step [901/938], Loss: 0.2232


938it [30:00,  1.92s/it]
100%|██████████| 60/60 [01:52<00:00,  1.88s/it]


Epoch [7/10], Accuxracy: 91.16%


1it [00:01,  1.83s/it]

Epoch [8/10], Step [1/938], Loss: 0.2756


101it [03:11,  1.92s/it]

Epoch [8/10], Step [101/938], Loss: 0.2271


201it [06:24,  1.81s/it]

Epoch [8/10], Step [201/938], Loss: 0.3532


301it [09:35,  1.83s/it]

Epoch [8/10], Step [301/938], Loss: 0.2844


401it [12:49,  1.82s/it]

Epoch [8/10], Step [401/938], Loss: 0.3347


501it [16:01,  2.14s/it]

Epoch [8/10], Step [501/938], Loss: 0.2430


601it [19:09,  1.87s/it]

Epoch [8/10], Step [601/938], Loss: 0.2090


701it [22:18,  1.91s/it]

Epoch [8/10], Step [701/938], Loss: 0.2801


801it [25:26,  1.87s/it]

Epoch [8/10], Step [801/938], Loss: 0.1834


901it [28:41,  1.83s/it]

Epoch [8/10], Step [901/938], Loss: 0.1878


938it [29:49,  1.91s/it]
100%|██████████| 60/60 [01:54<00:00,  1.90s/it]


Epoch [8/10], Accuxracy: 91.32%


1it [00:01,  1.83s/it]

Epoch [9/10], Step [1/938], Loss: 0.2330


101it [03:16,  1.85s/it]

Epoch [9/10], Step [101/938], Loss: 0.1637


201it [06:27,  1.82s/it]

Epoch [9/10], Step [201/938], Loss: 0.1829


301it [09:40,  1.79s/it]

Epoch [9/10], Step [301/938], Loss: 0.1997


401it [12:50,  2.01s/it]

Epoch [9/10], Step [401/938], Loss: 0.1731


501it [15:59,  1.91s/it]

Epoch [9/10], Step [501/938], Loss: 0.2269


601it [19:11,  1.96s/it]

Epoch [9/10], Step [601/938], Loss: 0.2104


701it [22:23,  1.84s/it]

Epoch [9/10], Step [701/938], Loss: 0.2674


801it [25:36,  2.25s/it]

Epoch [9/10], Step [801/938], Loss: 0.2084


901it [28:42,  1.78s/it]

Epoch [9/10], Step [901/938], Loss: 0.1490


938it [29:51,  1.91s/it]
100%|██████████| 60/60 [01:53<00:00,  1.89s/it]


Epoch [9/10], Accuxracy: 91.58%


1it [00:01,  1.79s/it]

Epoch [10/10], Step [1/938], Loss: 0.2219


101it [03:08,  1.85s/it]

Epoch [10/10], Step [101/938], Loss: 0.2062


201it [06:15,  1.79s/it]

Epoch [10/10], Step [201/938], Loss: 0.1736


301it [09:23,  1.81s/it]

Epoch [10/10], Step [301/938], Loss: 0.2522


401it [12:39,  2.16s/it]

Epoch [10/10], Step [401/938], Loss: 0.1373


501it [15:44,  1.81s/it]

Epoch [10/10], Step [501/938], Loss: 0.2020


601it [18:53,  1.84s/it]

Epoch [10/10], Step [601/938], Loss: 0.1396


701it [21:56,  1.83s/it]

Epoch [10/10], Step [701/938], Loss: 0.2342


801it [25:05,  1.86s/it]

Epoch [10/10], Step [801/938], Loss: 0.1901


901it [28:17,  2.07s/it]

Epoch [10/10], Step [901/938], Loss: 0.1652


938it [29:24,  1.88s/it]
100%|██████████| 60/60 [01:53<00:00,  1.89s/it]

Epoch [10/10], Accuxracy: 91.72%
Training complete





In [None]:
# predict on the test set
predictions = classifier.predict(test_tfidf)

# calculate the classification report
print(classification_report(test.label, predictions))