In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the data files read later in this notebook
# are addressed through paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from gensim.models import KeyedVectors

# Load the word vectors from the mounted Drive; binary=True selects the binary
# word2vec file format.
word_vectors = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/data/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Register an 'UNK' entry at index 0 in the word -> index mapping. Every existing
# entry is kept unchanged (and would override 'UNK' if that key were already present).
# NOTE(review): no vector row is added and existing indices are not shifted, so 'UNK'
# shares index 0 with whichever key already maps to 0 — confirm this is intended.
word_vectors.key_to_index = {'UNK': 0, **word_vectors.key_to_index}

In [None]:
import pandas as pd

def _read_split(path):
    """Read one header-less, tab-separated split file.

    Args:
        path: Location of the file to read.

    Returns:
        A DataFrame whose two columns are named ``title`` and ``category``.

    Raises:
        ValueError: If the file does not contain exactly two columns.
    """
    frame = pd.read_csv(path, header=None, sep='\t')
    frame.columns = ['title', 'category']
    return frame


train_data = _read_split('/content/drive/MyDrive/data/train.txt')
val_data = _read_split('/content/drive/MyDrive/data/val.txt')
test_data = _read_split('/content/drive/MyDrive/data/test.txt')

In [None]:
# Tokenise each title on whitespace, overwriting the 'title' column in place so it
# holds a list of tokens per row instead of the raw string.
# NOTE(review): because the column is overwritten, this cell is presumably not safe
# to run twice on the same frames — confirm, or reload the data before re-running.
train_data['title'] = train_data['title'].str.split()
val_data['title'] = val_data['title'].str.split()
test_data['title'] = test_data['title'].str.split()

In [None]:
# Pull the underlying arrays out of each split: the tokenised titles are the
# inputs (X_*), the category column is the target (y_*).
X_train = train_data['title'].values
y_train = train_data['category'].values

X_val = val_data['title'].values
y_val = val_data['category'].values

X_test = test_data['title'].values
y_test = test_data['category'].values

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    """Map-style dataset pairing tokenised titles with their category labels.

    Indexing returns ``(indices, label)``:

    * ``indices`` - long tensor holding the vocabulary index of every token in
      the title; tokens missing from ``word_to_index`` are mapped to 0.
    * ``label`` - long tensor of shape ``(1,)`` wrapping ``y[idx]``.
    """

    def __init__(self, X, y, word_to_index: dict):
        super().__init__()
        self.word_to_index = word_to_index
        self.X = X
        self.y = y

    def __len__(self):
        # The number of samples is defined by the label collection.
        return len(self.y)

    def __getitem__(self, idx):
        lookup = self.word_to_index.get
        token_ids = torch.tensor(
            [lookup(token, 0) for token in self.X[idx]],
            dtype=torch.long,
        )
        label = torch.tensor([self.y[idx]], dtype=torch.long)
        return token_ids, label

In [None]:
def pad_sequence(data, max_length, pad_value=0):
    """Pad or truncate every sample to exactly ``max_length`` items.

    Samples shorter than ``max_length`` are right-padded with ``pad_value``;
    longer samples are cut off after ``max_length`` items. The inputs are never
    modified: each result is a freshly built ``list``.

    Args:
        data: Iterable of samples; each sample may be any iterable (list,
            tuple, array, ...), not only a ``list``.
        max_length: Target length of every returned sample (non-negative).
        pad_value: Filler appended to short samples. Defaults to 0.

    Returns:
        A list with one ``list`` of length ``max_length`` per input sample.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        # A negative bound would otherwise slice items off the end of each sample.
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    padded_data = []
    for sample in data:
        # list(...) makes tuples/arrays behave like lists instead of failing or
        # being added element-wise when the padding is attached.
        items = list(sample)[:max_length]
        items.extend([pad_value] * (max_length - len(items)))
        padded_data.append(items)
    return padded_data

# Longest title (in tokens) across all three splits; every sample is padded to
# this common length so that batches can be stacked into one tensor.
max_length = max(
    len(sample)
    for split in (X_train, X_val, X_test)
    for sample in split
)

# Padding appends the integer 0 to the token lists. NewsDataset maps anything
# that is not in the vocabulary to index 0, so the pad entries resolve to index 0.
X_train_padded = pad_sequence(X_train, max_length)
X_val_padded = pad_sequence(X_val, max_length)
X_test_padded = pad_sequence(X_test, max_length)

train_dataset = NewsDataset(X_train_padded, y_train, word_vectors.key_to_index)
val_dataset = NewsDataset(X_val_padded, y_val, word_vectors.key_to_index)
test_dataset = NewsDataset(X_test_padded, y_test, word_vectors.key_to_index)

# Only the training batches are reshuffled every epoch. Validation and test
# loaders keep the file order so evaluation passes are repeatable and
# predictions stay aligned with the source rows.
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=100, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [None]:
from torch.nn import Module

class SingleLayerNN(Module):
    """Mean-of-embeddings classifier with a single linear output layer.

    Args:
        vocab_size: Number of embedding rows; used only when no pretrained
            vectors are supplied.
        embedding_dim: Embedding width; used only when no pretrained vectors
            are supplied (otherwise the width of the pretrained matrix wins).
        output_dim: Number of output classes (size of the logit vector).
        pretrained_word2vec: Optional object exposing a 2-D ``vectors`` array
            of shape ``(vocab, dim)``. When given, the embedding layer is
            initialised from it through ``nn.Embedding.from_pretrained``,
            which freezes the weights by default. ``None`` creates a
            trainable, randomly initialised embedding.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, output_dim: int, pretrained_word2vec=None):
        super().__init__()
        if pretrained_word2vec is not None:
            weights = torch.tensor(pretrained_word2vec.vectors, dtype=torch.float32)
            self.embedding = nn.Embedding.from_pretrained(weights)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Size the classifier from the embedding that was actually built, so a
        # pretrained matrix whose width differs from `embedding_dim` still works.
        self.fc = nn.Linear(self.embedding.embedding_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch of index sequences.

        Args:
            x: Long tensor of vocabulary indices, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(x)
        # Average over the sequence axis; every position contributes equally.
        pooled = torch.mean(embedded, dim=1)
        return self.fc(pooled)

In [None]:
# Model hyper-parameters: one embedding row per known key, 300-dimensional
# vectors, and four output classes.
vocab_size = len(word_vectors.index_to_key)
embedding_dim = 300
output_dim = 4

model = SingleLayerNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    pretrained_word2vec=word_vectors,
)

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.005)


In [None]:
num_epochs = 1000
losses = []       # mean training loss per epoch (plain floats)
accuracies = []   # training accuracy per epoch (plain floats)
checkpoint_path = '/content/drive/MyDrive/data/model.pt'

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    total_correct = 0
    total_samples = 0
    for inputs, labels in train_dataloader:
        # The dataset yields one-element label tensors, so a batch arrives as
        # (batch, 1); flatten it to the (batch,) shape the loss expects.
        labels = labels.view(-1).type(torch.long)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # Accumulate Python numbers only. Storing the `loss` tensor itself would
        # keep its autograd graph alive and mix tensors into the per-epoch
        # `losses` history.
        running_loss += loss.item()
        total_correct += (outputs.argmax(1) == labels).sum().item()
        total_samples += labels.size(0)

    # Average of the per-batch mean losses, and the fraction of correct predictions.
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = total_correct / total_samples

    losses.append(epoch_loss)
    accuracies.append(epoch_accuracy)

    # Report loss and accuracy after each epoch
    print(f'Epoch {epoch + 1}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy}')

    # Save checkpoints
    # checkpoint = {
    #     'epoch': epoch,
    #     'model_state_dict': model.state_dict(),
    #     'optimizer_state_dict': optimizer.state_dict(),
    #     'loss': loss
    # }
    # torch.save(checkpoint, checkpoint_path)

Epoch 1, Loss: 1.316219207218715, Accuracy: 0.4884519946554686
Epoch 2, Loss: 1.2158273424421038, Accuracy: 0.5874212635999236
Epoch 3, Loss: 1.163972172282991, Accuracy: 0.6380988738308837
Epoch 4, Loss: 1.1345634687514532, Accuracy: 0.667398358465356
Epoch 5, Loss: 1.115523614202227, Accuracy: 0.6997518610421837
Epoch 6, Loss: 1.100553706146422, Accuracy: 0.7049055163199084
Epoch 7, Loss: 1.0880420622371492, Accuracy: 0.7129223134185914
Epoch 8, Loss: 1.0762861138298399, Accuracy: 0.7221798053063562
Epoch 9, Loss: 1.065067043758574, Accuracy: 0.7338232487115862
Epoch 10, Loss: 1.054884770370665, Accuracy: 0.7330597442259973
Epoch 11, Loss: 1.0444533251580739, Accuracy: 0.7355411338041611
Epoch 12, Loss: 1.034480265208653, Accuracy: 0.7383088375644207
Epoch 13, Loss: 1.0249094877924239, Accuracy: 0.741362855506776
Epoch 14, Loss: 1.0154406388600667, Accuracy: 0.7420309219316663
Epoch 15, Loss: 1.0064338825997852, Accuracy: 0.7418400458102692
Epoch 16, Loss: 0.996932974315825, Accuracy

In [None]:
import matplotlib.pyplot as plt

def plot_loss_accuracy(losses, accuracies):
    """Show the training loss and training accuracy curves side by side.

    Args:
        losses: Sequence of loss values, plotted in the left panel.
        accuracies: Sequence of accuracy values, plotted in the right panel.
    """
    _, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: loss curve.
    loss_ax.plot(losses, label='Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training Loss')

    # Right panel: accuracy curve.
    accuracy_ax.plot(accuracies, label='Training Accuracy')
    accuracy_ax.set_xlabel('Epoch')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.set_title('Training Accuracy')

    plt.show()
