In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install torchmetrics



In [4]:
import pandas as pd
from collections import Counter
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as f
import torch.optim as optim
import torch.nn as nn
from torchmetrics import Accuracy

train_data = pd.read_csv(r'/content/drive/MyDrive/data/train.txt', header=None, sep='\t')
train_data.columns = ['title', 'category']

val_data = pd.read_csv(r'/content/drive/MyDrive/data/val.txt', header=None, sep='\t')
val_data.columns = ['title', 'category']

test_data = pd.read_csv(r'/content/drive/MyDrive/data/test.txt', header=None, sep='\t')
test_data.columns = ['title', 'category']

In [None]:
def turning_words_into_numeric(df):
    data = ' '.join(df['title'].tolist())
    tokens = data.split()
    counter = Counter(tokens).most_common(len(tokens))
    title_int = {}
    for i, (token, freq) in enumerate(counter):
        if i == 0:
            title_int[token] = 1
        elif i == 1:
            title_int[token] = 2
        elif freq < 2:
            title_int[token] = 0
        else:
            title_int[token] = i + 1

    return title_int

In [None]:
train_title_int = turning_words_into_numeric(train_data)

train_title_int
train_data['title_int_id'] = train_data['title'].apply(lambda title: [train_title_int[word] for word in title.split()])

val_title_int = turning_words_into_numeric(val_data)
val_data['title_int_id'] = val_data['title'].apply(lambda title: [val_title_int[word] for word in title.split()])

test_title_int = turning_words_into_numeric(test_data)
test_data['title_int_id'] = test_data['title'].apply(lambda title: [test_title_int[word] for word in title.split()])

In [None]:
def pad_sequence(data, max_length):
    padded_data = []
    for sample in data:
        if len(sample) < max_length:
            padded_sample = sample + [0] * (max_length - len(sample))  # Padding with zeros
            padded_data.append(padded_sample)
        else:
            padded_data.append(sample[:max_length])  # Truncate if longer than max_length
    return padded_data


max_length = max(max(len(sample) for sample in train_data['title_int_id']),
                 max(len(sample) for sample in val_data['title_int_id']),
                 max(len(sample) for sample in test_data['title_int_id']))

train_data['title_int_id'] = pad_sequence(train_data['title_int_id'].values, max_length)
val_data['title_int_id'] = pad_sequence(val_data['title_int_id'].values, max_length)
test_data['title_int_id'] = pad_sequence(test_data['title_int_id'].values, max_length)

In [None]:
class NewsTitleDataset(Dataset):
    def __init__(self, data):
        self.X = data['title_int_id'].values
        self.y = data['category'].values

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

In [None]:
train_dataset = NewsTitleDataset(train_data)
val_dataset = NewsTitleDataset(val_data)
test_dataset = NewsTitleDataset(test_data)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=2)

In [None]:
class NewsClassify(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=2):
        super(NewsClassify,  self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embed = self.embedding(x)
        output, hidden = self.rnn(embed)
        out = self.fc(output[:, -1, :])

        return out

In [None]:
vocab_size = len(train_title_int)
embedding_dim = 300
hidden_dim = 50
model = NewsClassify(vocab_size + 1, embedding_dim, hidden_dim, 4)

# for features, labels in train_dataloader:
#     y = model(features)
#     print(labels)

In [None]:
optimizer = optim.SGD(params=model.parameters(), lr=0.005)
criteria = nn.CrossEntropyLoss()

train_accuracies = []
val_accuracies = []
test_accuracies = []

for epoch in range(10):
    model.train()
    for features, labels in train_dataloader:
        outputs = model(features)
        loss = criteria(outputs, labels.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

    model.eval()

    train_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in train_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            train_acc(predicts, labels.squeeze())

    train_accuracy = train_acc.compute()
    train_accuracies.append(train_accuracy)

    val_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            val_acc(predicts, labels.squeeze())

    val_accuracy = val_acc.compute()
    val_accuracies.append(val_accuracy)

    test_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in test_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            test_acc(predicts, labels.squeeze())

    test_accuracy = test_acc.compute()
    test_accuracies.append(test_accuracy)
    print(f"Epoch {epoch+1}, Train Accuracy: {train_accuracy}, Dev Accuracy: {val_accuracy}, Test Accuracy: {test_accuracy}")

  self.pid = os.fork()


Epoch 1, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 2, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 3, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 4, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 5, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 6, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 7, Train Accuracy: 0.42803969979286194, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.40229007601737976
Epoch 8, Train Accuracy: 0.42546287178993225, Dev Accuracy: 0.40229007601737976, Test Accuracy: 0.4030534327030182
Epoch 9, Train Accuracy: 0.42546287178993225, Dev Accuracy: 0.40229007601

In [None]:
from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/data/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
word_to_index = word_vectors.key_to_index
train_data['word_embed_id'] = train_data['title'].apply(lambda title: [word_to_index.get(word, 0) for word in title.split()])
val_data['word_embed_id'] = val_data['title'].apply(lambda title: [word_to_index.get(word, 0) for word in title.split()])
test_data['word_embed_id'] = test_data['title'].apply(lambda title: [word_to_index.get(word, 0) for word in title.split()])

In [None]:
train_data['word_embed_id'] = pad_sequence(train_data['word_embed_id'].values, max_length)
val_data['word_embed_id'] = pad_sequence(val_data['word_embed_id'].values, max_length)
test_data['word_embed_id'] = pad_sequence(test_data['word_embed_id'].values, max_length)

In [None]:
class NewsDataset(Dataset):
    def __init__(self, data):
        super(NewsDataset, self).__init__()
        self.X = data['word_embed_id']
        self.y = data['category']
        self.word_to_index = word_to_index

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor([self.y[idx]], dtype=torch.long)

    def __len__(self):
        return len(self.y)

In [None]:
train_dataset = NewsDataset(train_data)
val_dataset = NewsDataset(val_data)
test_dataset = NewsDataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=32, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=32, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=2)

In [None]:
class NewsClassify(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pretrained_word2vec: None, num_classes=2, num_layers=2):
        super(NewsClassify,  self).__init__()
        if pretrained_word2vec != None:
            weights = torch.FloatTensor(pretrained_word2vec.vectors)
            self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def forward(self, x):
        embed = self.embedding(x)
        output, hidden = self.rnn(embed)

        out = self.fc(output[:, -1, :])

        return out

In [None]:
vocab_size = len(word_to_index)
embedding_dim = 300
hidden_dim = 50
model = NewsClassify(vocab_size + 1, embedding_dim, hidden_dim, word_vectors, 4)

In [None]:
optimizer = optim.SGD(params=model.parameters(), lr=0.005)
criteria = nn.CrossEntropyLoss()

train_accuracies = []
val_accuracies = []
test_accuracies = []

for epoch in range(10):
    model.train()
    for features, labels in train_dataloader:
        outputs = model(features)
        loss = criteria(outputs, labels.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

    model.eval()

    train_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in train_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            train_acc(predicts, labels.squeeze())

    train_accuracy = train_acc.compute()
    train_accuracies.append(train_accuracy)

    val_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            val_acc(predicts, labels.squeeze())

    val_accuracy = val_acc.compute()
    val_accuracies.append(val_accuracy)

    test_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in test_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            test_acc(predicts, labels.squeeze())

    test_accuracy = test_acc.compute()
    test_accuracies.append(test_accuracy)
    print(f"Epoch {epoch+1}, Train Accuracy: {train_accuracy}, Dev Accuracy: {val_accuracy}, Test Accuracy: {test_accuracy}")

  self.pid = os.fork()


Epoch 1, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 2, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 3, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 4, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 5, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 6, Train Accuracy: 0.42574918270111084, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 7, Train Accuracy: 0.42584463953971863, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 8, Train Accuracy: 0.42584463953971863, Dev Accuracy: 0.4068702161312103, Test Accuracy: 0.4038167893886566
Epoch 9, Train Accuracy: 0.42584463953971863, Dev Accuracy: 0.4068702161312103, Test Acc

In [None]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, pretrained_word2vec: None):
        super(CNNModel, self).__init__()
        self.dropout = nn.Dropout(0.9)
        if pretrained_word2vec != None:
            weights = torch.FloatTensor(pretrained_word2vec.vectors)
            self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.conv = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=3, padding=1, bias=False)
        self.pool = nn.MaxPool1d(kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(hidden_dim, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 10)
        self.fc4 = nn.Linear(10, num_classes)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x)) # shape: (batch_size, seq_len, embedding_dim)
        embedded = embedded.permute(0, 2, 1) # shape: (batch_size, embedding_dim, seq_len)
        conv_out = self.conv(embedded)  # shape: (batch_size, hidden_dim, seq_len)
        pooled = self.pool(conv_out) # shape: (batch_size, hidden_dim, seq_len)
        pooled, _ = torch.max(pooled, dim=2) # shape: (batch_size, hidden_dim)
        output = self.fc(pooled)  # shape: (batch_size, num_classes)
        output = self.fc2(output)
        output = self.fc3(output)
        output = self.fc4(output)

        return output


In [None]:
model = CNNModel(vocab_size, embedding_dim, hidden_dim, 4, word_vectors)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.006, momentum=0.9, weight_decay=0.001)

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1))
        loss.backward()
        optimizer.step()

    model.eval()

    train_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in train_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            train_acc(predicts, labels.squeeze())

    train_accuracy = train_acc.compute()
    train_accuracies.append(train_accuracy)

    val_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            val_acc(predicts, labels.squeeze())

    val_accuracy = val_acc.compute()
    val_accuracies.append(val_accuracy)

    test_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for features, labels in test_dataloader:
            outputs = model(features)
            predicts = torch.argmax(outputs, dim=1)
            test_acc(predicts, labels.squeeze())

    test_accuracy = test_acc.compute()
    test_accuracies.append(test_accuracy)
    print(f"Epoch {epoch+1}, Train Accuracy: {train_accuracy}, Dev Accuracy: {val_accuracy}, Test Accuracy: {test_accuracy}")

Epoch 1, Train Accuracy: 0.6953617334365845, Dev Accuracy: 0.676335871219635, Test Accuracy: 0.7099236845970154
Epoch 2, Train Accuracy: 0.7439396977424622, Dev Accuracy: 0.7221373915672302, Test Accuracy: 0.743511438369751
Epoch 3, Train Accuracy: 0.7543424367904663, Dev Accuracy: 0.7312977313995361, Test Accuracy: 0.7526717782020569
Epoch 4, Train Accuracy: 0.7728574275970459, Dev Accuracy: 0.7526717782020569, Test Accuracy: 0.7687022686004639
Epoch 5, Train Accuracy: 0.7731437087059021, Dev Accuracy: 0.7534351348876953, Test Accuracy: 0.7641221284866333
Epoch 6, Train Accuracy: 0.7807787656784058, Dev Accuracy: 0.761832058429718, Test Accuracy: 0.7740458250045776
Epoch 7, Train Accuracy: 0.7861233353614807, Dev Accuracy: 0.7679389119148254, Test Accuracy: 0.7793893218040466
Epoch 8, Train Accuracy: 0.7844054102897644, Dev Accuracy: 0.767175555229187, Test Accuracy: 0.7763358950614929
Epoch 9, Train Accuracy: 0.7924222350120544, Dev Accuracy: 0.7740458250045776, Test Accuracy: 0.7877

In [6]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = self.data.iloc[idx]['title']
        category = self.data.iloc[idx]['category']

        encoding = self.tokenizer(title, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, category

In [7]:
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = NewsDataset(train_data, tokenizer, max_length=100)
val_dataset = NewsDataset(val_data, tokenizer, max_length=100)
test_dataset = NewsDataset(test_data, tokenizer, max_length=100)

train_dataloader = DataLoader(train_dataset, batch_size = 128, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size = 128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size = 128, shuffle=True)

In [None]:
from torchmetrics import Accuracy

class NewsClassifier(nn.Module):
    def __init__(self, num_classes):
        super(NewsClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

model = NewsClassifier(num_classes=4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for input_ids, attention_mask, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    model.eval()

    train_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for input_ids, attention_mask, labels in train_dataloader:
            outputs = model(input_ids, attention_mask)
            predicts = torch.argmax(outputs, dim=1)
            train_acc(predicts, labels)

    train_accuracy = train_acc.compute()

    val_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            outputs = model(input_ids, attention_mask)
            predicts = torch.argmax(outputs, dim=1)
            val_acc(predicts, labels)

    val_accuracy = val_acc.compute()

    test_acc = Accuracy(task="multiclass", num_classes=4)
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            outputs = model(input_ids, attention_mask)
            predicts = torch.argmax(outputs, dim=1)
            test_acc(predicts, labels)

    test_accuracy = test_acc.compute()

    print(f"Epoch {epoch+1}, Train Accuracy: {train_accuracy}, Dev Accuracy: {val_accuracy}, Test Accuracy: {test_accuracy}")

