In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification

import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', vocab=None)
BATCH_SIZE = 16

ag_news_csv.tar.gz: 11.8MB [00:04, 2.52MB/s]
120000lines [00:08, 13810.63lines/s]
120000lines [00:17, 6743.83lines/s]
7600lines [00:01, 6452.86lines/s]


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Create Validation split

In [2]:
from torch.utils.data.dataset import random_split

train_size = int(len(train_dataset) * 0.95)

training_set, validation_set = random_split(train_dataset, [train_size, int(len(train_dataset)) - train_size])

In [185]:
# Create a generator function

def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    
    # Since text data has variable length find the offset to find starting index of a new sentence
    offset = [0] + [len(entry) for entry in text]
    
    offset = torch.tensor(offset[:-1]).cumsum(dim=0)
    
    text = torch.cat(text)
    return text, offset, label

In [200]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class TextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, num_class):
        super().__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.EmbeddingBag(vocabulary_size, embedding_size, sparse=True)
        self.fc = nn.Linear(embedding_size, num_class)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offset):
        emb = self.embedding(text, offset)
        fc = self.fc(emb)
        return self.softmax(fc)
        

In [201]:
from torch.utils.data import DataLoader

vocab_size = len(train_dataset.get_vocab())
emb_size = 100
num_class = len(train_dataset.get_labels())
batch_size = 32

model = TextClassification(vocab_size, emb_size, num_class).to(device)

In [202]:
def training(dataset, batch_size):
    training_loss = 0
    training_accuracy = 0
    
    data = DataLoader(dataset, batch_size, shuffle=True, collate_fn=generate_batch)
    
    for i, (text, offset, cls) in enumerate(data):
        optimizer.zero_grad()
        
        text, offset, cls = text.to(device), offset.to(device), cls.to(device)
        
        output = model(text, offset)
        
        loss = criterion(output, cls)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        training_accuracy += (output.argmax(1) == cls).sum().item()
    
    scheduler.step()
    
    return training_loss / len(dataset), training_accuracy / len(dataset)

def testing(dataset, batch):
    testing_loss = 0
    testing_accuracy = 0
    
    data = DataLoader(dataset, batch_size, shuffle=True, collate_fn=generate_batch)
    
    for text, offset, cls in data:
        text, offset, cls = text.to(device), offset.to(device), cls.to(device)
        
        with torch.no_grad():
            output = model(text, offset)
            loss = criterion(output, cls)
            
            testing_loss += loss.item()
            testing_accuracy += (output.argmax(1) == cls).sum().item()
    
    return testing_loss / len(dataset), testing_accuracy / len(dataset)

In [203]:
import time

n_epochs = 5
min_val_loss = float("inf")

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(train_dataset, batch_size)
    val_loss, val_acc = testing(validation_set, batch_size)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 28 seconds
	Loss: 0.0141(train)	|	Acc: 83.7%(train)
	Loss: 0.0071(valid)	|	Acc: 92.5%(valid)
Epoch: 2  | time in 0 minutes, 25 seconds
	Loss: 0.0070(train)	|	Acc: 92.6%(train)
	Loss: 0.0044(valid)	|	Acc: 95.6%(valid)
Epoch: 3  | time in 0 minutes, 31 seconds
	Loss: 0.0047(train)	|	Acc: 95.2%(train)
	Loss: 0.0030(valid)	|	Acc: 97.1%(valid)
Epoch: 4  | time in 0 minutes, 29 seconds
	Loss: 0.0031(train)	|	Acc: 96.9%(train)
	Loss: 0.0030(valid)	|	Acc: 96.8%(valid)
Epoch: 5  | time in 0 minutes, 31 seconds
	Loss: 0.0020(train)	|	Acc: 98.2%(train)
	Loss: 0.0014(valid)	|	Acc: 98.9%(valid)


In [204]:
print('Checking the results of test dataset...')
test_loss, test_acc = testing(test_dataset, batch_size)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0097(test)	|	Acc: 90.9%(test)
