<h3>Classify news articles into categories using TorchText</h3>
<p>The notebook includes the steps to build a text classifier using AG News dataset in TorchText. This also shows how to use TorchText functions to explore datasets for NLP and reduce overhead of preprocessing.</p>

In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os

In [2]:
data_dir = "pytorch_text_classification"
NGRAMS = 2

if not os.path.isdir(data_dir):
    os.mkdir(data_dir)
    
# Download and load dataset
train_dataset, test_dataset = text_classification.DATASETS["AG_NEWS"](root=data_dir, ngrams=NGRAMS, vocab=None)

120000lines [00:07, 15820.59lines/s]
120000lines [00:14, 8298.87lines/s]
7600lines [00:01, 7440.87lines/s]


In [3]:
train_dataset[0]

(2,
 tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
            1143,      14,      32,      15,      32,      16,  443749,       4,
             572,     499,      17,      10,  741769,       7,  468770,       4,
              52,    7019,    1050,     442,       2,   14341,     673,  141447,
          326092,   55044,    7887,     411,    9870,  628642,      43,      44,
             144,     145,  299709,  443750,   51274,     703,   14312,      23,
         1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
            4052]))

In [4]:
import torch.nn as nn
import torch.nn.functional as F

# Define a class for text
class TextTopic(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.output = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.output.weight.data.uniform_(-init_range, init_range)
        self.output.bias.data.zero_()

    def forward(self, text, offsets):
        # A row of data is array of word indexes
        # text is concat of index arrays in a batch of data
        # If there are three texts with lengths 3, 4, 6
        # then, length of text: 13 and offsets: [0, 3, 7]
        embedded = self.embedding(text, offsets)
        output = self.output(embedded)
        return output

In [5]:
BATCH_SIZE = 16
EMBED_DIM = 32
N_EPOCHS = 5
min_valid_loss = float('inf')

In [6]:
import time
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

class TextClassifier(object):
    def __init__(self, train_dataset, test_dataset):
        # storage of all words in a train dataset
        self.vocab = train_dataset.get_vocab()
        
        # There are 4 classes in ag-news-dataset
        self.num_classes = len(train_dataset.get_labels())
        
        train_len = int(len(train_dataset) * 0.9)
        valid_len = len(train_dataset) - train_len
        
        self.train_data, self.valid_data = random_split(train_dataset, [train_len, valid_len])
        self.test_data = test_dataset
        print("Train data: {0}, Validation data: {1}, Test data: {2}"
              .format(len(self.train_data), len(self.valid_data), len(self.test_data)))

        # Initialize model, optimizer and loss
        self.model = TextTopic(len(self.vocab), EMBED_DIM, self.num_classes)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=4.0)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.9)
        
    def generate_batch(self, batch):
        # Generate features for one batch
        label = torch.tensor([entry[0] for entry in batch])
        text = [entry[1] for entry in batch]
        
        offsets = [0] + [len(entry) for entry in text]
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

        text = torch.cat(text)
        return text, offsets, label    

    def train_util(self):
        train_loss, train_acc = 0, 0        
        data = DataLoader(self.train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=self.generate_batch)

        for i, (text, offsets, cls) in enumerate(data):
            # Train the model with one batch of data
            self.optimizer.zero_grad()

            output = self.model(text, offsets)
            loss = self.criterion(output, cls)
            train_loss += loss.item()

            loss.backward()
            self.optimizer.step()
            train_acc += (output.argmax(1) == cls).sum().item()

        # Adjust the learning rate
        self.scheduler.step()

        return train_loss / len(self.train_data), train_acc / len(self.train_data)

    def evaluate_util(self, test_data):
        loss, acc = 0, 0
        data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=self.generate_batch)

        for text, offsets, cls in data:
            # Evaluate model without gradient
            with torch.no_grad():
                output = self.model(text, offsets)
                loss = self.criterion(output, cls)
                loss += loss.item()
                acc += (output.argmax(1) == cls).sum().item()

        return loss / len(test_data), acc / len(test_data)    
    
    def train(self):
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            train_loss, train_acc = self.train_util()
            valid_loss, valid_acc = self.evaluate_util(self.valid_data)

            secs = int(time.time() - start_time)
            mins = secs / 60
            secs = secs % 60

            print('\nEpoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
            print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
            print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
            
    def evaluate(self):
        print('Checking the results of test dataset...')
        test_loss, test_acc = self.evaluate_util(self.test_data)
        print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')
        
    def predict(self, text, ngrams=2):
        tokenizer = get_tokenizer("basic_english")
        
        with torch.no_grad():
            # Create feature row for prediction
            text = [self.vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)]

            # Get output from the model
            output = self.model(torch.tensor(text), torch.tensor([0]))
            result = output.argmax(1).item() + 1 
            
        return result    

In [7]:
model = TextClassifier(train_dataset, test_dataset)
model.train()

Train data: 108000, Validation data: 12000, Test data: 7600

Epoch: 1  | time in 0 minutes, 21 seconds
	Loss: 0.0264(train)	|	Acc: 84.5%(train)
	Loss: 0.0001(valid)	|	Acc: 90.0%(valid)

Epoch: 2  | time in 0 minutes, 20 seconds
	Loss: 0.0119(train)	|	Acc: 93.6%(train)
	Loss: 0.0001(valid)	|	Acc: 90.8%(valid)

Epoch: 3  | time in 0 minutes, 22 seconds
	Loss: 0.0069(train)	|	Acc: 96.4%(train)
	Loss: 0.0002(valid)	|	Acc: 90.5%(valid)

Epoch: 4  | time in 0 minutes, 21 seconds
	Loss: 0.0038(train)	|	Acc: 98.2%(train)
	Loss: 0.0002(valid)	|	Acc: 91.0%(valid)

Epoch: 5  | time in 0 minutes, 24 seconds
	Loss: 0.0022(train)	|	Acc: 99.0%(train)
	Loss: 0.0001(valid)	|	Acc: 91.0%(valid)


In [8]:
model.evaluate()

Checking the results of test dataset...
	Loss: 0.0003(test)	|	Acc: 90.7%(test)


In [9]:
ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

text = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

print("This is a %s news" % ag_news_label[model.predict(text)])

This is a Sports news
