In [1]:
import torch
import torchtext
from torchtext import data

TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

I0325 08:12:00.750314 140435103909696 file_utils.py:32] TensorFlow version 2.1.0 available.
I0325 08:12:00.751554 140435103909696 file_utils.py:39] PyTorch version 1.4.0 available.
I0325 08:12:01.034390 140435103909696 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [4]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [5]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [7]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [14]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "fasttext.simple.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# print(TEXT.vocab.stoi)   

I0325 08:20:09.503787 140435103909696 vocab.py:431] Loading vectors from .vector_cache/wiki.simple.vec.pt


Size of TEXT vocabulary: 11561
Size of LABEL vocabulary: 4
[('the', 17182), (',', 14373), ('.', 13113), ('-', 9665), ('a', 9615), ('to', 9489), ('of', 8869), ('in', 7655), ('and', 6378), ('on', 4556)]


In [15]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

100%|█████████▉| 110761/111051 [00:30<00:00, 8445.62it/s]

In [128]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class TextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, num_class, lstm=True, hidden_size=100, num_layers=2, bidirectional=True, dropout=0.4):
        super().__init__()
        self.dropout = dropout
        self.lstm = lstm
        
        if lstm:
            self.embedding = nn.Embedding(vocabulary_size, embedding_size)
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=bidirectional,
                              dropout=dropout, batch_first=True)

            if bidirectional:
                self.fc = nn.Linear(hidden_size*2, num_class)
            else:
                self.fc = nn.Linear(hidden_size, num_class)
        
        else:
            self.embedding = nn.EmbeddingBag(vocabulary_size, embedding_size)
            self.fc = nn.Linear(embedding_size, num_class)
        
        self.softmax = nn.LogSoftmax(dim=-1)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        
        if self.lstm:
            for name, param in self.rnn.named_parameters():
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_normal_(param)
        
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, text_lengths):
        emb = self.embedding(text)
        
        if self.lstm:
            packed_embedded = nn.utils.rnn.pack_padded_sequence(emb, text_lengths, batch_first=True)
            packed_output, (hidden, cell) = self.rnn(packed_embedded)

            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)

            fc = self.fc(hidden)
        
        else:
            packed_embedded = nn.utils.rnn.pad_sequence(emb,batch_first=True)
            fc = self.fc(packed_embedded)
            
        return self.softmax(fc)
        

In [129]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 300
hidden_counts = 75
n_layers = 2
bidirectional = True
dropout = 0.4
lstm = True

model = TextClassification(vocabulary_size, embedding_size, n_class, lstm=lstm, hidden_size=hidden_counts, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)

In [130]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0104, -0.1829,  0.0761,  ..., -0.1362, -0.2240, -0.0552],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.6078,  0.2664,  0.2431,  ...,  0.2866,  0.1220,  0.1763],
        [ 0.6539, -0.0272, -0.4260,  ..., -0.0408,  0.5972, -0.1600]])

In [131]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [137]:
import time

n_epochs = 8
min_val_loss = float("inf")

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
if lstm:
    path='AG_news/model/saved_weights_fasttext_lstm.pt'
else:
    path='AG_news/model/saved_weights_fasttext.pt'
            
for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), path)

Epoch: 1  | time in 2 minutes, 24 seconds
	Loss: 0.3014(train)	|	Acc: 89.62%(train)
	Loss: 0.4076(valid)	|	Acc: 86.62%(valid)
Epoch: 2  | time in 2 minutes, 17 seconds
	Loss: 0.2233(train)	|	Acc: 92.46%(train)
	Loss: 0.4165(valid)	|	Acc: 86.96%(valid)
Epoch: 3  | time in 2 minutes, 18 seconds
	Loss: 0.1721(train)	|	Acc: 94.41%(train)
	Loss: 0.3858(valid)	|	Acc: 87.84%(valid)
Epoch: 4  | time in 2 minutes, 11 seconds
	Loss: 0.1348(train)	|	Acc: 95.75%(train)
	Loss: 0.4115(valid)	|	Acc: 87.74%(valid)
Epoch: 5  | time in 2 minutes, 21 seconds
	Loss: 0.1115(train)	|	Acc: 96.39%(train)
	Loss: 0.4441(valid)	|	Acc: 87.72%(valid)
Epoch: 6  | time in 2 minutes, 7 seconds
	Loss: 0.0872(train)	|	Acc: 97.44%(train)
	Loss: 0.4970(valid)	|	Acc: 87.74%(valid)
Epoch: 7  | time in 2 minutes, 17 seconds
	Loss: 0.0705(train)	|	Acc: 97.89%(train)
	Loss: 0.5082(valid)	|	Acc: 87.69%(valid)
Epoch: 8  | time in 2 minutes, 13 seconds
	Loss: 0.0561(train)	|	Acc: 98.51%(train)
	Loss: 0.5641(valid)	|	Acc: 87.66%(

In [138]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [139]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [140]:
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [141]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 87.60
