In [4]:
import torch
import torchtext
from torchtext import data

TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

I0323 07:25:04.268230 139862734321472 file_utils.py:32] TensorFlow version 2.1.0 available.
I0323 07:25:04.269010 139862734321472 file_utils.py:39] PyTorch version 1.4.0 available.
I0323 07:25:05.923982 139862734321472 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [10]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [11]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [15]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [16]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

I0323 07:32:59.541369 139862734321472 vocab.py:351] Downloading vectors from http://nlp.stanford.edu/data/glove.6B.zip
.vector_cache/glove.6B.zip: 862MB [18:19, 948kB/s]                                 
I0323 07:51:19.504166 139862734321472 vocab.py:362] Extracting vectors into .vector_cache
I0323 07:52:34.352173 139862734321472 vocab.py:374] Loading vectors from .vector_cache/glove.6B.100d.txt
100%|█████████▉| 399997/400000 [00:20<00:00, 21124.61it/s]I0323 07:53:03.069539 139862734321472 vocab.py:426] Saving vectors to .vector_cache/glove.6B.100d.txt.pt


Size of TEXT vocabulary: 11638
Size of LABEL vocabulary: 4
[('the', 17413), (',', 14549), ('.', 13204), ('-', 9737), ('to', 9629), ('a', 9539), ('of', 8965), ('in', 7630), ('and', 6627), ('on', 4702)]


In [18]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [143]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class TextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, hidden_size, num_layers, bidirectional, dropout, num_class):
        super().__init__()
        self.dropout = dropout
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=bidirectional,
                          dropout=dropout, batch_first=True)
        
        if bidirectional:
            self.fc = nn.Linear(hidden_size*2, num_class)
        else:
            self.fc = nn.Linear(hidden_size, num_class)
        
        self.softmax = nn.LogSoftmax(dim=-1)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        
        for name, param in self.rnn.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
        
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, text_lengths):
        emb = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(emb, text_lengths,batch_first=True)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        
        fc = self.fc(hidden)
        return self.softmax(fc)
        

In [144]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 100
hidden_counts = 75
n_layers = 2
bidirectional = True
dropout = 0.4

model = TextClassification(vocabulary_size, embedding_size, hidden_counts, n_layers, bidirectional, dropout, n_class)

In [145]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0733,  1.1329,  0.1348,  ..., -0.7117,  0.7464,  0.3307],
        [ 0.1259, -0.0748,  0.1768,  ..., -0.1319,  0.8167, -0.2869],
        [ 0.4204,  0.3731,  1.0618,  ..., -0.7836, -0.0121,  0.1724]])

In [146]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [148]:
import time

n_epochs = 5
min_val_loss = float("inf")

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), 'AG_news/model/saved_weights.pt')

Epoch: 1  | time in 2 minutes, 17 seconds
	Loss: 1.2790(train)	|	Acc: 40.68%(train)
	Loss: 0.8309(valid)	|	Acc: 65.19%(valid)
Epoch: 2  | time in 2 minutes, 21 seconds
	Loss: 0.6307(train)	|	Acc: 76.13%(train)
	Loss: 0.4266(valid)	|	Acc: 84.97%(valid)
Epoch: 3  | time in 1 minutes, 57 seconds
	Loss: 0.3876(train)	|	Acc: 86.25%(train)
	Loss: 0.4024(valid)	|	Acc: 85.89%(valid)
Epoch: 4  | time in 2 minutes, 8 seconds
	Loss: 0.3007(train)	|	Acc: 89.74%(train)
	Loss: 0.4084(valid)	|	Acc: 86.18%(valid)
Epoch: 5  | time in 2 minutes, 9 seconds
	Loss: 0.2452(train)	|	Acc: 91.76%(train)
	Loss: 0.3441(valid)	|	Acc: 88.14%(valid)


In [153]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [154]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [155]:
path='AG_news/model/saved_weights.pt'
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [152]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 87.94
