In [6]:
import torch
import torchtext
from torchtext import data

TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

I0324 08:01:07.866112 139780011603776 file_utils.py:32] TensorFlow version 2.1.0 available.
I0324 08:01:07.867089 139780011603776 file_utils.py:39] PyTorch version 1.4.0 available.
I0324 08:01:09.046508 139780011603776 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [9]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [10]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [11]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [12]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

I0324 08:01:48.349673 139780011603776 vocab.py:431] Loading vectors from .vector_cache/glove.6B.100d.txt.pt


Size of TEXT vocabulary: 11607
Size of LABEL vocabulary: 4
[('the', 17399), (',', 14418), ('.', 13227), ('-', 9702), ('a', 9559), ('to', 9521), ('of', 9008), ('in', 7552), ('and', 6613), ('on', 4698)]


In [13]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [45]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

class Attention(nn.Module):
    def __init__(self, hidden_size, batch_first=False):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.batch_first = batch_first
        
        self.attn_weights = nn.Parameter(torch.Tensor(1, hidden_size))
        
        stdv = 1.0 / np.sqrt(self.hidden_size)
        for weight in self.attn_weights:
            nn.init.uniform_(weight, -stdv, stdv)
            
    def get_mask(self):
        pass
    
    def forward(self, inputs, lengths):
        if self.batch_first:
            batch_size, max_len = inputs.size()[:2]
        else:
            max_len, batch_size = inputs.size()[:2]
            
        attn_weights = torch.bmm(inputs, self.attn_weights.permute(1, 0).unsqueeze(0).repeat(batch_size, 1, 1))
        
        attentions = F.softmax(F.relu(attn_weights.squeeze()), dim=-1)
        
        # create mask based on maximum length and pad 0s
        mask = torch.ones(attentions.size())
        
        for i, l in enumerate(lengths):
            if l < max_len:
                mask[i, l:] = 0
        
        # apply mask and normalize attention scores
        masked = attentions * mask
        
        _sums = masked.sum(-1).unsqueeze(-1)
        
        attentions = masked.div(_sums)
        
        # apply attention weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attentions

In [46]:
class TextClassification(nn.Module):
    def __init__(self, batch_size, vocabulary_size, embedding_size, hidden_size, num_layers, bidirectional, dropout, num_class):
        super().__init__()
        self.batch_size = batch_size
        self.dropout = nn.Dropout(p=dropout)
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.lstm_layer = num_layers
        
        self.rnn1 = nn.LSTM(self.embedding.embedding_dim, self.hidden_size, bidirectional=bidirectional, batch_first=True)
        
        if bidirectional:
            self.attn1 = Attention(self.hidden_size*2, batch_first=True)
            self.rnn2 = nn.LSTM(self.hidden_size*2, self.hidden_size,
                                bidirectional=bidirectional, batch_first=True)
            
            self.attn2 = Attention(self.hidden_size*2, batch_first=True)
            self.fc1 = nn.Sequential(nn.Linear(self.hidden_size*self.lstm_layer*2,
                                               self.hidden_size*self.lstm_layer*2),
                                     nn.BatchNorm1d(self.hidden_size*self.lstm_layer*2),
                                     nn.ReLU()) 
            
            self.fc2 = nn.Linear(self.hidden_size*self.lstm_layer*2, num_class)
        
        
        else:
            self.attn1 = Attention(self.hidden_size, batch_first=True)
            self.rnn2 = nn.LSTM(self.hidden_size, hidden_size, bidirectional=bidirectional, batch_first=True)
            self.attn2 = Attention(self.hidden_size, batch_first=True)
            self.fc1 = nn.Sequential(nn.Linear(self.hidden_size*self.lstm_layer,
                                               self.hidden_size*self.lstm_layer),
                                     nn.BatchNorm1d(self.hidden_size*self.lstm_layer),
                                     nn.ReLU()) 
            
            self.fc2 = nn.Linear(self.hidden_size*self.lstm_layer, num_class)
            
        self.softmax = nn.LogSoftmax(dim=-1)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        
        for rnn in [self.rnn1, self.rnn2]:
            for name, param in rnn.named_parameters():
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_normal_(param)
        
#         for fc in [self.fc1, self.fc2]:
#             fc.weight.data.uniform_(-initrange, initrange)
#             fc.bias.data.zero_()

    
    def forward(self, text, lengths):
        emb = self.embedding(text)
        emb = self.dropout(emb)
        
        packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        
        output1, (final_hidden_state, final_cell_state) = self.rnn1(packed_emb)
        
        x, lengths = nn.utils.rnn.pad_packed_sequence(output1, batch_first=True)
        x, _ = self.attn1(x, lengths)

        output2, (final_hidden_state, final_cell_state) = self.rnn2(output1)
        y, lengths = nn.utils.rnn.pad_packed_sequence(output2, batch_first=True)
        y, _ = self.attn2(y, lengths)
        
        z = torch.cat([x, y], dim=1)
        z = self.fc1(self.dropout(z))
        z = self.fc2(self.dropout(z))

        return self.softmax(z)
        

In [47]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 100
hidden_counts = 75
n_layers = 2
bidirectional = True
dropout = 0.4

model = TextClassification(batch_size, vocabulary_size, embedding_size, hidden_counts, n_layers, bidirectional, dropout, n_class)

In [48]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2059,  0.8924,  0.4946,  ..., -0.0663, -0.2220, -0.1561],
        [ 0.7893,  0.6285, -0.4647,  ...,  0.0823,  0.8793, -0.0854],
        [-0.3389,  0.2919,  0.2993,  ...,  0.2409,  0.2894, -0.6609]])

In [49]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [58]:
import time

n_epochs = 10
min_val_loss = float("inf")

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), 'AG_news/model/saved_weights_attention_model.pt')

Epoch: 1  | time in 2 minutes, 23 seconds
	Loss: 0.3760(train)	|	Acc: 88.05%(train)
	Loss: 0.4266(valid)	|	Acc: 87.03%(valid)
Epoch: 2  | time in 2 minutes, 15 seconds
	Loss: 0.3181(train)	|	Acc: 90.30%(train)
	Loss: 0.4093(valid)	|	Acc: 87.41%(valid)
Epoch: 3  | time in 2 minutes, 13 seconds
	Loss: 0.2675(train)	|	Acc: 91.89%(train)
	Loss: 0.4329(valid)	|	Acc: 87.34%(valid)
Epoch: 4  | time in 2 minutes, 15 seconds
	Loss: 0.2439(train)	|	Acc: 92.54%(train)
	Loss: 0.3922(valid)	|	Acc: 87.67%(valid)
Epoch: 5  | time in 2 minutes, 5 seconds
	Loss: 0.2192(train)	|	Acc: 93.24%(train)
	Loss: 0.3996(valid)	|	Acc: 87.90%(valid)
Epoch: 6  | time in 2 minutes, 17 seconds
	Loss: 0.2062(train)	|	Acc: 93.76%(train)
	Loss: 0.4730(valid)	|	Acc: 86.74%(valid)
Epoch: 7  | time in 2 minutes, 15 seconds
	Loss: 0.1897(train)	|	Acc: 93.85%(train)
	Loss: 0.4310(valid)	|	Acc: 88.06%(valid)
Epoch: 8  | time in 2 minutes, 11 seconds
	Loss: 0.1739(train)	|	Acc: 94.53%(train)
	Loss: 0.4570(valid)	|	Acc: 87.78%(

In [59]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [60]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [61]:
path='AG_news/model/saved_weights_attention_model.pt'
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [62]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 87.32


In [63]:
print(model)

TextClassification(
  (dropout): Dropout(p=0.4, inplace=False)
  (embedding): Embedding(11607, 100)
  (rnn1): LSTM(100, 75, batch_first=True, bidirectional=True)
  (attn1): Attention()
  (rnn2): LSTM(150, 75, batch_first=True, bidirectional=True)
  (attn2): Attention()
  (fc1): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (fc2): Linear(in_features=300, out_features=4, bias=True)
  (softmax): LogSoftmax()
)
