In [1]:
import torch
import torchtext
from torchtext import data

TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

I0327 14:43:52.492445 139798818404160 file_utils.py:32] TensorFlow version 2.1.0 available.
I0327 14:43:52.493355 139798818404160 file_utils.py:39] PyTorch version 1.4.0 available.
I0327 14:43:53.076410 139798818404160 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [4]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [5]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [6]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [7]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# print(TEXT.vocab.stoi)   

I0327 14:44:36.156149 139798818404160 vocab.py:431] Loading vectors from .vector_cache/glove.6B.100d.txt.pt


Size of TEXT vocabulary: 11540
Size of LABEL vocabulary: 4
[('the', 17397), (',', 14584), ('.', 13170), ('-', 10010), ('to', 9628), ('a', 9619), ('of', 8837), ('in', 7572), ('and', 6630), ('on', 4786)]


In [8]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [9]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

In [138]:
class TextClassification(nn.Module):
    def __init__(self, batch_size, vocabulary_size, embedding_size, hidden_size, num_layers, bidirectional, dropout, num_class):
        super().__init__()
        self.batch_size = batch_size
        self.dropout = nn.Dropout(p=dropout)
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.lstm_layer = num_layers
        
        self.rnn = nn.LSTM(self.embedding.embedding_dim, self.hidden_size, bidirectional=bidirectional, batch_first=True)
        
        if bidirectional:
            self.W_s1 = nn.Linear(self.hidden_size*2, 350)
            self.W_s2 = nn.Linear(350, 30)
            self.fc1 = nn.Linear(30*self.hidden_size*2,
                                               2000)
            
            self.fc2 = nn.Linear(2000, num_class)
        
        
        else:
            self.W_s1 = nn.Linear(self.hidden_size, 350)
            self.W_s2 = nn.Linear(350, 30)
            self.fc1 = nn.Linear(30*self.hidden_size,
                                               2000)
            self.fc2 = nn.Linear(2000, num_class)
            
        self.softmax = nn.LogSoftmax(dim=-2)
#         self.init_weights()
        
#     def init_weights(self):
#         initrange = 0.5
        
#         for rnn in [self.rnn]:
#             for name, param in rnn.named_parameters():
#                 if 'bias' in name:
#                     nn.init.constant_(param, 0.0)
#                 elif 'weight' in name:
#                     nn.init.xavier_normal_(param)
        
#         for fc in [self.fc1, self.fc2]:
#             fc.weight.data.uniform_(-initrange, initrange)
#             fc.bias.data.zero_()

    def forward(self, text, lengths):
        emb = self.embedding(text)
        emb = self.dropout(emb)
        
        packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        
        output, (final_hidden_state, final_cell_state) = self.rnn(packed_emb)
        x, lengths = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        attn_weight_mat = self.W_s2(F.tanh(self.W_s1(x)))
        attn_weight_mat = attn_weight_mat.permute(0, 2, 1)
        attn_weight_mat = F.softmax(attn_weight_mat, dim=2)
        
        hidden_mat = torch.bmm(attn_weight_mat, x)
         
        fc_1 = self.fc1(hidden_mat.view(-1, hidden_mat.size()[1]*hidden_mat.size()[2]))
        fc_2 = self.fc2(self.dropout(fc_1))
        return self.softmax(fc_2)
        

In [139]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 100
hidden_counts = 75
n_layers = 2
bidirectional = True
dropout = 0.2

model = TextClassification(batch_size, vocabulary_size, embedding_size, hidden_counts, n_layers, bidirectional, dropout, n_class)

In [140]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7893,  0.6285, -0.4647,  ...,  0.0823,  0.8793, -0.0854],
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.6916,  0.7388,  0.4807,  ..., -0.0513,  0.6089, -0.1047]])

In [141]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [142]:
import time

n_epochs = 5
min_val_loss = float("inf")

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
path='AG_news/model/saved_weights_self_attention_model.pt'

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), path)

Epoch: 1  | time in 2 minutes, 0 seconds
	Loss: 0.8056(train)	|	Acc: 70.34%(train)
	Loss: 0.5948(valid)	|	Acc: 79.41%(valid)
Epoch: 2  | time in 1 minutes, 54 seconds
	Loss: 0.5125(train)	|	Acc: 82.58%(train)
	Loss: 0.5064(valid)	|	Acc: 82.50%(valid)
Epoch: 3  | time in 1 minutes, 57 seconds
	Loss: 0.3718(train)	|	Acc: 86.98%(train)
	Loss: 0.4195(valid)	|	Acc: 85.82%(valid)
Epoch: 4  | time in 2 minutes, 1 seconds
	Loss: 0.3050(train)	|	Acc: 89.63%(train)
	Loss: 0.3903(valid)	|	Acc: 86.52%(valid)
Epoch: 5  | time in 1 minutes, 58 seconds
	Loss: 0.2631(train)	|	Acc: 90.77%(train)
	Loss: 0.3949(valid)	|	Acc: 87.04%(valid)


In [143]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [144]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [145]:
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [146]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 86.37


In [147]:
print(model)

TextClassification(
  (dropout): Dropout(p=0.2, inplace=False)
  (embedding): Embedding(11540, 100)
  (rnn): LSTM(100, 75, batch_first=True, bidirectional=True)
  (W_s1): Linear(in_features=150, out_features=350, bias=True)
  (W_s2): Linear(in_features=350, out_features=30, bias=True)
  (fc1): Linear(in_features=4500, out_features=2000, bias=True)
  (fc2): Linear(in_features=2000, out_features=4, bias=True)
  (softmax): LogSoftmax()
)
