In [1]:
import torch
import torchtext
from torchtext import data

max_seq_len = 50
TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

I0401 11:40:29.888482 140058632382272 file_utils.py:32] TensorFlow version 2.1.0 available.
I0401 11:40:29.889459 140058632382272 file_utils.py:39] PyTorch version 1.4.0 available.
I0401 11:40:30.201050 140058632382272 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [4]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [5]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [6]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [7]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# print(TEXT.vocab.stoi)   

I0401 11:41:13.653469 140058632382272 vocab.py:431] Loading vectors from .vector_cache/glove.6B.300d.txt.pt


Size of TEXT vocabulary: 11451
Size of LABEL vocabulary: 4
[('the', 17637), (',', 14374), ('.', 13166), ('-', 9804), ('to', 9667), ('a', 9448), ('of', 8929), ('in', 7588), ('and', 6643), ('on', 4687)]


In [8]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [45]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RCNNTextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, hidden_size_lstm, hidden_size_linear,
                 num_layers, bidirectional,
                 max_seq_len, dropout, num_class):
        super().__init__()
        self.embedding_size = embedding_size
        self.hidden_size_lstm = hidden_size_lstm
        self.hidden_size_linear = hidden_size_linear
        self.max_seq_len = max_seq_len
        
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        
        self.rnn = nn.LSTM(self.embedding_size, self.hidden_size_lstm, num_layers, bidirectional=bidirectional,
                          dropout=dropout, batch_first=True)
        
        self.dropout = nn.Dropout(dropout)
        
        if bidirectional:
            self.W = nn.Linear(self.embedding_size + 2*self.hidden_size_lstm, self.hidden_size_linear)
            
        else:
            self.W = nn.Linear(self.embedding_size + self.hidden_size_lstm, self.hidden_size_linear)
            
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(self.hidden_size_linear, num_class)
        
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, text, text_lengths):
        emb = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(emb, text_lengths, batch_first=True)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        x, lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        input_features = torch.cat([x, emb], 2)
        linear_output = self.tanh(self.W(input_features))
        
        linear_output = linear_output.permute(0,2,1) # Reshaping fot max_pool
        
        max_out_features = F.max_pool1d(linear_output, linear_output.shape[2]).squeeze(2)
        
        max_out_features = self.dropout(max_out_features)
        final_out = self.fc(max_out_features)
        
        return self.softmax(final_out)
        

In [51]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 300
hidden_size_lstm = 75
hidden_size_linear = 75
num_layers = 1
bidirectional = True
dropout = 0.6

model = RCNNTextClassification(vocabulary_size, embedding_size, hidden_size_lstm, hidden_size_linear, num_layers, bidirectional,
                 max_seq_len, dropout, n_class)

  "num_layers={}".format(dropout, num_layers))


In [52]:
model

RCNNTextClassification(
  (embedding): Embedding(11451, 300)
  (rnn): LSTM(300, 75, batch_first=True, dropout=0.6, bidirectional=True)
  (dropout): Dropout(p=0.6, inplace=False)
  (W): Linear(in_features=450, out_features=75, bias=True)
  (tanh): Tanh()
  (fc): Linear(in_features=75, out_features=4, bias=True)
  (softmax): LogSoftmax()
)

In [53]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 4.6560e-02,  2.1318e-01, -7.4364e-03,  ...,  9.0611e-03,
         -2.0989e-01,  5.3913e-02],
        ...,
        [-5.3067e-01,  3.0893e-02,  3.2893e-01,  ...,  3.0950e-01,
          3.6670e-01, -2.9955e-01],
        [-4.1813e-01,  4.8057e-01,  4.9454e-01,  ...,  4.2032e-04,
         -3.9901e-01,  1.8607e-01],
        [-4.8421e-01,  2.5875e-01, -9.6700e-02,  ..., -7.2311e-01,
          2.3865e-01,  3.1459e-01]])

In [54]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [55]:
import time

n_epochs = 15
min_val_loss = float("inf")
path='AG_news/model/saved_weights_rcnn.pt'

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), path)

Epoch: 1  | time in 1 minutes, 48 seconds
	Loss: 0.6566(train)	|	Acc: 75.63%(train)
	Loss: 0.4725(valid)	|	Acc: 83.16%(valid)
Epoch: 2  | time in 1 minutes, 42 seconds
	Loss: 0.5056(train)	|	Acc: 82.33%(train)
	Loss: 0.4334(valid)	|	Acc: 84.68%(valid)
Epoch: 3  | time in 1 minutes, 35 seconds
	Loss: 0.4509(train)	|	Acc: 84.25%(train)
	Loss: 0.4262(valid)	|	Acc: 85.07%(valid)
Epoch: 4  | time in 1 minutes, 37 seconds
	Loss: 0.4243(train)	|	Acc: 84.96%(train)
	Loss: 0.4171(valid)	|	Acc: 85.37%(valid)
Epoch: 5  | time in 1 minutes, 32 seconds
	Loss: 0.3846(train)	|	Acc: 86.39%(train)
	Loss: 0.3960(valid)	|	Acc: 86.21%(valid)
Epoch: 6  | time in 1 minutes, 37 seconds
	Loss: 0.3616(train)	|	Acc: 87.27%(train)
	Loss: 0.3951(valid)	|	Acc: 86.33%(valid)
Epoch: 7  | time in 1 minutes, 38 seconds
	Loss: 0.3373(train)	|	Acc: 88.24%(train)
	Loss: 0.3976(valid)	|	Acc: 86.25%(valid)
Epoch: 8  | time in 1 minutes, 35 seconds
	Loss: 0.3175(train)	|	Acc: 88.71%(train)
	Loss: 0.3836(valid)	|	Acc: 86.64%

In [56]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [57]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [58]:
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [59]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 86.58
