In [104]:
import torch
import torchtext
from torchtext import data

max_seq_len = 50
TEXT = data.Field(tokenize="spacy", batch_first=True, include_lengths=True, fix_length=max_seq_len)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [105]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [106]:
fields = [('label', LABEL), (None, None), ('text',TEXT)]

In [107]:
training_data=data.TabularDataset(path = 'AG_news/train.csv',format = 'csv',fields = fields,skip_header = True)

In [108]:
print(vars(training_data.examples[0]))

{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}


In [109]:
train_data, valid_data = training_data.split(split_ratio=0.1)

In [110]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# print(TEXT.vocab.stoi)   

I0401 08:49:42.976831 140345299633984 vocab.py:431] Loading vectors from .vector_cache/glove.6B.300d.txt.pt


Size of TEXT vocabulary: 11553
Size of LABEL vocabulary: 4
[('the', 17355), (',', 14475), ('.', 13211), ('-', 9804), ('a', 9516), ('to', 9420), ('of', 8873), ('in', 7689), ('and', 6488), ('on', 4694)]


In [111]:
batch_size = 64

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data), batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [112]:
# Create neural network representation
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNNTextClassification(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, max_seq_len, out_channels,
                 kernel_heights, dropout, num_class):
        super().__init__()
        self.out_channels = out_channels
        self.kernel_heights = kernel_heights
        self.embedding_size = embedding_size
        self.max_seq_len = max_seq_len
        
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        
        self.conv1 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[0]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[0]+1))
        
        self.conv2 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[1]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[1]+1))
        
        self.conv3 = nn.Sequential(nn.Conv1d(in_channels=self.embedding_size, out_channels=self.out_channels,
                               kernel_size=self.kernel_heights[2]),
                                   nn.ReLU(),
                                  nn.MaxPool1d(self.max_seq_len - self.kernel_heights[2]+1))
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(len(self.kernel_heights) * out_channels, num_class)
        
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, text, text_lengths):
        emb = self.embedding(text).permute(0, 2, 1)
        
        conv_out1 = self.conv1(emb).squeeze(2)
        conv_out2 = self.conv2(emb).squeeze(2)
        conv_out3 = self.conv3(emb).squeeze(2)
        
        all_out = torch.cat((conv_out1, conv_out2, conv_out3), 1)
        final_feature_map = self.dropout(all_out)
        
        final_out = self.fc(final_feature_map)
        
        return self.softmax(final_out)
        

In [113]:
vocabulary_size = len(TEXT.vocab)
n_class = len(LABEL.vocab)
embedding_size = 300
out_channels = 100
kernel_heights = [3, 4, 5]
dropout = 0.4

model = CNNTextClassification(vocabulary_size, embedding_size, max_seq_len,
                              out_channels, kernel_heights, dropout, n_class)

In [114]:
model

CNNTextClassification(
  (embedding): Embedding(11553, 300)
  (conv1): Sequential(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=48, stride=48, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=47, stride=47, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=46, stride=46, padding=0, dilation=1, ceil_mode=False)
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=300, out_features=4, bias=True)
  (softmax): LogSoftmax()
)

In [115]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [-0.4171,  0.2261, -0.0720,  ..., -0.8584,  0.2861,  0.1984],
        [-0.2022,  0.2169, -0.1251,  ..., -0.2296, -0.1567,  0.1279],
        [ 0.4281,  0.4327,  0.0331,  ..., -0.3796,  0.2552,  0.0976]])

In [116]:
def training(model, iterator, optimizer, criterion):
    training_loss = 0
    training_accuracy = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        output = model(text, text_lengths).squeeze()
        
        loss = criterion(output, target)
        
        training_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()

        acc = num_corrects/len(batch)
        training_accuracy += acc.item()

    scheduler.step()
    
    return training_loss / len(iterator), training_accuracy / len(iterator)

def testing(model, iterator, optimizer, criterion):
    testing_loss = 0
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        target = batch.label
        target = torch.autograd.Variable(target).long()
        
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            loss = criterion(output, target)
            
            testing_loss += loss.item()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects/len(batch)
        
            testing_accuracy += acc.item()
            
    return testing_loss / len(iterator), testing_accuracy / len(iterator)

In [118]:
import time

n_epochs = 15
min_val_loss = float("inf")
path='AG_news/model/saved_weights_cnn.pt'

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(n_epochs):
    start_time = time.time()
    
    train_loss, train_acc = training(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = testing(model, valid_iterator, optimizer, criterion)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.2f}%(train)')
    print(f'\tLoss: {val_loss:.4f}(valid)\t|\tAcc: {val_acc * 100:.2f}%(valid)')
    
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), path)

Epoch: 1  | time in 1 minutes, 21 seconds
	Loss: 0.9035(train)	|	Acc: 62.52%(train)
	Loss: 0.7674(valid)	|	Acc: 73.22%(valid)
Epoch: 2  | time in 1 minutes, 21 seconds
	Loss: 0.8207(train)	|	Acc: 66.64%(train)
	Loss: 0.7223(valid)	|	Acc: 75.11%(valid)
Epoch: 3  | time in 1 minutes, 24 seconds
	Loss: 0.7772(train)	|	Acc: 68.95%(train)
	Loss: 0.7072(valid)	|	Acc: 76.05%(valid)
Epoch: 4  | time in 1 minutes, 24 seconds
	Loss: 0.7483(train)	|	Acc: 70.00%(train)
	Loss: 0.6821(valid)	|	Acc: 77.07%(valid)
Epoch: 5  | time in 1 minutes, 22 seconds
	Loss: 0.7222(train)	|	Acc: 71.54%(train)
	Loss: 0.7072(valid)	|	Acc: 74.87%(valid)
Epoch: 6  | time in 1 minutes, 26 seconds
	Loss: 0.6992(train)	|	Acc: 72.37%(train)
	Loss: 0.6687(valid)	|	Acc: 77.43%(valid)
Epoch: 7  | time in 1 minutes, 23 seconds
	Loss: 0.6664(train)	|	Acc: 74.88%(train)
	Loss: 0.6474(valid)	|	Acc: 77.86%(valid)
Epoch: 8  | time in 1 minutes, 22 seconds
	Loss: 0.6397(train)	|	Acc: 76.43%(train)
	Loss: 0.6401(valid)	|	Acc: 78.34%

In [119]:
testing_data=data.TabularDataset(path = 'AG_news/test.csv',format = 'csv',fields = fields,skip_header = True)

In [120]:
testing_iterator = data.BucketIterator(testing_data, batch_size=batch_size,
                                                           sort_key=lambda x: len(x.text),
                                                           sort_within_batch=True,
                                                           device=device)

In [121]:
model.load_state_dict(torch.load(path))

def predict(model, iterator):
    testing_accuracy = 0
    model.eval()
    
    for batch in iterator:
        text, text_lengths = batch.text
        # text = TEXT.preprocess(text)
        label = batch.label
        target = torch.autograd.Variable(label).long()
        with torch.no_grad():
            output = model(text, text_lengths).squeeze()
            num_corrects = (torch.max(output, 1)[1].view(target.size()).data == target.data).float().sum()
            acc = num_corrects / len(batch)
            testing_accuracy += acc.item()
    
    return testing_accuracy / len(iterator)

In [122]:
test_acc = predict(model, testing_iterator)
print(f"Accuracy {test_acc * 100:.2f}")

Accuracy 79.84
