# Text classification with a CNN in PyTorch

## Data

In [18]:
# Tab-separated files holding the train / development / test splits.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    "data/text_classification/20newsgroups_train.tsv",
    "data/text_classification/20newsgroups_dev.tsv",
    "data/text_classification/20newsgroups_test.tsv",
)

In [19]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups training split; only its list of class names is
# used below.
train = fetch_20newsgroups(subset="train")

# Map every newsgroup name to its position in `train.target_names`.
label2idx = dict(zip(train.target_names, range(len(train.target_names))))

In [21]:
import csv
import sys
from torchtext.data import TabularDataset, Field, BucketIterator

# Lift the csv module's per-field size cap so that very long documents still
# fit into a single TSV field.
csv.field_size_limit(sys.maxsize)

# `text` holds the tokenised document (spaCy tokenizer); `label` is a single
# non-sequential value that is converted to its integer index via label2idx.
text = Field(sequential=True, tokenize="spacy")
label = Field(sequential=False, use_vocab=False, preprocessing=lambda x: label2idx[x])

# Every split has the same layout: the label column first, then the text.
train_data, dev_data, test_data = (
    TabularDataset(path=split_path, format='tsv', fields=[('label', label), ('text', text)])
    for split_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)

In [22]:
# Upper bound passed to build_vocab as `max_size`.
VOCAB_SIZE = 30_000

# The vocabulary is built from the training split only.
text.build_vocab(train_data, max_size=VOCAB_SIZE)

In [23]:
# Number of examples per mini-batch, shared by all three iterators.
BATCH_SIZE = 32

# Only the training iterator asks for shuffling explicitly; the other two
# rely on the iterator defaults.
train_iter = BucketIterator(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
dev_iter = BucketIterator(
    dataset=dev_data,
    batch_size=BATCH_SIZE,
)
test_iter = BucketIterator(
    dataset=test_data,
    batch_size=BATCH_SIZE,
)

## Model

In [25]:
import torch.nn as nn
import torch.nn.functional as F


class CNNClassifier(nn.Module):
    """Convolutional text classifier.

    Pipeline: token embeddings -> one convolution per filter size ->
    ReLU -> max-over-time pooling -> dropout -> linear output layer.

    Args:
        embedding_dim: size of each token embedding.
        filter_sizes: sequence of kernel heights, i.e. how many consecutive
            tokens each convolution looks at.
        num_filters: number of output channels per filter size.
        vocab_size: number of rows in the embedding table.
        output_size: number of output classes (logits).
    """

    def __init__(self, embedding_dim, filter_sizes, num_filters, vocab_size, output_size):
        super(CNNClassifier, self).__init__()

        # 1. Embedding Layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # 2. Convolutional Layers
        # Each kernel covers `fs` consecutive tokens over the full embedding
        # width, so the embedded text is treated as a one-channel 2-D map and
        # needs Conv2d (Conv1d rejects the 4-D input built in `forward`).
        self.cnn = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes]
        )
        # Shortest sequence length every convolution can process.
        self.min_length = max(filter_sizes)

        # 3. Dense Layer
        self.hidden2out = nn.Linear(num_filters * len(filter_sizes), output_size)

        # Optional dropout layer
        self.dropout_layer = nn.Dropout(p=0.4)

    def forward(self, batch_text):
        """Return class logits of shape (batch, output_size).

        Args:
            batch_text: integer token-id tensor laid out as (length, batch);
                the first two axes are swapped below to get batch-first data.
        """
        embeddings = self.embeddings(batch_text)

        embeddings = embeddings.transpose(0, 1)  # (batch, length, embed_dim)
        embeddings = embeddings.unsqueeze(1)     # (batch, channels, length, embed_dim)

        # Zero-pad along the length axis when the batch is shorter than the
        # largest filter; otherwise that convolution would raise.
        deficit = self.min_length - embeddings.size(2)
        if deficit > 0:
            embeddings = F.pad(embeddings, (0, 0, 0, deficit))

        conv_out = [conv(embeddings) for conv in self.cnn]  # (batch, num_filters, output_length, 1)
        conv_out = [F.relu(t).squeeze(3) for t in conv_out]
        # Max-over-time pooling: keep the strongest activation per filter.
        conv_out = [F.max_pool1d(t, t.size(2)).squeeze(2) for t in conv_out]
        conv_out = torch.cat(conv_out, 1)  # (batch, num_filters * len(filter_sizes))

        conv_out = self.dropout_layer(conv_out)
        final_output = self.hidden2out(conv_out)
        return final_output

## Training

In [26]:
import torch
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import precision_recall_fscore_support

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

def train(model, train_iter, dev_iter, batch_size, num_batches):
    """Train `model` with Adam and cross-entropy loss, with early stopping.

    After every epoch the model is evaluated on `dev_iter`. Whenever the
    development loss is the lowest seen so far, the model's state dict is
    saved to ``model_state_{epoch}_{dev_loss}_{macro_f1}`` in the current
    working directory. Training stops early once the development loss is
    higher than each of the previous `patience` (3) development losses, or
    after 20 epochs.

    Args:
        model: the network to train; called as ``model(batch.text)``.
        train_iter: iterable of training batches with `.text` and `.label`.
        dev_iter: iterable of development batches with the same attributes.
        batch_size: unused; kept so existing callers keep working.
        num_batches: number of batches per epoch, used only as the progress
            bar total.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    max_epochs = 20
    loss_history = []  # development loss of every completed epoch
    patience = 3
    for epoch in range(max_epochs):

        # ---- training pass ----
        model.train()  # enable dropout
        total_loss = 0
        predictions, correct = [], []
        for batch in tqdm(train_iter, total=num_batches):
            optimizer.zero_grad()

            pred = model(batch.text.to(device))
            loss = criterion(pred, batch.label.to(device))
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            _, pred_indices = torch.max(pred, 1)
            predictions += list(pred_indices.cpu().numpy())
            correct += list(batch.label.cpu().numpy())

        print("=== Epoch", epoch, "===")
        print("Total training loss:", total_loss)
        print("Training performance:", precision_recall_fscore_support(correct, predictions))

        # ---- development pass ----
        model.eval()  # disable dropout so the evaluation is deterministic
        total_loss = 0
        predictions, correct = [], []
        with torch.no_grad():  # no gradients needed for evaluation
            for batch in dev_iter:

                pred = model(batch.text.to(device))
                loss = criterion(pred, batch.label.to(device))
                total_loss += loss.item()

                _, pred_indices = torch.max(pred, 1)
                predictions += list(pred_indices.cpu().numpy())
                correct += list(batch.label.cpu().numpy())

        print("Total development loss:", total_loss)
        dev_stats = precision_recall_fscore_support(correct, predictions)
        print("Development performance:", dev_stats)

        # Checkpoint only when this epoch is the best so far (the first epoch
        # always is).
        if not loss_history or total_loss < min(loss_history):
            # dev_stats[2] holds one F-score per class; average them so the
            # file name contains a single number.
            fscore = float(dev_stats[2].mean())
            path = f"model_state_{epoch}_{round(total_loss,2)}_{round(fscore,2)}"
            torch.save(model.state_dict(), path)

        # Early stopping: give up once the development loss is worse than
        # every one of the last `patience` epochs.
        if len(loss_history) >= patience and total_loss > max(loss_history[-patience:]):
            break

        loss_history.append(total_loss)

In [27]:
# Model hyperparameters.
EMBEDDING_DIM = 300
NUM_FILTERS = 128
FILTER_SIZES = [3]
NUM_CLASSES = len(label2idx)

# Ceiling division: a final, partially filled batch is still a batch, so
# flooring would under-report the progress bar total by one.
num_batches = (len(train_data) + BATCH_SIZE - 1) // BATCH_SIZE

# VOCAB_SIZE+2: presumably leaves room for the special <unk> and <pad> tokens
# that are added on top of `max_size` -- TODO confirm against the vocabulary.
classifier = CNNClassifier(EMBEDDING_DIM, FILTER_SIZES, NUM_FILTERS, VOCAB_SIZE+2, NUM_CLASSES)

train(classifier.to(device), train_iter, dev_iter, BATCH_SIZE, num_batches)


HBox(children=(IntProgress(value=0, max=353), HTML(value='')))

KeyboardInterrupt: 

## Testing

In [29]:
from sklearn.metrics import classification_report

def test(model, state_path, test_iter, batch_size, num_batches):
    """Load saved weights into `model` and print a classification report.

    Args:
        model: network whose architecture matches the saved state dict.
        state_path: path of a state dict written with ``torch.save``.
        test_iter: iterable of batches with `.text` and `.label`.
        batch_size: unused; kept so existing callers keep working.
        num_batches: unused; kept so existing callers keep working.
    """
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa).
    model.load_state_dict(torch.load(state_path, map_location=device))
    model.eval()  # disable dropout for inference

    predictions, correct = [], []
    with torch.no_grad():  # no gradients needed for inference
        for batch in test_iter:

            pred = model(batch.text.to(device))
            _, pred_indices = torch.max(pred, 1)

            predictions += list(pred_indices.cpu().numpy())
            correct += list(batch.label.cpu().numpy())

    print(classification_report(correct, predictions))

In [None]:
# Path of the checkpoint to evaluate; must be set to one of the
# "model_state_..." files written during training before running this cell.
STATE_PATH = ""
# Ceiling division so a final, partially filled batch is counted as well.
num_batches = (len(test_data) + BATCH_SIZE - 1) // BATCH_SIZE

classifier = CNNClassifier(EMBEDDING_DIM, FILTER_SIZES, NUM_FILTERS, VOCAB_SIZE+2, NUM_CLASSES)

# Evaluate with test(): train() expects a development iterator as its
# third argument, not a checkpoint path and a test iterator.
test(classifier.to(device), STATE_PATH, test_iter, BATCH_SIZE, num_batches)