# 8. Convolutional Neural Networks

I recommend you take a look at these material first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture13-CNNs.pdf
* http://www.aclweb.org/anthology/D14-1181
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* http://cogcomp.org/Data/QA/QC/

In [1]:
import os
import mindspore
from mindspore import nn, Tensor, ops, Parameter
import random
import numpy as np
import re
from mindnlp.modules import Accumulator
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

  from tqdm.autonotebook import tqdm


In [2]:
gpu = '0'
# 设置使用哪些显卡进行训练
os.environ["CUDA_VISIBLE_DEVICES"] = gpu

In [3]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [4]:
def pad_to_batch(batch):
    x, y = zip(*batch)
    max_x = max([s.shape[1] for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].shape[1] < max_x:
            x_p.append(ops.cat([x[i], Parameter(Tensor([word2index['<PAD>']] * (max_x - x[i].shape[1]))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return ops.cat(x_p), ops.cat(y).view(-1)

In [5]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w]
                    if word2index.get(w) is not None
                    else word2index["<UNK>"], seq))
    sequence = Tensor(idxs, dtype=mindspore.int64)
    return sequence

## Data load & Preprocessing

### TREC question dataset(http://cogcomp.org/Data/QA/QC/)

Task involves
classifying a question into 6 question
types (whether the question is about person,
location, numeric information, etc.)

In [6]:
data = open('../dataset/train_5500.label.txt', 'r', encoding='latin-1').readlines()

In [7]:
data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]

In [8]:
X, y = list(zip(*data))
X = list(X)

### Num masking 

It reduces the search space. ex. my birthday is 12.22 ==> my birthday is ##.##

In [9]:
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()

### Build Vocab 

In [10]:
vocab = list(set(flatten(X)))

In [11]:
len(vocab)

9117

In [12]:
len(set(y))  # num of class

6

In [13]:
word2index = {'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

index2word = {v: k for k, v in word2index.items()}

target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v: k for k, v in target2index.items()}

In [14]:
X_p, y_p = [], []
for pair in zip(X, y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Parameter(Tensor([target2index[pair[1]]], dtype=mindspore.int64)).view(1, -1))

data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]

### Load Pretrained word vector

you can download pretrained word vector from here https://github.com/mmihaltz/word2vec-GoogleNews-vectors 

In [15]:
import gensim

In [17]:
model = gensim.models.KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)

In [18]:
len(model.index2word)

3000000

In [19]:
pretrained = []

for key in word2index.keys():
    try:
        pretrained.append(model[word2index[key]])
    except Exception:
        pretrained.append(np.random.randn(300))

pretrained_vectors = np.vstack(pretrained)

## Modeling 

<img src="../images/08.cnn-for-text-architecture.png">
<center>borrowed image from http://www.aclweb.org/anthology/D14-1181</center>

In [20]:
class CNNClassifier(nn.Cell):

    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.CellList([nn.Conv2d(1, kernel_dim, (K, embedding_dim), has_bias=True, pad_mode='valid') for K in kernel_sizes])

        # kernal_size = (K,D)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Dense(len(kernel_sizes) * kernel_dim, output_size)

    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.embedding_table = Parameter(Tensor(pretrained_word_vectors, dtype=mindspore.float32))
        if is_static:
            self.embedding.embedding_table.requires_grad = False

    def construct(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1)  # (B,1,T,D)
        inputs = [ops.relu(conv(inputs)).squeeze(3) for conv in self.convs]  # [(N,Co,W), ...]*len(Ks)
        inputs = [nn.MaxPool1d(i.shape[2])(i).squeeze(2) for i in inputs]  # [(N,Co), ...]*len(Ks)

        concated = ops.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated)  # (N,len(Ks)*Co)
        out = self.fc(concated)
        return ops.log_softmax(out, 1)

## Train 

It takes for a while if you use just cpu.

In [21]:
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [3]
KERNEL_DIM = 100
LR = 0.001

In [22]:
model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
model.init_weights(pretrained_vectors)  # initialize embedding matrix using pretrained vectors

loss_function = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=LR)

In [23]:
accumulate_step = 2
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(inputs, targets, is_training):
    """Forward function"""
    preds = model(inputs, is_training)
    targets = targets.astype(mindspore.int32)
    loss = loss_function(preds, targets)
    return loss / accumulate_step


# Get gradient function
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())


# Define function of one-step training
def train_step(inputs, targets, is_training):
    """Training steps"""
    loss, grads = grad_fn(inputs, targets, is_training)
    loss = ops.depend(loss, accumulator(grads))
    return loss

In [26]:
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = pad_to_batch(batch)

        loss = train_step(inputs, targets, True)
        losses.append(loss.asnumpy().item(0) * accumulate_step)

        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))
            losses = []

[0/5] mean_loss : 1.79
[1/5] mean_loss : 0.33
[2/5] mean_loss : 0.08
[3/5] mean_loss : 0.04
[4/5] mean_loss : 0.02


## Test 

In [27]:
accuracy = 0

In [28]:
for test in test_data:
    # KERNEL_SIZES = [min(4, test[0].shape[1]), min(5, test[0].shape[1])]
    # model = CNNClassifier(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)
    value = model(test[0])
    pred = ops.max(value, 1)[1]
    pred = pred.asnumpy().item(0)
    target = test[1]
    target = target.asnumpy().item(0)
    if pred == target:
        accuracy += 1

print(accuracy / len(test_data) * 100)

98.53479853479854


## Further topics 

* <a href="https://arxiv.org/pdf/1508.06615.pdf">Character-Aware Neural Language Models</a>
* <a href="https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf">Character level CNN for text classification</a>

## Suggested Reading

* https://blog.statsbot.co/text-classifier-algorithms-in-machine-learning-acc115293278
* <a href="https://arxiv.org/pdf/1607.01759">Bag of Tricks for Efficient Text Classification</a>
* <a href="https://arxiv.org/pdf/1708.02657">Which Encoding is the Best for Text Classification in Chinese, English, Japanese and Korean?</a>