# 4. Word Window Classification and Neural Networks 

I recommend that you take a look at these materials first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf
* https://en.wikipedia.org/wiki/Named-entity_recognition

In [1]:
import os
import mindspore
from mindspore import nn, Tensor, ops
import nltk
import random
import numpy as np
def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``, in order."""
    return [item for sublist in l for item in sublist]
from sklearn.metrics import classification_report
from mindnlp.modules import Accumulator
random.seed(1024)

  from tqdm.autonotebook import tqdm


You also need the latest version of <a href="http://sklearn-crfsuite.readthedocs.io/en/latest/index.html">sklearn_crfsuite</a> to print the confusion matrix.

In [2]:
# Report the installed version of each main library, one per line.
for library in (mindspore, nltk):
    print(library.__version__)

2.0.0.20230623
3.7


In [3]:
# Choose which GPU device id(s) are exposed to the training process.
gpu = '0'
os.environ.update(CUDA_VISIBLE_DEVICES=gpu)

In [4]:
def getBatch(batch_size, train_data):
    """Yield shuffled mini-batches that together cover all of ``train_data``.

    ``train_data`` is shuffled in place, then cut into consecutive slices of
    ``batch_size`` items. The last slice may be shorter.

    Args:
        batch_size: Number of items per batch; must be positive.
        train_data: Mutable sequence of samples. It is shuffled in place.

    Yields:
        Slices of ``train_data``. Nothing is yielded when ``train_data`` is
        empty, so consumers never receive an empty batch.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would never advance through the data.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [5]:
def prepare_sequence(seq, word2index):
    """Convert a sequence of words into an int64 tensor of vocabulary indices.

    Words that have no entry in ``word2index`` are mapped to the index
    of the ``"<UNK>"`` token.
    """
    indices = []
    for token in seq:
        index = word2index.get(token)
        if index is None:
            # Out-of-vocabulary word: fall back to the unknown token.
            index = word2index["<UNK>"]
        indices.append(index)
    return Tensor(indices, dtype=mindspore.int64)


def prepare_word(word, word2index):
    """Wrap the vocabulary index of ``word`` in a one-element int64 tensor.

    A word without an entry in ``word2index`` uses the ``"<UNK>"`` index.
    """
    index = word2index.get(word)
    if index is None:
        index = word2index["<UNK>"]
    return Tensor([index], dtype=mindspore.int64)


def prepare_tag(tag, tag2index):
    """Wrap the index of ``tag`` in a one-element int64 tensor.

    Unlike the word helpers there is no fallback: an unknown tag raises
    ``KeyError`` from the dictionary lookup.
    """
    tag_index = tag2index[tag]
    return Tensor([tag_index], dtype=mindspore.int64)

## Data load and Preprocessing 

CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition <br>
https://www.clips.uantwerpen.be/conll2002/ner/

In [6]:
# Sentences of the CoNLL-2002 NER corpus, read in IOB form through NLTK's corpus reader.
corpus = nltk.corpus.conll2002.iob_sents()

In [7]:
# Keep only the words and the NER tags of each sentence; the middle column is dropped.
data = [
    [words, ner_tags]
    for words, _, ner_tags in (zip(*sentence) for sentence in corpus)
]

In [8]:
# Show the number of sentences followed by the first [words, tags] pair.
print(len(data), data[0], sep="\n")

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


### Build Vocab

In [9]:
# Split the corpus into parallel tuples of word sequences and tag sequences.
sents, tags = list(zip(*data))
# Sort the unique words and tags. Iteration order of a set of strings changes
# between interpreter runs (hash randomization), so unsorted lists would give
# every run different word/tag indices despite the fixed random seed.
vocab = sorted(set(flatten(sents)))
tagset = sorted(set(flatten(tags)))

In [10]:
# Indices 0 and 1 are reserved; the dummy token pads the start or end of a sentence.
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for word in vocab:
    # setdefault keeps an existing entry (e.g. a reserved token) untouched.
    word2index.setdefault(word, len(word2index))
index2word = dict(zip(word2index.values(), word2index.keys()))

# Tags get consecutive indices in the order they appear in tagset.
tag2index = {}
for label in tagset:
    tag2index.setdefault(label, len(tag2index))
index2tag = dict(zip(tag2index.values(), tag2index.keys()))

### Prepare data

<center>Example : Classify 'Paris' in the context of this sentence with window length 2</center>

<img src="../images/04.window-data.png">

<center>borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf</center>

In [11]:
WINDOW_SIZE: int = 2  # number of context tokens taken on each side of the centre word
windows: list = []  # collects [window_tokens, centre_word_tag] pairs

In [12]:
# Pad both ends of every sentence so that each word, including the first and
# last, sits at the centre of a full-width window, then pair each window with
# the tag of its centre word.
window_width = WINDOW_SIZE * 2 + 1
for tokens, token_tags in data:
    padding = ['<DUMMY>'] * WINDOW_SIZE
    grams = nltk.ngrams(padding + list(tokens) + padding, window_width)
    windows.extend([
        [list(gram), token_tags[position]]
        for position, gram in enumerate(grams)
    ])

In [13]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [14]:
len(windows)

678377

In [15]:
random.shuffle(windows)

# The first 90% of the shuffled windows train the model; the rest is held out for testing.
split_at = int(len(windows) * 0.9)
train_data, test_data = windows[:split_at], windows[split_at:]

## Modeling 

<img src="../images/04.window-classifier-architecture.png">
<center>borrowed image from http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture4.pdf</center>

In [22]:
class WindowClassifier(nn.Cell):
    """Feed-forward classifier for the centre word of a fixed-size word window.

    The embeddings of all window tokens are concatenated, passed through two
    ReLU hidden layers and projected to one log-softmax score per output class.
    """

    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_size: Width of each token embedding.
            window_size: Context tokens on each side of the centre word, so
                a window holds ``window_size * 2 + 1`` tokens.
            hidden_size: Width of both hidden layers.
            output_size: Number of output classes.
        """
        super().__init__()

        window_width = window_size * 2 + 1
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_dense1 = nn.Dense(embedding_size * window_width, hidden_size)
        self.h_dense2 = nn.Dense(hidden_size, hidden_size)
        self.o_layer = nn.Dense(hidden_size, output_size)
        self.relu = nn.ReLU()
        # Despite the attribute name this layer is a log-softmax.
        self.softmax = nn.LogSoftmax(axis=1)
        self.dropout = nn.Dropout(p=0.3)

    def construct(self, inputs, is_training=False):
        """Return the log-softmax scores for a batch of windows.

        Args:
            inputs: Token-index tensor; BxW according to the shape comments below.
            is_training: When true, the dropout layer is invoked after each
                hidden layer.
        """
        token_vectors = self.embed(inputs)  # BxWxD
        flat_width = token_vectors.shape[1] * token_vectors.shape[2]
        hidden = token_vectors.view(-1, flat_width)  # Bx(W*D)

        hidden = self.relu(self.h_dense1(hidden))
        if is_training:
            hidden = self.dropout(hidden)

        hidden = self.relu(self.h_dense2(hidden))
        if is_training:
            hidden = self.dropout(hidden)

        return self.softmax(self.o_layer(hidden))

In [17]:
BATCH_SIZE: int = 128  # windows per mini-batch
EMBEDDING_SIZE: int = 50  # width of one token embedding; x (WINDOW_SIZE*2+1) = 250 inputs to the first dense layer
HIDDEN_SIZE: int = 300  # width of each hidden dense layer
EPOCH: int = 3  # number of passes over the training data
LEARNING_RATE: float = 0.001  # learning rate passed to the Adam optimizer

## Training 

Training takes a while if you use only the CPU.

In [23]:
# Build the classifier, the loss and the optimizer from the hyperparameters.
model = WindowClassifier(
    vocab_size=len(word2index),
    embedding_size=EMBEDDING_SIZE,
    window_size=WINDOW_SIZE,
    hidden_size=HIDDEN_SIZE,
    output_size=len(tag2index),
)
loss_function = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=LEARNING_RATE)

In [26]:
# forward_fn divides every loss by this value; the training loop multiplies it back for logging.
accumulate_step = 2
# NOTE(review): presumably applies the optimizer once every `accumulate_step` gradient calls; confirm against mindnlp.
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(x, y, is_training):
    """Return the loss of one batch, divided by ``accumulate_step``.

    Args:
        x: Iterable of word windows (each a sequence of words).
        y: Iterable of tags, one per window.
        is_training: Forwarded to the model's ``is_training`` argument.
    """
    window_rows = [prepare_sequence(tokens, word2index).view(1, -1) for tokens in x]
    tag_ids = [prepare_tag(label, tag2index) for label in y]

    inputs = ops.cat(window_rows)
    # The targets are cast to int32 before they reach the loss function.
    targets = ops.cat(tag_ids).astype(mindspore.int32)

    scores = model(inputs, is_training)
    return loss_function(scores, targets) / accumulate_step


# Build a function that returns both the loss of forward_fn and its
# gradients with respect to the model's trainable parameters only.
grad_fn = mindspore.value_and_grad(
    forward_fn,
    grad_position=None,
    weights=model.trainable_params(),
)


# One training step: compute loss and gradients, then hand the gradients to the accumulator.
def train_step(x, y, is_training):
    """Run one training step on a batch and return its (scaled) loss."""
    step_loss, gradients = grad_fn(x, y, is_training)
    update = accumulator(gradients)
    # ops.depend ties the returned loss to the accumulator call.
    return ops.depend(step_loss, update)

In [27]:
# Put the network in training mode. MindSpore cells start in inference mode,
# in which nn.Dropout returns its input unchanged, so without this call the
# dropout requested through is_training=True would never take effect.
model.set_train(True)

for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        # Each batch item is a [window, tag] pair; split into parallel tuples.
        x, y = list(zip(*batch))

        loss = train_step(x, y, is_training=True)
        # forward_fn returns loss / accumulate_step; undo the scaling for logging.
        losses.append(loss.asnumpy().item() * accumulate_step)

        # Report the mean loss since the previous report, then start a new window.
        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))
            losses = []

[0/3] mean_loss : 2.20
[0/3] mean_loss : 0.42
[0/3] mean_loss : 0.22
[0/3] mean_loss : 0.17
[0/3] mean_loss : 0.13
[1/3] mean_loss : 0.01
[1/3] mean_loss : 0.05
[1/3] mean_loss : 0.04
[1/3] mean_loss : 0.04
[1/3] mean_loss : 0.04
[2/3] mean_loss : 0.00
[2/3] mean_loss : 0.02
[2/3] mean_loss : 0.02
[2/3] mean_loss : 0.02
[2/3] mean_loss : 0.02


## Test 

In [28]:
for_f1_score = []  # [predicted_tag, gold_tag] pairs appended by the evaluation loop

In [34]:
# Evaluate in inference mode, and start from an empty result list so that
# re-running this cell does not append a second copy of every prediction.
model.set_train(False)
for_f1_score = []
accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)

    # Element 1 of the ops.max result is used as the index of the best class.
    i = ops.max(model(input_), axis=1)
    i = i[1]
    pred = index2tag[i.asnumpy().item(0)]
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

# Accuracy over all test windows, as a percentage.
print(accuracy / len(test_data) * 100)

98.30183672867715


This score is high because most of the labels are the 'O' tag, so we also need to measure the F1 score.

### Print Confusion matrix 

In [35]:
# Split the [predicted, gold] pairs into two parallel tuples.
y_pred, y_test = zip(*for_f1_score)

In [36]:
def _label_sort_key(name):
    """Order labels by everything after the first character, then by the first character."""
    return name[1:], name[0]


# Every gold label except 'O', e.g. B-LOC before I-LOC before B-MISC.
sorted_labels = sorted(set(y_test) - {'O'}, key=_label_sort_key)

In [37]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [38]:
# Wrap every label in its own one-element list. The original note says this is
# because the sklearn_crfsuite.metrics functions flatten their inputs.
# NOTE(review): this file imports classification_report from sklearn.metrics;
# confirm the wrapping is still wanted.
y_pred = [[label] for label in y_pred]
y_test = [[label] for label in y_test]

In [42]:
# Per-label precision, recall and F1, restricted to sorted_labels (which leaves out the 'O' tag).
report = classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.911     0.824     0.865      1136
       I-LOC      0.890     0.856     0.873       320
      B-MISC      0.779     0.810     0.794       801
      I-MISC      0.828     0.841     0.834       646
       B-ORG      0.884     0.897     0.891      1343
       I-ORG      0.874     0.927     0.899       917
       B-PER      0.953     0.926     0.939      1304
       I-PER      0.964     0.972     0.968       961

   micro avg      0.892     0.888     0.890      7428
   macro avg      0.885     0.882     0.883      7428
weighted avg      0.893     0.888     0.890      7428



### TODO

* use max-margin objective function http://pytorch.org/docs/master/nn.html#multilabelmarginloss