# Sequence Labelling using Word2Vec LSTM

Pada notebook ini, kami mengimplementasikan sequence labelling menggunakan LSTM. Langkah pertama yang perlu dilakukan adalah menginstal library dan mengunduh data yang diperlukan pada eksperimen ini. Kami menggunakan data lirik lagu dari eksperimen sebelumnya dan mengambil sebanyak 1080 kalimat.

In [None]:
# Install Stanza (imported below as `st`).
!pip install stanza
# Clone the repository; the sentence CSV is later read from ./nlpstei/models/.
!git clone https://github.com/otakbeku/nlpstei.git
# Install or upgrade the Weights & Biases client used for run tracking.
!pip install wandb --upgrade

Kami menggunakan wandb sebagai alat bantu untuk memantau proses berjalannya setiap iterasi

In [None]:
import wandb

# Authenticate with Weights & Biases, then start a run in the
# "sequence labelling" project.
wandb.login()
wandb.init(project="sequence labelling")

## Import library

In [None]:
import nltk
import pandas as pd
import heapq
import pprint

from nltk.tokenize import wordpunct_tokenize, blankline_tokenize, line_tokenize, word_tokenize
from itertools import combinations
from nltk.corpus import stopwords
from time import time 
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing
from collections import namedtuple

# # tensorflow
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Pytorch
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

# stanza
import stanza as st

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from tqdm.notebook import tqdm

In [None]:
import os
# Request synchronous CUDA kernel launches so that a CUDA error is reported at
# the call that caused it (a debugging aid; it slows GPU execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

## Mengunduh vocabulary untuk stanza

Kami menggunakan stanza untuk memberikan label pada masing-masing token.

In [None]:
# Download Stanza's English resources; the POS-tagging pipeline created later
# is built with lang='en'.
st.download('en')

## Glove sebagai Embedding matrix

Kami menggunakan pretrained model GloVe untuk embedding matrix. Pada baris-baris terakhir sel berikut, kami menambahkan baris baru (vektor nol) pada matrix yang digunakan sebagai padding dan unknown word, yaitu kata yang tidak terdapat pada vocab yang diberikan.

In [None]:
# Pretrained GloVe vectors ('glove-wiki-gigaword-50'), fetched through gensim's
# downloader and loaded from word2vec text format.
import gensim.downloader as api
corpus = api.load('glove-wiki-gigaword-50', return_path=True)
pretrainedwvmodel = KeyedVectors.load_word2vec_format(corpus)
# Read the vectors straight from the KeyedVectors object: the `.wv` alias on
# KeyedVectors is deprecated in gensim 3.x and no longer exists in gensim 4.
embedding_matrix = pretrainedwvmodel.vectors
# Append three all-zero rows in a single copy (the original made three full
# copies of the matrix). With V pretrained rows the new rows are:
#   V     - spare row, not referenced anywhere in this notebook
#   V + 1 - padding row (the model is configured with padding_idx=400001)
#   V + 2 - unknown-word row (DataMapper1 emits index 400002)
# NOTE(review): those hard-coded indices assume V == 400000 -- confirm against
# the downloaded model.
embedding_matrix = np.vstack(
    [embedding_matrix, np.zeros((3, embedding_matrix.shape[1]))])

## Tag as Class

Ada 38 tag yang kami gunakan pada eksperimen ini. Tag ini didapat dari Penn treebank POS tagger. Untuk tag yang tidak diketahui atau tidak terdapat pada vocab maka akan digantikan dengan `<UNK>`. Sedangkan untuk padding menggunakan `<PAD>`

In [None]:
# Mapping from POS tag to integer class id.
#   0     -> '<PAD>' (label of padded positions)
#   1..38 -> the tag set used in this experiment
#   39    -> '<UNK>' (token or tag outside the known vocabulary)
# '<UNK>' previously shared id 0 with '<PAD>', which made unknown tokens
# indistinguishable from padding; it now has its own id. All ids stay below
# the 40 output classes the model is configured with.
TAG2CLASS = {
    '<PAD>': 0,
    'CC': 1,
    'CD': 2,
    'DT': 3,
    'EX': 4,
    'FW': 5,
    'IN': 6,
    'JJ': 7,
    'JJR': 8,
    'JJS': 9,
    'LS': 10,
    'MD': 11,
    'NN': 12,
    'NNS': 13,
    'NNP': 14,
    'NNPS': 15,
    'PDT': 16,
    'POS': 17,
    'PRP': 18,
    'PRP$': 19,
    'RB': 20,
    'RBR': 21,
    'RBS': 22,
    'RP': 23,
    'SYM': 24,
    'TO': 25,
    'UH': 26,
    'VB': 27,
    'VBD': 28,
    'VBG': 29,
    'VBN': 30,
    'VBP': 31,
    'VBZ': 32,
    'WDT': 33,
    'WP': 34,
    'WP$': 35,
    'WRB': 36,
    '-RRB-': 37,
    '-LRB-': 38,
    '<UNK>': 39,
}
# Stanza English pipeline used to tag every sentence; configured to run on CPU.
pos_tagger = st.Pipeline(lang='en', use_gpu=False)

## Mapping data dengan label

Kami membuat Dataset sendiri yang khusus dipakai pada eksperimen ini. Secara sederhana, kelas DataMapper1 ini akan menghasilkan data dengan label berupa sequence. Data dalam bentuk kalimat yang sudah ditokenisasi dan label berupa POS tagger dari masing-masing token.

In [None]:
class DataMapper1(Dataset):
    """Dataset that turns raw sentences into (word-index, tag-id) sequences.

    Each item is tagged on the fly with the module-level ``pos_tagger``; only
    the first sentence Stanza detects in the text is used. Both returned
    arrays have length ``sequence_len`` (longer sentences are truncated).

    Args:
        sentence_lyrics: indexable collection of sentence strings.
        wvmodel: gensim word-vector model used to look up word indices.
        sequence_len: fixed length of every returned sequence.
        pad_idx: word index written into unused input positions. Defaults to
            400001, the ``padding_idx`` the model in this notebook is
            configured with. Previously padding used index 0, which is a real
            vocabulary word rather than the padding row.
        unk_idx: word index used for out-of-vocabulary tokens.
    """

    def __init__(self, sentence_lyrics, wvmodel, sequence_len,
                 pad_idx=400001, unk_idx=400002):
        self.sents = sentence_lyrics
        self.sequence_len = sequence_len
        self.model = wvmodel
        self.pad_idx = pad_idx
        self.unk_idx = unk_idx

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sents)

    def _word_index(self, token):
        """Return the vocabulary index of ``token`` or ``None`` if unknown."""
        # Accept both a full model (vectors under `.wv`) and bare KeyedVectors.
        vectors = getattr(self.model, 'wv', self.model)
        # gensim >= 4 exposes `key_to_index`; gensim 3.x exposes `vocab`.
        key_to_index = getattr(vectors, 'key_to_index', None)
        if key_to_index is not None:
            return key_to_index.get(token)
        entry = vectors.vocab.get(token)
        return None if entry is None else entry.index

    def __getitem__(self, idx):
        """Return ``(seq, yseq)`` int64 arrays for sentence ``idx``.

        ``seq`` holds word indices (``pad_idx`` in unused slots) and ``yseq``
        holds tag ids (``'<PAD>'`` in unused slots). An out-of-vocabulary word
        gets ``unk_idx`` with the ``'<UNK>'`` tag; a known word whose tag is
        missing from ``TAG2CLASS`` also gets ``'<UNK>'``.
        """
        doc = pos_tagger(self.sents[idx])
        unk_tag = TAG2CLASS.get('<UNK>')
        seq = np.full(self.sequence_len, self.pad_idx, dtype=np.int64)
        yseq = np.full(self.sequence_len, TAG2CLASS.get('<PAD>', 0),
                       dtype=np.int64)
        # An empty parse yields an all-padding item instead of an IndexError.
        words = doc.sentences[0].words if doc.sentences else []
        # zip with range() truncates to sequence_len.
        for pos, word in zip(range(self.sequence_len), words):
            word_idx = self._word_index(word.text)
            if word_idx is None:
                seq[pos] = self.unk_idx
                yseq[pos] = unk_tag
            else:
                seq[pos] = word_idx
                yseq[pos] = TAG2CLASS.get(word.xpos, unk_tag)
        return seq, yseq


Ini adalah proses seleksi data

In [None]:
data = pd.read_csv('./nlpstei/models/Sentences_15klyrics_mls_20.csv')
# Training set: the first 800 sentences.
train_data = data.sent[:800].to_numpy()
# Draw 80 sentences (with replacement) from the training sentences. Sampling
# from `train_data` rather than the flattened DataFrame guarantees that only
# the `sent` column is sampled, whatever other columns the CSV contains.
# NOTE(review): these 80 items overlap with the training set, and the draw is
# unseeded, so the validation set changes between runs.
val_random = np.random.choice(train_data, 80)
# Validation set: the 80 sampled sentences plus rows 1001..1080.
val_data = np.append(val_random, data.sent[1001:1081].to_numpy())
# Test set: rows 800..1000.
# NOTE(review): this is 201 rows, so 1081 rows are used in total while the
# introduction mentions 1080 -- confirm the intended boundaries.
test_data = data.sent[800:1001].to_numpy()

In [None]:
# Wrap each split in a DataMapper1 that shares the pretrained vectors and
# pads/truncates every sentence to 20 tokens.
training_set, val_set, test_set = (
    DataMapper1(split, pretrainedwvmodel, 20)
    for split in (train_data, val_data, test_data)
)

In [None]:
# Batched loaders for training and validation; the test loader keeps the
# DataLoader default batch size.
loader_training = DataLoader(training_set, batch_size=16)
# The validation loader previously wrapped `training_set`, so "validation"
# metrics were computed on the training data.
loader_val = DataLoader(val_set, batch_size=16)
loader_test = DataLoader(test_set)

## Model LSTM

Kami menggunakan model LSTM sederhana dengan hanya menggunakan 1 layer LSTM

In [None]:
class Simple_Sequence_LSTMver2(nn.Module):
    """Single-layer LSTM tagger on top of frozen pretrained embeddings.

    Args:
        args: object with the attributes ``hidden_dim``, ``lstm_layers``,
            ``embedding_matrix`` (2-D float tensor), ``padding_idx``,
            ``target_size`` and ``class_number``.

    ``forward`` takes a ``(batch, seq_len)`` tensor of word indices and
    returns raw, unnormalised scores of shape
    ``(batch, seq_len, class_number)``.
    """

    def __init__(self, args):
        super().__init__()
        # Hyperparameters
        self.hidden_dim = args.hidden_dim
        # Stored for reference only: the LSTM below always has one layer.
        self.LSTM_layers = args.lstm_layers
        # Kept on whatever device the caller supplied; moving the module
        # (e.g. `model.cuda()`) moves the embedding weights with it, so no
        # device is forced here.
        self.embedding_matrix = args.embedding_matrix
        self.target_size = args.target_size
        self.tag_class_size = args.class_number

        self.word_embeddings = nn.Embedding.from_pretrained(
            self.embedding_matrix, padding_idx=args.padding_idx, freeze=True)

        # The LSTM consumes word embeddings and emits hidden states of size
        # hidden_dim. batch_first=True matches the (batch, seq, feature)
        # layout of the embedded input; without it the recurrence ran across
        # the batch axis instead of along each sentence.
        self.lstm = nn.LSTM(self.embedding_matrix.shape[1], self.hidden_dim,
                            batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.hidden_dim, self.tag_class_size)

    def forward(self, sentence):
        """Return per-token tag logits for a batch of index sequences."""
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so squashing the scores with a sigmoid first distorted the loss.
        # The argmax, and therefore the predicted tag, is unaffected.
        return self.hidden2tag(lstm_out)

In [None]:
# Convert the embedding table to a float tensor and record GPU availability.
embedding_matrix = torch.FloatTensor(embedding_matrix)
train_on_gpu = torch.cuda.is_available()
# Hyper-parameters, bundled into an immutable namedtuple so the model can read
# them as attributes.
lstm_dict = dict(
    hidden_dim=embedding_matrix.shape[1],
    lstm_layers=3,
    padding_idx=400001,
    target_size=20,
    class_number=40,
    embedding_matrix=embedding_matrix,
)
lstm_args = namedtuple('lstm_args', tuple(lstm_dict))(**lstm_dict)

In [None]:
# Build the tagger from the hyper-parameter tuple and move it to the GPU
# (requires a CUDA device).
model = Simple_Sequence_LSTMver2(lstm_args).cuda()

In [None]:
# Show the module structure as the cell output.
model

In [None]:
def categorical_accuracy(preds, y, tag_pad_idx=0):
    """Return the fraction of non-padding targets that are predicted correctly.

    If 8 of 10 non-padding tokens are right this returns 0.8, NOT 8.

    Args:
        preds: ``(N, num_classes)`` tensor of scores, one row per token.
        y: ``(N,)`` tensor of target class ids.
        tag_pad_idx: target id that marks padding; such positions are ignored.

    Returns:
        A float tensor of shape ``(1,)`` on the device of ``preds``. It is
        0.0 when every target is padding (previously NaN from 0/0).
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per token
    keep = y != tag_pad_idx
    n_tokens = keep.sum()
    if n_tokens.item() == 0:
        return torch.zeros(1, device=preds.device)
    n_correct = (predicted[keep] == y[keep]).sum()
    # Works on the tensors' own device rather than forcing CUDA.
    return (n_correct.float() / n_tokens.float()).reshape(1)

In [None]:
# Optimise only the trainable parameters (the pretrained embeddings are
# frozen). The filtered collection was previously built but never handed to
# the optimizer.
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(parameters, lr=0.01, momentum=0.9, weight_decay=0.0001)
loss_function = nn.CrossEntropyLoss()

In [None]:
def validation_metrics(model, valid_dl, tag_pad_idx=0):
    """Evaluate ``model`` on ``valid_dl`` and return ``(loss, accuracy)``.

    Args:
        model: module mapping a ``(batch, seq_len)`` index tensor to
            ``(batch, seq_len, num_classes)`` scores.
        valid_dl: iterable of ``(x, y)`` tensor batches.
        tag_pad_idx: target id marking padding; excluded from the accuracy.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats. ``mean_loss`` is the
        cross-entropy averaged over sentences (each batch weighted by its
        size); ``accuracy`` is correct non-padding tokens divided by all
        non-padding tokens. Both are NaN when ``valid_dl`` yields no batches.

    The model is left in eval mode.
    """
    loss_function = nn.CrossEntropyLoss()
    model.eval()
    # Run on the model's own device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total = 0
    sum_loss = 0.0
    n_correct = 0
    n_tokens = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for x, y in valid_dl:
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            logits = y_hat.reshape(-1, y_hat.shape[-1])
            targets = y.reshape(-1)
            loss = loss_function(logits, targets)
            # Count tokens directly. Summing per-batch accuracy fractions and
            # dividing by the sentence count understated the accuracy by
            # roughly the batch size.
            keep = targets != tag_pad_idx
            n_correct += (logits.argmax(dim=1)[keep] == targets[keep]).sum().item()
            n_tokens += keep.sum().item()
            total += y.shape[0]
            sum_loss += loss.item() * y.shape[0]

    if total == 0:
        return float('nan'), float('nan')
    accuracy = n_correct / n_tokens if n_tokens else 0.0
    return sum_loss / total, accuracy

In [None]:
# Register the model and loss with Weights & Biases (log="all", every 10 steps).
wandb.watch(model, loss_function, log="all", log_freq=10)

## Proses Training


Proses training dilakukan sebanyak 10 epoch. Pada setiap epoch, jika validation loss lebih kecil atau sama dengan nilai terbaik sebelumnya, maka model tersebut akan disimpan.

In [None]:
# Train for a fixed number of epochs and checkpoint the weights whenever the
# validation loss is at least as good as the best seen so far.
epochs = 10
validation_loss_min = np.inf
# Batches follow the model's device instead of assuming CUDA.
device = next(model.parameters()).device
for i in tqdm(range(epochs)):
    model.train()
    sum_loss = 0.0
    total = 0
    for x, y in loader_training:
        # The loader already yields tensors: convert in place rather than
        # re-wrapping them with torch.tensor().
        x = x.to(device=device, dtype=torch.long)
        y = y.to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        y_pred = model(x)
        # Flatten to (batch * seq_len, classes) / (batch * seq_len,) for the loss.
        y_pred_2 = y_pred.reshape(-1, y_pred.shape[-1])
        y_2 = y.reshape(-1)
        loss = loss_function(y_pred_2, y_2)
        loss.backward()
        optimizer.step()
        # Weight by sentences, the same unit as `total`. Weighting by tokens
        # while dividing by sentences inflated the mean by the sequence length.
        sum_loss += loss.item() * y.shape[0]
        total += y.shape[0]
    # Per-sentence mean, directly comparable with the validation loss.
    train_loss = sum_loss / total if total else float('nan')
    val_loss, val_acc = validation_metrics(model, loader_val)
    metrics = {'training_loss': train_loss, 'validation_loss': val_loss}
    print(f'Epoch: {i}\tTraining loss: {train_loss}\tValidation loss: {val_loss}')
    if val_loss <= validation_loss_min:
        print('\t\tValidation loss-nya lebih kecil!')
        torch.save(model.state_dict(),"../working/model_seq_lyrics_best3.pth")
        validation_loss_min = val_loss
    if i % 5 == 1:
        print("train loss %.3f, val loss %.3f, val accuracy %.3f" % (train_loss, val_loss, val_acc))
        train_loss_epoch = train_loss
        val_loss_epoch = val_loss
        val_acc_epoch = val_acc
        metrics.update({'train_loss_mod': train_loss_epoch, 'val_loss_epoch': val_loss, 'val_accuracy': val_acc})
    # One log call per epoch keeps the wandb step aligned with the epoch.
    wandb.log(metrics)


In [None]:
# Cloud Storage
from google.cloud import storage
storage_client = storage.Client(project='239480140419')

In [None]:
# Upload the checkpoint written by the training loop to the 'song_lyrics'
# bucket under the same file name.
bucket = storage_client.bucket('song_lyrics')
blob = bucket.blob("model_seq_lyrics_best3.pth")
blob.upload_from_filename("../working/model_seq_lyrics_best3.pth")

In [None]:
# Evaluate the model on the validation loader.
# NOTE(review): `model` holds the weights from the last epoch here; the saved
# best checkpoint is never reloaded in the visible code -- confirm intent.
val_loss, val_acc = validation_metrics(model, loader_val)
print("val loss %.3f, val accuracy %.3f" % (val_loss, val_acc))

In [None]:
# Evaluate the model on the held-out test loader.
test_loss, test_acc = validation_metrics(model, loader_test)
print("test loss %.3f, test accuracy %.3f" % (test_loss, test_acc))