In [1]:
from os import getcwd, path, environ
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

BASE_PATH = getcwd()
sys.path.append(BASE_PATH)

environ['NUM_WORKERS'] = '0'
# environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
data = pd.read_csv('data/vn_sentiment/train.csv')

# replace new lines with space
data['comment'] = data['comment'].str.replace('\n', ' ')

data.head()

Unnamed: 0,id,comment,label
0,train_000000,Dung dc sp tot cam on shop Đóng gói sản phẩm ...,0
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...,0
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...,0
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,1
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,1


In [3]:
from sent_to_vec.masked_lm.bert_model import BertLMWrapper
from sent_to_vec.masked_lm.train import LanguageModelLearner
from sent_to_vec.masked_lm.vi_data import ViTextDataset

from common.modules import BertAdam
from common.callbacks import PrintLoggerCallback, EarlyStoppingCallback, ModelCheckpointCallback, TensorboardCallback, ReduceLROnPlateau

Default language for this instance: en


In [4]:
model = BertLMWrapper(from_fp='bert_vi_base.bin')
model.init_model()

dataset = ViTextDataset()
dataset.initialize(model, data_texts=list(data['comment']))

Featurizer previously fitted, continuing
Found 1321308 tokens
Top 5 words: <START>, <STOP>, <UNK>, <MASK>, ,


In [5]:
BATCH_SIZE = 16
n_epochs=10

learner = LanguageModelLearner(model,
    optimizer_fn=BertAdam,
    optimizer_kwargs={
        'lr': 3e-5,
        't_total': n_epochs * (len(dataset) // BATCH_SIZE),
        'warmup': 0.04
    })

Training in BERT mode


In [6]:
learner.fit(
    training_data=dataset,
    batch_size=BATCH_SIZE,
    epochs=n_epochs,
    callbacks=[
        PrintLoggerCallback(log_every=1, metrics=['loss']),
    ],
    gradient_accumulation_steps=10,
    clip_grad=1.0,
    fp16=True
)


Gradient accumulation is supported by this class
Number of tokens 30000
BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 512, padding_idx=0)
      (position_embeddings): Embedding(100, 512, padding_idx=0)
      (token_type_embeddings): Embedding(2, 512, padding_idx=0)
      (LayerNorm): FusedLayerNorm(torch.Size([512]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, 

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
3m 38s (- 3m 38s) (5 50%) - loss: 0.3225
4m 21s (- 2m 54s) (6 60%) - loss: 0.3165
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0
5m 3s (- 2m 10s) (7 70%) - loss: 0.3125
5m 46s (- 1m 26s) (8 80%) - loss: 0.3075
6m 30s (- 0m 43s) (9 90%) - loss: 0.3053
7m 13s (- 0m 0s) (10 100%) - loss: 0.3025


In [7]:
# model.save('bert_vi_sentiment_lm.bin')

In [8]:
from text_classification.with_pretrained.model import LMClassifierWrapper
from text_classification.with_pretrained.train import LMClassifierLearner

In [9]:
classifier = LMClassifierWrapper({
    'encoder': model
})
classifier_learner = LMClassifierLearner(
    classifier,
    optimizer_fn='adam'
)

In [10]:
classifier_learner.fit(
    training_data=(data['comment'], data['label']), 
    batch_size=BATCH_SIZE,
    epochs=50, 
    callbacks=[PrintLoggerCallback(log_every=5)]
)



4m 19s (- 38m 54s) (5 10%) - loss: 0.1668 - accuracy: 0.9558
8m 36s (- 34m 27s) (10 20%) - loss: 0.1133 - accuracy: 0.9782
12m 53s (- 30m 4s) (15 30%) - loss: 0.0951 - accuracy: 0.9835
17m 10s (- 25m 45s) (20 40%) - loss: 0.0842 - accuracy: 0.9873
21m 27s (- 21m 27s) (25 50%) - loss: 0.0798 - accuracy: 0.9889
25m 44s (- 17m 9s) (30 60%) - loss: 0.0819 - accuracy: 0.9874
30m 1s (- 12m 51s) (35 70%) - loss: 0.0760 - accuracy: 0.9897
34m 18s (- 8m 34s) (40 80%) - loss: 0.0746 - accuracy: 0.9902
38m 35s (- 4m 17s) (45 90%) - loss: 0.0725 - accuracy: 0.9909
42m 53s (- 0m 0s) (50 100%) - loss: 0.0671 - accuracy: 0.9925


In [11]:
test_data = pd.read_csv('data/vn_sentiment/test.csv')

# replace new lines with space
test_data['comment'] = test_data['comment'].str.replace('\n', ' ')

In [12]:
test_data.head()

Unnamed: 0,id,comment
0,test_000000,Chưa dùng thử nên chưa biết
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua n...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng
4,test_000004,Chuẩn hàng đóng gói đẹp


In [13]:
import torch
test_item = [test_data['comment'][5]]
torch.softmax(classifier(test_item, return_logits=True)[0], dim=1)

tensor([[0.9596, 0.0133, 0.0271]], device='cuda:0')

In [14]:
classifier.save('bert_vi_sentiment.bin')



In [15]:
raw_model = classifier.model

In [16]:
classifier.label_encoder.classes_.tolist()

1

In [17]:
classifier(test_item)

ValueError: y contains previously unseen labels: [2]

In [None]:
_, seq_tokens = raw_model.encoder(test_item)
sequence_output = raw_model.rnn(seq_tokens)[0]
print(sequence_output)

output, idxs = torch.max(sequence_output, 0)
print(idxs)
idxs = idxs.data.cpu().numpy()

sent = raw_model.encoder.featurizer.transform(test_item)
raw_sent = raw_model.encoder.featurizer.inverse_transform(sent)
print(raw_sent)

import matplotlib.pyplot as plt

argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]
# argmaxs[0] = 1e-8
print(argmaxs)
x = range(len(sent[0]))
y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
print(y)

plt.xticks(x, raw_sent[0], rotation=45)
plt.bar(x, y)
plt.ylabel('%')
plt.title('Visualisation of words importance')
plt.show()

In [None]:
loaded_model = LMClassifierWrapper(from_fp='bert_vi_sentiment.bin')
loaded_model.init_model(update_configs={'encoder': model})

print(loaded_model(test_item, return_logits=True)[0])