In [1]:
from os import getcwd, path, environ
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

BASE_PATH = getcwd()
sys.path.append(BASE_PATH)

environ['NUM_WORKERS'] = '0'
# environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
data = pd.read_csv('data/vn_sentiment/train.csv')

# replace new lines with space
data['comment'] = data['comment'].str.replace('\n', ' ')

data.head()

Unnamed: 0,id,comment,label
0,train_000000,Dung dc sp tot cam on shop Đóng gói sản phẩm ...,0
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...,0
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...,0
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,1
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,1


In [3]:
data.loc[data['label'] == 0, 'label'] = 'positive'
data.loc[data['label'] == 1, 'label'] = 'negative'
data.head()

Unnamed: 0,id,comment,label
0,train_000000,Dung dc sp tot cam on shop Đóng gói sản phẩm ...,positive
1,train_000001,Chất lượng sản phẩm tuyệt vời . Son mịn nhưng...,positive
2,train_000002,Chất lượng sản phẩm tuyệt vời nhưng k có hộp ...,positive
3,train_000003,:(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọ...,negative
4,train_000004,Lần trước mình mua áo gió màu hồng rất ok mà đ...,negative


In [4]:
from sent_to_vec.masked_lm.bert_model import BertLMWrapper
from sent_to_vec.masked_lm.train import LanguageModelLearner
from sent_to_vec.masked_lm.vi_data import ViTextDataset

from common.modules import BertAdam
from common.callbacks import PrintLoggerCallback, EarlyStoppingCallback, ModelCheckpointCallback, TensorboardCallback, ReduceLROnPlateau

Default language for this instance: en


In [5]:
model = BertLMWrapper(from_fp='bert_vi_base.bin')
model.init_model()

dataset = ViTextDataset()
dataset.initialize(model, data_texts=list(data['comment']))

Featurizer previously fitted, continuing
Found 1321308 tokens
Top 5 words: <START>, <STOP>, <UNK>, <MASK>, ,


In [6]:
BATCH_SIZE = 16
n_epochs=4

learner = LanguageModelLearner(model,
    optimizer_fn=BertAdam,
    optimizer_kwargs={
        'lr': 2e-5,
        't_total': n_epochs * (len(dataset) // BATCH_SIZE),
        'warmup': 0.04
    })

Training in BERT mode


In [7]:
import torch
learner.fit(
    training_data=dataset,
    batch_size=BATCH_SIZE,
    epochs=n_epochs,
    callbacks=[
        PrintLoggerCallback(log_every=1, metrics=['loss']),
    ],
    gradient_accumulation_steps=10,
    clip_grad=1.0,
)


Gradient accumulation is supported by this class
Number of tokens 30000
BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 512, padding_idx=0)
      (position_embeddings): Embedding(100, 512, padding_idx=0)
      (token_type_embeddings): Embedding(2, 512, padding_idx=0)
      (LayerNorm): FusedLayerNorm(torch.Size([512]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, 

In [8]:
# model.save('bert_vi_sentiment_lm.bin')

In [9]:
from text_classification.with_pretrained.model import LMClassifierWrapper
from text_classification.with_pretrained.train import LMClassifierLearner

In [10]:
n_epochs=10
classifier = LMClassifierWrapper(encoder=model)
classifier_learner = LMClassifierLearner(
    classifier,
    optimizer_fn=BertAdam,
    optimizer_kwargs={
        'lr': 2e-5,
        't_total': n_epochs * (len(dataset) // BATCH_SIZE),
        'warmup': 0.04
    }
)

In [11]:
# classifier.model.encoder

In [12]:
classifier_learner.fit(
    training_data=(data['comment'], data['label']), 
    batch_size=BATCH_SIZE,
    epochs=n_epochs, 
    callbacks=[PrintLoggerCallback(log_every=1)]
)

{'num_words': 30000, 'hidden_size': 512, 'num_hidden_layers': 6, 'num_attention_heads': 8, 'intermediate_size': 1140, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'attention_probs_dropout_prob': 0.1, 'max_position_embeddings': 100, 'featurizer_seq_len': 100, 'type_vocab_size': 2, 'initializer_range': 0.02, 'use_adasoft': True, 'append_sos_eos': True, 'featurizer_reserved_tokens': ['<START>', '<STOP>', '<UNK>', '<MASK>'], 'tokenize_fn': <function word_tokenize at 0x7f55f8dc8ae8>, 'input_shape': (None,), 'adasoft_cutoffs': [2000, 4000, 10000]}




0m 47s (- 7m 9s) (1 10%) - loss: 0.3850 - accuracy: 0.8287
1m 35s (- 6m 21s) (2 20%) - loss: 0.2531 - accuracy: 0.8942
2m 22s (- 5m 33s) (3 30%) - loss: 0.2094 - accuracy: 0.9165
3m 10s (- 4m 46s) (4 40%) - loss: 0.1642 - accuracy: 0.9377
3m 58s (- 3m 58s) (5 50%) - loss: 0.1279 - accuracy: 0.9534
4m 45s (- 3m 10s) (6 60%) - loss: 0.1025 - accuracy: 0.9635
5m 33s (- 2m 22s) (7 70%) - loss: 0.0781 - accuracy: 0.9716
6m 20s (- 1m 35s) (8 80%) - loss: 0.0639 - accuracy: 0.9771
7m 8s (- 0m 47s) (9 90%) - loss: 0.0553 - accuracy: 0.9801
7m 55s (- 0m 0s) (10 100%) - loss: 0.0456 - accuracy: 0.9829


In [13]:
test_data = pd.read_csv('data/vn_sentiment/test.csv')

# replace new lines with space
test_data['comment'] = test_data['comment'].str.replace('\n', ' ')

In [14]:
test_data.head()

Unnamed: 0,id,comment
0,test_000000,Chưa dùng thử nên chưa biết
1,test_000001,Không đáng tiềnVì ngay đợt sale nên mới mua n...
2,test_000002,Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắc...
3,test_000003,Vải đẹp.phom oki luôn.quá ưng
4,test_000004,Chuẩn hàng đóng gói đẹp


In [15]:
import torch
test_item = [test_data['comment'][5]]
classifier(test_item, return_logits=True)[0]

tensor([[5.2737e-04, 9.9946e-01, 9.8135e-06]], device='cuda:0')

In [16]:
classifier.save('bert_vi_sentiment.bin')

In [17]:
raw_model = classifier.model

In [18]:
classifier.label_encoder.classes_.tolist()

['negative', 'positive']

In [19]:
classifier(test_item)

[[{'intent': '<unknown>', 'confidence': 0.9994627833366394},
  {'intent': '<unknown>', 'confidence': 0.0005273708957247436}]]

In [20]:
# from common.torch_utils import to_gpu

# _, seq_tokens = raw_model.encoder(
#     to_gpu(classifier.featurizer.transform(test_item))
# )
# sequence_output = raw_model.rnn(seq_tokens)[0]
# print(sequence_output)

# output, idxs = torch.max(sequence_output, 0)
# print(idxs)
# idxs = idxs.data.cpu().numpy()

# sent = classifier.featurizer.transform(test_item)
# raw_sent = classifier.featurizer.inverse_transform(sent)
# print(raw_sent)

# import matplotlib.pyplot as plt

# argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]
# # argmaxs[0] = 1e-8
# print(argmaxs)
# x = range(len(sent[0]))
# y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
# print(y)

# plt.xticks(x, raw_sent[0], rotation=45)
# plt.bar(x, y)
# plt.ylabel('%')
# plt.title('Visualisation of words importance')
# plt.show()

In [21]:
loaded_model = LMClassifierWrapper(from_fp='bert_vi_sentiment.bin')
loaded_model.init_model()

print(loaded_model(test_item))

{'num_words': 30000, 'hidden_size': 512, 'num_hidden_layers': 6, 'num_attention_heads': 8, 'intermediate_size': 1140, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'attention_probs_dropout_prob': 0.1, 'max_position_embeddings': 100, 'featurizer_seq_len': 100, 'type_vocab_size': 2, 'initializer_range': 0.02, 'use_adasoft': True, 'append_sos_eos': True, 'featurizer_reserved_tokens': ['<START>', '<STOP>', '<UNK>', '<MASK>'], 'tokenize_fn': <function word_tokenize at 0x7f55f8dc8ae8>, 'input_shape': (None,), 'adasoft_cutoffs': [2000, 4000, 10000]}
[[{'intent': '<unknown>', 'confidence': 0.9994627833366394}, {'intent': '<unknown>', 'confidence': 0.0005273708957247436}]]


In [22]:
classifier(test_item, return_logits=True)[0]

tensor([[5.2737e-04, 9.9946e-01, 9.8135e-06]], device='cuda:0')

In [23]:
print(loaded_model.__getstate__()['state_dict'])

OrderedDict([('encoder.bert.embeddings.word_embeddings.weight', tensor([[ 0.0305, -0.0348,  0.0343,  ..., -0.0633, -0.1019,  0.0352],
        [ 0.0081,  0.0379,  0.0248,  ...,  0.0319, -0.0708,  0.0006],
        [-0.0293, -0.0125, -0.0445,  ...,  0.0325,  0.0157, -0.0374],
        ...,
        [ 0.0011,  0.0007, -0.0067,  ...,  0.0002,  0.0036,  0.0053],
        [ 0.0163, -0.0066, -0.0042,  ..., -0.0095,  0.0041, -0.0151],
        [-0.0271, -0.0092, -0.0186,  ..., -0.0090, -0.0034,  0.0319]])), ('encoder.bert.embeddings.position_embeddings.weight', tensor([[-8.1516e-03,  5.7148e-03,  2.0596e-02,  ...,  3.5434e-03,
          1.6124e-02,  4.3819e-03],
        [ 3.4599e-02,  3.0122e-05,  2.5056e-02,  ..., -1.9088e-02,
          1.3316e-02, -9.9957e-03],
        [-1.7099e-02, -3.4879e-02,  2.3884e-02,  ...,  1.9209e-02,
          1.1689e-02, -1.1790e-02],
        ...,
        [-2.3061e-03,  1.9220e-02, -2.7194e-03,  ...,  2.5389e-02,
          1.8511e-02, -5.2746e-03],
        [-2.3428e-02

In [24]:
print(classifier.__getstate__()['state_dict'])

OrderedDict([('encoder.bert.embeddings.word_embeddings.weight', tensor([[ 0.0305, -0.0348,  0.0343,  ..., -0.0633, -0.1019,  0.0352],
        [ 0.0081,  0.0379,  0.0248,  ...,  0.0319, -0.0708,  0.0006],
        [-0.0293, -0.0125, -0.0445,  ...,  0.0325,  0.0157, -0.0374],
        ...,
        [ 0.0011,  0.0007, -0.0067,  ...,  0.0002,  0.0036,  0.0053],
        [ 0.0163, -0.0066, -0.0042,  ..., -0.0095,  0.0041, -0.0151],
        [-0.0271, -0.0092, -0.0186,  ..., -0.0090, -0.0034,  0.0319]])), ('encoder.bert.embeddings.position_embeddings.weight', tensor([[-8.1516e-03,  5.7148e-03,  2.0596e-02,  ...,  3.5434e-03,
          1.6124e-02,  4.3819e-03],
        [ 3.4599e-02,  3.0122e-05,  2.5056e-02,  ..., -1.9088e-02,
          1.3316e-02, -9.9957e-03],
        [-1.7099e-02, -3.4879e-02,  2.3884e-02,  ...,  1.9209e-02,
          1.1689e-02, -1.1790e-02],
        ...,
        [-2.3061e-03,  1.9220e-02, -2.7194e-03,  ...,  2.5389e-02,
          1.8511e-02, -5.2746e-03],
        [-2.3428e-02

In [25]:
def compare_models(model_1, model_2):
    models_differ = 0
    for key_item_1, key_item_2 in zip(model_1.state_dict().items(), model_2.state_dict().items()):
        if torch.equal(key_item_1[1], key_item_2[1]):
            pass
        else:
            models_differ += 1
            if (key_item_1[0] == key_item_2[0]):
                print('Mismtach found at', key_item_1[0])
            else:
                raise Exception
    if models_differ == 0:
        print('Models match perfectly! :)')


In [26]:
compare_models(loaded_model.model, classifier.model)

Models match perfectly! :)


In [27]:
loaded_model.featurizer.transform(test_item)

tensor([[   1,    3, 2904,  347,  618,  423, 1178,   14, 1792, 2144,    3,  745,
          325,  423,  792,  908,   58,    3,    2]])

In [28]:
X_train = classifier.featurizer.transform(test_item).cuda()

In [29]:
raw_loaded_model = loaded_model.model
raw_classifier = classifier.model

In [30]:
raw_loaded_model.eval()
raw_classifier.eval()
raw_loaded_model.encoder = raw_classifier.encoder
raw_loaded_model(X_train)[0] - raw_classifier(X_train)[0]

tensor([[0., 0., 0.]], device='cuda:0', grad_fn=<SubBackward0>)

In [31]:
from common.torch_utils import cauchy
raw_loaded_model.eval()
raw_classifier.eval()

encoded = raw_loaded_model.encoder(X_train)[1]
rnn = raw_loaded_model.rnn(encoded)[0]
max_pool = torch.max(rnn, 1)[0]
pooled = max_pool
prev_pooled = max_pool
# print(prev_pooled - pooled)
# pooled = raw_loaded_model.pooler(max_pool)
for ix, layer in enumerate(raw_loaded_model.pooler):
    pooled = layer(pooled)
    prev_pooled = raw_classifier.pooler[ix](prev_pooled)
#     print(ix)
#     print(prev_pooled - pooled)
#     print(layer.weight - raw_classifier.pooler[ix].weight)
#     print(layer.bias - raw_classifier.pooler[ix].bias)
#     print(ix)
#     print(len(raw_loaded_model.pooler))
#     print(pooled - prev_pooled)
    if ix < len(raw_loaded_model.pooler) - 1:
        pooled = torch.nn.functional.relu(pooled)
        prev_pooled = torch.nn.functional.relu(prev_pooled)
#         print(prev_pooled - pooled)
#         print(pooled)
    else:
        pooled = cauchy(pooled)
        prev_pooled = cauchy(prev_pooled)
#         print(prev_pooled - pooled)
#         print(pooled)

# print(pooled - prev_pooled)
print(raw_loaded_model.classifier.bias)
print(raw_classifier.classifier.bias)
with torch.no_grad():
    print(raw_loaded_model.classifier.weight - raw_classifier.classifier.weight)
    print(raw_loaded_model.classifier(prev_pooled))
    print(raw_classifier.classifier(prev_pooled))

AttributeError: 'LMClassifier' object has no attribute 'rnn'

In [None]:
print(max_pool)

In [None]:
print(loaded_model.model.encoder == raw_loaded_model.encoder)
print(loaded_model(test_item))