In [None]:
!pip install flair

In [None]:
!mkdir data

In [2]:
# List the working directory to check that train.txt, dev.txt and test.txt are present.
!ls

dev.txt  Flair_Training_Model.ipynb  test.txt  train.txt


In [3]:
# 1. Load the corpus
from flair.datasets import ColumnCorpus
from flair.data import Corpus

# Directory that holds the train, dev and test files
data_folder = '.'

# Column layout of the data files: column 0 is mapped to 'text', column 1 to 'ner'
columns = {0: 'text', 1: 'ner'}

# Read the three splits in column format from data_folder
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

2021-07-10 08:45:49,618 Reading data from .
2021-07-10 08:45:49,618 Train: train.txt
2021-07-10 08:45:49,619 Dev: dev.txt
2021-07-10 08:45:49,619 Test: test.txt


In [4]:
# Show the corpus summary (sentence counts of the train, dev and test splits).
print(corpus)


Corpus: 7480 train + 2825 dev + 4264 test sentences


In [5]:
# Print the first training sentence with its 'ner' tags shown inline after the tagged tokens.
print(corpus.train[0].to_tagged_string('ner'))

bà này khi trở về quá cảnh doha <B-LOCATION> ( qatar <B-LOCATION> ) , đáp xuống tân <B-LOCATION> sơn <I-LOCATION> nhất <I-LOCATION> sáng 2/3 cùng 75 hành khách , trong đó có 55 người nước ngoài .


In [6]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, BertEmbeddings, StackedEmbeddings, BytePairEmbeddings
from typing import List

In [7]:

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Print the dictionary's idx2item list to inspect which tags were collected.
print(tag_dictionary.idx2item)


[b'<unk>', b'O', b'B-LOCATION', b'I-LOCATION', b'<START>', b'<STOP>']


In [None]:
!wget http://download2266.mediafire.com/f8bslkwu4lzg/krx1rw1sx431rqh/model.zip

--2019-03-18 07:18:53--  http://download2266.mediafire.com/f8bslkwu4lzg/krx1rw1sx431rqh/model.zip
Resolving download2266.mediafire.com (download2266.mediafire.com)... 199.91.155.7
Connecting to download2266.mediafire.com (download2266.mediafire.com)|199.91.155.7|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163819626 (156M) [application/zip]
Saving to: ‘model.zip’


2019-03-18 07:19:13 (7.83 MB/s) - ‘model.zip’ saved [163819626/163819626]



In [None]:
!unzip model.zip

Archive:  model.zip
  inflating: model.txt               


In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz

--2019-03-18 07:19:56--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1235219084 (1.1G) [binary/octet-stream]
Saving to: ‘cc.vi.300.vec.gz’


2019-03-18 07:20:27 (37.9 MB/s) - ‘cc.vi.300.vec.gz’ saved [1235219084/1235219084]



In [None]:
# Decompress the archive to cc.vi.300.vec, the file loaded by the gensim cell.
!gunzip cc.vi.300.vec.gz

In [None]:
import gensim

# Load both vector files as text-format (binary=False) word2vec KeyedVectors:
# model.txt comes from model.zip, cc.vi.300.vec from the fastText download.
glove_vectors = gensim.models.KeyedVectors.load_word2vec_format('model.txt', binary=False)
fast_text_vectors = gensim.models.KeyedVectors.load_word2vec_format('cc.vi.300.vec', binary=False)

In [None]:
# Persist the loaded vectors with gensim's own save format.
# Paths are relative to the working directory so they match the later
# WordEmbeddings('glove_converted') / WordEmbeddings('fast_text_converted') cells,
# which load the files by relative name. On a default Colab runtime the working
# directory is /content, so the files land where the hard-coded paths used to put them.
glove_vectors.save("glove_converted")
fast_text_vectors.save("fast_text_converted")

In [None]:
# Load the 'bert-base-multilingual-cased' model as a flair embedding.
# NOTE(review): bert_embedding is not referenced by any other visible cell — confirm it is still needed.
bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

2019-06-01 14:31:05,851 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


100%|██████████| 995526/995526 [00:00<00:00, 10805196.91B/s]
100%|██████████| 662804195/662804195 [00:09<00:00, 68485635.93B/s]


In [None]:
# Byte-pair embeddings for Vietnamese ('vi'); used by the demo cell that embeds a sample sentence.
embedding_vi = BytePairEmbeddings('vi')

downloading https://nlp.h-its.org/bpemb/vi/vi.wiki.bpe.vs100000.model


100%|██████████| 1897265/1897265 [00:00<00:00, 2511577.16B/s]


downloading https://nlp.h-its.org/bpemb/vi/vi.wiki.bpe.vs100000.d50.w2v.bin.tar.gz


100%|██████████| 19293611/19293611 [00:01<00:00, 11153595.12B/s]
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Load the converted vectors stored as 'glove_converted' (relative to the working directory).
# NOTE(review): embedding_gl is not referenced by any other visible cell — confirm it is still needed.
embedding_gl = WordEmbeddings('glove_converted')

In [None]:
# Load the converted vectors stored as 'fast_text_converted' (relative to the working directory).
# NOTE(review): embedding_ft is not referenced by any other visible cell — confirm it is still needed.
embedding_ft = WordEmbeddings('fast_text_converted')

In [None]:
# A Sentence object holds a sentence that we may want to embed or tag
from flair.data import Sentence

# create a sentence
sentence = Sentence('can ho o ha noi')

# embed the words of the sentence with the Vietnamese byte-pair embeddings;
# the resulting vectors are read back from each token below
embedding_vi.embed(sentence)

# now check out the embedded tokens: print each token followed by its embedding tensor
for token in sentence:
    print(token)
    print(token.embedding)

Token: 1 can
tensor([ 0.5459,  0.1899,  0.6332, -0.2153,  0.2570, -0.2100, -0.6654, -0.2748,
         1.1381, -0.4627, -0.0598,  0.2289, -0.5887, -0.2988,  2.0940,  1.1097,
        -0.4223,  0.0666, -0.1137,  0.7464, -0.4807,  0.3184, -0.5454,  0.0989,
         0.1789, -0.5827,  0.5225,  0.4519,  0.8398,  0.6451, -0.1772, -0.0944,
        -0.5121,  0.2777,  0.4311, -0.4081,  0.1555, -0.1418,  0.1689, -0.6162,
        -0.2566,  1.1093,  1.0190,  0.2392, -0.4555,  0.8983,  0.0705,  0.3627,
         0.0489, -0.3109,  0.5459,  0.1899,  0.6332, -0.2153,  0.2570, -0.2100,
        -0.6654, -0.2748,  1.1381, -0.4627, -0.0598,  0.2289, -0.5887, -0.2988,
         2.0940,  1.1097, -0.4223,  0.0666, -0.1137,  0.7464, -0.4807,  0.3184,
        -0.5454,  0.0989,  0.1789, -0.5827,  0.5225,  0.4519,  0.8398,  0.6451,
        -0.1772, -0.0944, -0.5121,  0.2777,  0.4311, -0.4081,  0.1555, -0.1418,
         0.1689, -0.6162, -0.2566,  1.1093,  1.0190,  0.2392, -0.4555,  0.8983,
         0.0705,  0.3627,  

In [9]:
# 4. initialize embeddings
# The stack currently contains a single embedding; further embeddings can be added to the list.
embedding = StackedEmbeddings(
    [
        # Byte pair embeddings for Vietnamese ('vi')
        BytePairEmbeddings('vi'),
    ]
)

downloading https://nlp.h-its.org/bpemb/vi/vi.wiki.bpe.vs100000.d50.w2v.bin.tar.gz


100%|██████████| 19293611/19293611 [00:11<00:00, 1668418.51B/s]


In [11]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

# Tagger built on the stacked embeddings that predicts `tag_type` labels drawn from
# `tag_dictionary`. hidden_size=256 is the LSTM hidden size (LSTM(100, 256) in the
# recorded model summary); use_crf=True turns on the tagger's CRF option.
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embedding,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [12]:
# 6. initialize trainer
from flair.trainers import ModelTrainer
# NOTE(review): EvaluationMetric is not used by any visible cell — confirm this import is still needed.
from flair.training_utils import EvaluationMetric

# Bind the tagger to the corpus loaded earlier.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [14]:
# 7. start training
# 'models' is the first positional argument — presumably the output folder for the
# saved models and checkpoints (confirm against the flair ModelTrainer.train docs).
# learning_rate=0.1 is the starting rate; the recorded training log shows it
# being lowered later in the run (0.05, then 0.025).
trainer.train('models',
              learning_rate=0.1,
              mini_batch_size=64,
              max_epochs=50,
              checkpoint=True)

2021-07-10 08:50:21,355 ----------------------------------------------------------------------------------------------------
2021-07-10 08:50:21,356 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): BytePairEmbeddings(model=0-bpe-vi-100000-50)
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=6, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2021-07-10 08:50:21,356 ----------------------------------------------------------------------------------------------------
2021-07-10 08:50:21,357 Corpus: "Corpus: 7480 train + 2825 dev + 4264 test sentences"
2021-07-10 08:50:21,358 ----------------------------------------------------------------------------------------------------
2021-07-10 08:50:21,358 Parameters:
2021-07-10 08:

2021-07-10 08:53:32,904 epoch 5 - iter 44/117 - loss 3.17926017 - samples/sec: 161.66 - lr: 0.100000
2021-07-10 08:53:36,116 epoch 5 - iter 55/117 - loss 3.18867266 - samples/sec: 219.35 - lr: 0.100000
2021-07-10 08:53:39,003 epoch 5 - iter 66/117 - loss 3.13964719 - samples/sec: 244.06 - lr: 0.100000
2021-07-10 08:53:41,634 epoch 5 - iter 77/117 - loss 3.06705961 - samples/sec: 267.78 - lr: 0.100000
2021-07-10 08:53:44,943 epoch 5 - iter 88/117 - loss 3.05513040 - samples/sec: 212.83 - lr: 0.100000
2021-07-10 08:53:48,517 epoch 5 - iter 99/117 - loss 2.99399970 - samples/sec: 197.14 - lr: 0.100000
2021-07-10 08:53:51,211 epoch 5 - iter 110/117 - loss 2.93702219 - samples/sec: 261.48 - lr: 0.100000
2021-07-10 08:53:52,941 ----------------------------------------------------------------------------------------------------
2021-07-10 08:53:52,942 EPOCH 5 done: loss 2.9281 - lr 0.1000000
2021-07-10 08:53:59,508 DEV : loss 2.179246664047241 - score 0.7083
2021-07-10 08:53:59,681 BAD EPOCHS

2021-07-10 08:57:18,249 DEV : loss 1.6106880903244019 - score 0.7921
2021-07-10 08:57:18,419 BAD EPOCHS (no improvement): 0
saving best model
2021-07-10 08:57:19,192 ----------------------------------------------------------------------------------------------------
2021-07-10 08:57:22,130 epoch 11 - iter 11/117 - loss 1.69393355 - samples/sec: 239.76 - lr: 0.100000
2021-07-10 08:57:25,287 epoch 11 - iter 22/117 - loss 1.71510774 - samples/sec: 223.21 - lr: 0.100000
2021-07-10 08:57:28,203 epoch 11 - iter 33/117 - loss 1.86532351 - samples/sec: 241.66 - lr: 0.100000
2021-07-10 08:57:30,904 epoch 11 - iter 44/117 - loss 1.85993766 - samples/sec: 260.74 - lr: 0.100000
2021-07-10 08:57:33,844 epoch 11 - iter 55/117 - loss 1.86374970 - samples/sec: 239.66 - lr: 0.100000
2021-07-10 08:57:36,866 epoch 11 - iter 66/117 - loss 1.87792767 - samples/sec: 233.14 - lr: 0.100000
2021-07-10 08:57:39,794 epoch 11 - iter 77/117 - loss 1.86279738 - samples/sec: 240.60 - lr: 0.100000
2021-07-10 08:57:42

2021-07-10 09:00:55,726 epoch 16 - iter 77/117 - loss 1.68979705 - samples/sec: 245.86 - lr: 0.100000
2021-07-10 09:00:58,663 epoch 16 - iter 88/117 - loss 1.67109187 - samples/sec: 239.89 - lr: 0.100000
2021-07-10 09:01:01,607 epoch 16 - iter 99/117 - loss 1.64138184 - samples/sec: 239.37 - lr: 0.100000
2021-07-10 09:01:04,205 epoch 16 - iter 110/117 - loss 1.61781301 - samples/sec: 271.21 - lr: 0.100000
2021-07-10 09:01:06,011 ----------------------------------------------------------------------------------------------------
2021-07-10 09:01:06,013 EPOCH 16 done: loss 1.6075 - lr 0.1000000
2021-07-10 09:01:11,344 DEV : loss 1.3237756490707397 - score 0.837
2021-07-10 09:01:11,517 BAD EPOCHS (no improvement): 0
saving best model
2021-07-10 09:01:12,233 ----------------------------------------------------------------------------------------------------
2021-07-10 09:01:14,941 epoch 17 - iter 11/117 - loss 1.28397739 - samples/sec: 260.12 - lr: 0.100000
2021-07-10 09:01:18,012 epoch 17

2021-07-10 09:04:29,541 epoch 22 - iter 11/117 - loss 1.54885005 - samples/sec: 252.97 - lr: 0.100000
2021-07-10 09:04:33,007 epoch 22 - iter 22/117 - loss 1.55716373 - samples/sec: 203.24 - lr: 0.100000
2021-07-10 09:04:35,691 epoch 22 - iter 33/117 - loss 1.48398282 - samples/sec: 262.46 - lr: 0.100000
2021-07-10 09:04:39,791 epoch 22 - iter 44/117 - loss 1.57961200 - samples/sec: 171.83 - lr: 0.100000
2021-07-10 09:04:42,541 epoch 22 - iter 55/117 - loss 1.51766784 - samples/sec: 256.24 - lr: 0.100000
2021-07-10 09:04:45,798 epoch 22 - iter 66/117 - loss 1.48505803 - samples/sec: 216.28 - lr: 0.100000
2021-07-10 09:04:48,861 epoch 22 - iter 77/117 - loss 1.47765586 - samples/sec: 229.97 - lr: 0.100000
2021-07-10 09:04:51,600 epoch 22 - iter 88/117 - loss 1.43995136 - samples/sec: 257.23 - lr: 0.100000
2021-07-10 09:04:54,561 epoch 22 - iter 99/117 - loss 1.41785565 - samples/sec: 237.97 - lr: 0.100000
2021-07-10 09:04:57,487 epoch 22 - iter 110/117 - loss 1.41015554 - samples/sec: 2

2021-07-10 09:08:11,854 epoch 27 - iter 99/117 - loss 1.24026840 - samples/sec: 264.91 - lr: 0.100000
2021-07-10 09:08:14,551 epoch 27 - iter 110/117 - loss 1.24300004 - samples/sec: 261.25 - lr: 0.100000
2021-07-10 09:08:16,335 ----------------------------------------------------------------------------------------------------
2021-07-10 09:08:16,337 EPOCH 27 done: loss 1.2452 - lr 0.1000000
2021-07-10 09:08:21,925 DEV : loss 1.1508837938308716 - score 0.8619
2021-07-10 09:08:22,103 BAD EPOCHS (no improvement): 1
2021-07-10 09:08:22,462 ----------------------------------------------------------------------------------------------------
2021-07-10 09:08:25,222 epoch 28 - iter 11/117 - loss 1.25122237 - samples/sec: 255.21 - lr: 0.100000
2021-07-10 09:08:28,208 epoch 28 - iter 22/117 - loss 1.26933084 - samples/sec: 235.99 - lr: 0.100000
2021-07-10 09:08:31,140 epoch 28 - iter 33/117 - loss 1.27138173 - samples/sec: 240.25 - lr: 0.100000
2021-07-10 09:08:33,907 epoch 28 - iter 44/117 - 

2021-07-10 09:11:49,183 epoch 33 - iter 44/117 - loss 1.09816128 - samples/sec: 233.76 - lr: 0.100000
2021-07-10 09:11:51,898 epoch 33 - iter 55/117 - loss 1.10559108 - samples/sec: 259.56 - lr: 0.100000
2021-07-10 09:11:54,615 epoch 33 - iter 66/117 - loss 1.10521584 - samples/sec: 259.37 - lr: 0.100000
2021-07-10 09:11:57,579 epoch 33 - iter 77/117 - loss 1.12486590 - samples/sec: 237.71 - lr: 0.100000
2021-07-10 09:12:01,595 epoch 33 - iter 88/117 - loss 1.15169251 - samples/sec: 175.38 - lr: 0.100000
2021-07-10 09:12:04,501 epoch 33 - iter 99/117 - loss 1.15510513 - samples/sec: 242.47 - lr: 0.100000
2021-07-10 09:12:07,580 epoch 33 - iter 110/117 - loss 1.15487593 - samples/sec: 228.85 - lr: 0.100000
2021-07-10 09:12:09,458 ----------------------------------------------------------------------------------------------------
2021-07-10 09:12:09,460 EPOCH 33 done: loss 1.1490 - lr 0.1000000
2021-07-10 09:12:14,801 DEV : loss 1.0621129274368286 - score 0.8708
2021-07-10 09:12:14,970 B

2021-07-10 09:15:35,319 DEV : loss 1.0980778932571411 - score 0.8737
2021-07-10 09:15:35,494 BAD EPOCHS (no improvement): 3
2021-07-10 09:15:35,855 ----------------------------------------------------------------------------------------------------
2021-07-10 09:15:39,180 epoch 39 - iter 11/117 - loss 0.82156320 - samples/sec: 211.89 - lr: 0.100000
2021-07-10 09:15:43,228 epoch 39 - iter 22/117 - loss 0.96510713 - samples/sec: 173.99 - lr: 0.100000
2021-07-10 09:15:45,902 epoch 39 - iter 33/117 - loss 1.01865439 - samples/sec: 263.55 - lr: 0.100000
2021-07-10 09:15:48,478 epoch 39 - iter 44/117 - loss 1.00702580 - samples/sec: 273.60 - lr: 0.100000
2021-07-10 09:15:51,451 epoch 39 - iter 55/117 - loss 1.02741373 - samples/sec: 236.91 - lr: 0.100000
2021-07-10 09:15:54,228 epoch 39 - iter 66/117 - loss 1.04042400 - samples/sec: 253.76 - lr: 0.100000
2021-07-10 09:15:56,831 epoch 39 - iter 77/117 - loss 1.03019611 - samples/sec: 270.72 - lr: 0.100000
2021-07-10 09:15:59,525 epoch 39 - it

2021-07-10 09:19:08,245 epoch 44 - iter 77/117 - loss 0.88422313 - samples/sec: 250.24 - lr: 0.050000
2021-07-10 09:19:10,911 epoch 44 - iter 88/117 - loss 0.87628875 - samples/sec: 264.30 - lr: 0.050000
2021-07-10 09:19:13,811 epoch 44 - iter 99/117 - loss 0.88222864 - samples/sec: 242.93 - lr: 0.050000
2021-07-10 09:19:16,555 epoch 44 - iter 110/117 - loss 0.89558820 - samples/sec: 256.77 - lr: 0.050000
2021-07-10 09:19:18,139 ----------------------------------------------------------------------------------------------------
2021-07-10 09:19:18,140 EPOCH 44 done: loss 0.8895 - lr 0.0500000
2021-07-10 09:19:24,489 DEV : loss 0.9321224093437195 - score 0.883
Epoch    44: reducing learning rate of group 0 to 2.5000e-02.
2021-07-10 09:19:24,661 BAD EPOCHS (no improvement): 4
2021-07-10 09:19:25,033 ----------------------------------------------------------------------------------------------------
2021-07-10 09:19:28,205 epoch 45 - iter 11/117 - loss 0.76399803 - samples/sec: 222.10 - l

2021-07-10 09:22:42,651 epoch 50 - iter 11/117 - loss 0.96231608 - samples/sec: 249.68 - lr: 0.025000
2021-07-10 09:22:45,482 epoch 50 - iter 22/117 - loss 0.89811014 - samples/sec: 248.89 - lr: 0.025000
2021-07-10 09:22:48,740 epoch 50 - iter 33/117 - loss 0.85097809 - samples/sec: 216.29 - lr: 0.025000
2021-07-10 09:22:51,781 epoch 50 - iter 44/117 - loss 0.85407800 - samples/sec: 231.65 - lr: 0.025000
2021-07-10 09:22:55,986 epoch 50 - iter 55/117 - loss 0.89236710 - samples/sec: 167.53 - lr: 0.025000
2021-07-10 09:22:58,744 epoch 50 - iter 66/117 - loss 0.88866591 - samples/sec: 255.51 - lr: 0.025000
2021-07-10 09:23:01,778 epoch 50 - iter 77/117 - loss 0.87614965 - samples/sec: 232.21 - lr: 0.025000
2021-07-10 09:23:04,719 epoch 50 - iter 88/117 - loss 0.86035477 - samples/sec: 239.53 - lr: 0.025000
2021-07-10 09:23:07,666 epoch 50 - iter 99/117 - loss 0.86340737 - samples/sec: 239.09 - lr: 0.025000
2021-07-10 09:23:10,404 epoch 50 - iter 110/117 - loss 0.85874142 - samples/sec: 2

{'test_score': 0.8785168623066889,
 'dev_score_history': [0.3041825095057034,
  0.3635334088335221,
  0.6152200080246087,
  0.623059866962306,
  0.7083156254426971,
  0.7379407616361072,
  0.7588555858310626,
  0.7737365481542025,
  0.7524156104906513,
  0.7921212980019838,
  0.8072141002869245,
  0.8109660574412534,
  0.8054004933142932,
  0.7966027236784303,
  0.8277544382707683,
  0.837034046561701,
  0.8228730822873083,
  0.8472867188536553,
  0.8403409090909091,
  0.8135506223533939,
  0.8539236479321316,
  0.8545046827595304,
  0.8557496952458349,
  0.859275342651649,
  0.8621747831887925,
  0.8673251468232782,
  0.8618740964647129,
  0.8647786198643796,
  0.8668076109936576,
  0.8688891817103702,
  0.8726541554959786,
  0.8693554870695431,
  0.8708399366085579,
  0.8695192815636555,
  0.8769106999195494,
  0.8753215107621498,
  0.8725062755978333,
  0.873686974789916,
  0.876398578386205,
  0.883986710963455,
  0.8813738441215324,
  0.8832446808510639,
  0.8809272918861959,
  0.

In [None]:
!wget http://download1515.mediafire.com/k0bh2w3sdsig/59j8zw0v5opg1p0/final-model.pt

--2019-06-17 13:55:29--  http://download1515.mediafire.com/k0bh2w3sdsig/59j8zw0v5opg1p0/final-model.pt
Resolving download1515.mediafire.com (download1515.mediafire.com)... 205.196.123.203
Connecting to download1515.mediafire.com (download1515.mediafire.com)|205.196.123.203|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28592159 (27M) [application/octet-stream]
Saving to: ‘final-model.pt’


2019-06-17 13:55:32 (8.78 MB/s) - ‘final-model.pt’ saved [28592159/28592159]



In [None]:
from flair.data import Sentence

# load the trained tagger from final-model.pt in the working directory
# (SequenceTagger is imported in the tagger-initialisation cell)
model = SequenceTagger.load('final-model.pt')

# create example sentence
sentence = Sentence('nhà ơ cầu giấy')

# predict tags; the predictions are stored on `sentence` itself
model.predict(sentence)

# export the sentence together with its 'ner' annotations as a dict
result=sentence.to_dict(tag_type='ner')

# alternative view of the same predictions as an inline-tagged string:
#result=sentence.to_tagged_string()

print(result)

2019-06-17 13:56:34,746 loading file final-model.pt
{'text': 'nhà ơ cầu giấy', 'labels': [], 'entities': [{'text': 'cầu giấy', 'start_pos': 6, 'end_pos': 14, 'type': 'LOC', 'confidence': 0.7378469407558441}]}


In [None]:
from pprint import pprint
# Print every entity span found on `sentence` for the 'ner' tag type.
for entity in sentence.get_spans('ner'):
  # Debug helper kept for reference: dumps the attributes of the first token's 'ner' tag.
  #pprint(vars(entity.tokens[0].tags['ner']))
  print(entity)

LOC-span [5,6,7]: "xã Mỹ Quý"
LOC-span [8,9,10]: "huyện Tháp Mười"
LOC-span [11,12]: "Đồng Tháp"


In [None]:
# Tag every line of test_kewords.txt with `model` and append the result to
# test_flair_byte_eb.txt as one "token<TAB>tag" row per token, followed by a
# blank line after each sentence.
#
# Both files are opened with `with` so their handles are closed; the original
# reopened the output file for every input line and never closed any handle.
# UTF-8 is set explicitly because the text is Vietnamese.
with open('test_kewords.txt', 'r', encoding='utf-8') as keywords_file:
  f = keywords_file.read()

# Append mode is kept on purpose: rows are added to any existing output file.
with open('test_flair_byte_eb.txt', 'a', encoding='utf-8') as fw:
  for line in f.split('\n'):
    # Skip blank lines (e.g. the empty string left by a trailing newline);
    # they used to be written out as a bogus "\tO" record.
    if not line.strip():
      continue

    sentence = Sentence(line)
    model.predict(sentence)
    result = sentence.to_dict(tag_type='ner')

    # Start with every space-separated token labelled 'O' (outside any entity).
    txt = result['text'].strip().split(' ')
    words = [x + '\tO' for x in txt]

    # Overwrite the rows of tokens that belong to a predicted entity span.
    # tk.idx is 1-based, hence the -1.
    # NOTE(review): assumes flair's tokenisation matches the split(' ') above — confirm.
    for entity in sentence.get_spans('ner'):
      for tk in entity.tokens:
        # Use the public `value` property instead of the private `_value` attribute.
        words[tk.idx - 1] = tk.text + '\t' + tk.tags['ner'].value

    for w in words:
      fw.write(w)
      fw.write("\n")
    fw.write("\n")

In [None]:
# Line-level agreement between the gold labels (test_labeled.txt) and the flair
# predictions (test_flair_byte_eb.txt): the share of prediction lines that are
# identical to the gold line at the same position.
# Files are read inside `with` blocks so the handles are closed, and as UTF-8
# because the text is Vietnamese.
with open('test_labeled.txt', 'r', encoding='utf-8') as gold_file:
  f1 = gold_file.read()
with open('test_flair_byte_eb.txt', 'r', encoding='utf-8') as pred_file:
  f3 = pred_file.read()

l1 = f1.split('\n')
l3 = f3.split('\n')

# i3 counts matching lines, k3 the remaining ones. A prediction line with no gold
# counterpart (prediction file longer than gold file) counts as a mismatch
# instead of raising IndexError.
# NOTE(review): blank separator lines are compared too, so they count as matches.
i3 = sum(1 for j, l3_line in enumerate(l3) if j < len(l1) and l3_line == l1[j])
k3 = len(l3) - i3

# str.split always returns at least one element, so the denominator is never zero.
t3 = i3 / (k3 + i3)

print(t3)

In [1]:
!pip install bpemb

Collecting bpemb
  Downloading https://files.pythonhosted.org/packages/f2/6f/9191b85109772636a8f8accb122900c34db26c091d2793218aa94954524c/bpemb-0.3.3-py3-none-any.whl
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 9.1MB/s 
Installing collected packages: sentencepiece, bpemb
Successfully installed bpemb-0.3.3 sentencepiece-0.1.96


In [6]:
from bpemb import BPEmb
# Load the Vietnamese ("vi") BPEmb model with 100-dimensional vectors.
bpemb_vi = BPEmb(lang="vi", dim=100)

In [12]:
# Segment a sample sentence into BPE subwords; the deliberately misspelt
# "muốzzzn" shows how a word is broken into smaller pieces.
text = "tôi muốzzzn gặp bác sĩ nam"
bpemb_vi.encode(text)

['▁tôi', '▁mu', 'ố', 'zz', 'z', 'n', '▁gặp', '▁bác', '▁sĩ', '▁nam']

In [14]:
# Embed the sample text and inspect the result's shape: one row per subword, 100 columns (dim=100).
bpemb_vi.embed(text).shape

(10, 100)