# IIC-3670 NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- flair 0.12
- allennlp==0.9.0


!pip3 install flair

In [1]:
from flair.models import SequenceTagger
from flair.data import Sentence

ner_tagger = SequenceTagger.load('ner')
pos_tagger = SequenceTagger.load('pos')

2023-05-11 09:46:09,529 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
2023-05-11 09:46:14,060 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [2]:
sentence = Sentence('George Washington was born in Washington')

ner_tagger.predict(sentence)

print(sentence.to_tagged_string())

Sentence[6]: "George Washington was born in Washington" → ["George Washington"/PER, "Washington"/LOC]


In [3]:
for entity in sentence.get_spans('ner'):
  print(entity.text , entity.tag)

George Washington PER
Washington LOC


In [4]:
pos_tagger.predict(sentence)

for token in sentence:
   print(token.text, token.tag)

George NNP
Washington NNP
was VBD
born VBN
in IN
Washington NNP


In [5]:
from flair.embeddings import FlairEmbeddings

flair_embedding_forward = FlairEmbeddings('spanish-forward')

sentence = Sentence('Whitney tiene una historia llena de drama')

flair_embedding_forward.embed(sentence)

for token in sentence:
    print(token)
    print(token.embedding)

Token[0]: "Whitney"
tensor([-8.6821e-02,  9.3834e-04,  9.3298e-02,  ..., -1.4167e-05,
         8.4766e-03,  6.6558e-05])
Token[1]: "tiene"
tensor([ 1.7966e-02, -4.1171e-04,  1.8595e-03,  ..., -2.3416e-05,
         2.2486e-04,  3.8149e-04])
Token[2]: "una"
tensor([-2.9416e-02,  2.5448e-04,  2.6700e-02,  ..., -6.4758e-06,
         9.1020e-04,  6.0708e-05])
Token[3]: "historia"
tensor([ 3.4301e-02,  4.8836e-03,  4.0833e-02,  ..., -1.0331e-05,
         1.0874e-02, -4.4025e-03])
Token[4]: "llena"
tensor([-6.9203e-02, -2.8580e-03,  9.7360e-03,  ..., -2.5984e-05,
         3.8512e-03, -2.7470e-03])
Token[5]: "de"
tensor([ 3.7302e-02,  4.7254e-04, -3.0727e-03,  ..., -3.0066e-05,
         3.3274e-03, -7.5158e-04])
Token[6]: "drama"
tensor([-1.2064e-01,  2.6104e-04,  3.7579e-02,  ..., -2.7386e-05,
        -9.2416e-03, -5.6781e-03])


Ver más embeddings en: https://flairnlp.github.io/docs/tutorial-embeddings/flair-embeddings

In [6]:
sentence = Sentence('La historia sin fin es una pelicula de los 80s')

flair_embedding_forward.embed(sentence)

for token in sentence:
    print(token)
    print(token.embedding)

Token[0]: "La"
tensor([-1.6193e-02,  1.4907e-03,  1.2598e-02,  ..., -2.3243e-06,
         5.4600e-03,  1.7847e-04])
Token[1]: "historia"
tensor([ 1.5450e-01,  2.0388e-03,  3.9008e-02,  ..., -1.1207e-05,
         2.8937e-02, -1.5640e-03])
Token[2]: "sin"
tensor([-6.0255e-02,  1.3912e-02,  1.4367e-01,  ..., -2.9156e-05,
         7.4421e-04,  1.0845e-02])
Token[3]: "fin"
tensor([ 2.5528e-02,  9.2127e-03,  3.5452e-02,  ..., -6.7441e-05,
         7.9362e-02,  6.7573e-04])
Token[4]: "es"
tensor([-8.8894e-03,  2.8105e-03,  4.4454e-04,  ..., -8.8751e-06,
         6.1890e-04,  1.3850e-04])
Token[5]: "una"
tensor([-1.9456e-02,  1.2340e-04,  3.7264e-03,  ..., -8.1354e-06,
         3.1620e-03, -6.0251e-04])
Token[6]: "pelicula"
tensor([-1.0766e-01, -5.2601e-04,  4.8633e-02,  ..., -7.0584e-06,
         2.1830e-02, -1.1417e-03])
Token[7]: "de"
tensor([ 8.0026e-02, -7.6142e-04,  1.2470e-02,  ..., -6.7769e-06,
         4.7873e-02, -1.6199e-03])
Token[8]: "los"
tensor([-8.8328e-03,  1.5163e-03,  2.0621

Training

In [14]:
import flair.datasets

corpus = flair.datasets.NER_ENGLISH_SEC_FILLINGS()
print(corpus)

2023-04-12 14:28:37,727 Reading data from C:\Users\marce\.flair\datasets\ner_english_sec_fillings
2023-04-12 14:28:37,728 Train: C:\Users\marce\.flair\datasets\ner_english_sec_fillings\FIN5.txt
2023-04-12 14:28:37,729 Dev: None
2023-04-12 14:28:37,729 Test: C:\Users\marce\.flair\datasets\ner_english_sec_fillings\FIN3.txt
Corpus: 1051 train + 117 dev + 305 test sentences


Ver mas corpus en: https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_CORPUS_PREPARED.md

In [15]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

label_type = 'ner'

label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False)
print(label_dict)

embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

2023-04-12 14:28:42,974 Computing label dictionary. Progress:


1051it [00:00, 130020.45it/s]

2023-04-12 14:28:42,985 Dictionary created for label 'ner' with 4 values: PER (seen 691 times), ORG (seen 219 times), LOC (seen 158 times), MISC (seen 7 times)
Dictionary with 4 tags: PER, ORG, LOC, MISC





2023-04-12 14:28:44,123 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to C:\Users\marce\AppData\Local\Temp\tmpc1tg25dy


100%|███████████████████████████████████████████████████████████████████████████| 153M/153M [01:13<00:00, 2.19MB/s]

2023-04-12 14:29:57,854 copying C:\Users\marce\AppData\Local\Temp\tmpc1tg25dy to cache at C:\Users\marce\.flair\embeddings\glove.gensim.vectors.npy





2023-04-12 14:29:57,930 removing temp file C:\Users\marce\AppData\Local\Temp\tmpc1tg25dy
2023-04-12 14:29:58,592 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to C:\Users\marce\AppData\Local\Temp\tmpl9o4iykk


100%|█████████████████████████████████████████████████████████████████████████| 20.5M/20.5M [00:16<00:00, 1.34MB/s]

2023-04-12 14:30:15,356 copying C:\Users\marce\AppData\Local\Temp\tmpl9o4iykk to cache at C:\Users\marce\.flair\embeddings\glove.gensim
2023-04-12 14:30:15,359 removing temp file C:\Users\marce\AppData\Local\Temp\tmpl9o4iykk





2023-04-12 14:30:17,971 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to C:\Users\marce\AppData\Local\Temp\tmplnxin_lk


100%|█████████████████████████████████████████████████████████████████████████| 69.7M/69.7M [00:42<00:00, 1.74MB/s]

2023-04-12 14:31:00,643 copying C:\Users\marce\AppData\Local\Temp\tmplnxin_lk to cache at C:\Users\marce\.flair\embeddings\news-backward-0.4.1.pt





2023-04-12 14:31:00,660 removing temp file C:\Users\marce\AppData\Local\Temp\tmplnxin_lk


In [16]:
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type
                        )

2023-04-12 14:31:00,806 SequenceTagger predicts: Dictionary with 17 tags: O, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG, S-LOC, B-LOC, E-LOC, I-LOC, S-MISC, B-MISC, E-MISC, I-MISC


Ver documentación en: https://github.com/flairNLP/flair/blob/master/flair/models/sequence_tagger_model.py

In [17]:
trainer = ModelTrainer(tagger, corpus)

In [18]:
trainer.fine_tune('resources/taggers/sota-ner-flair',
                  learning_rate=0.1,
                  mini_batch_size=4,
                  mini_batch_chunk_size=1,
                  max_epochs=10,
                  )

2023-04-12 14:31:26,676 ----------------------------------------------------------------------------------------------------
2023-04-12 14:31:26,677 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=19, bias=True)
  (loss_f

100%|██████████████████████████████████████████████████████████████████████████████| 30/30 [00:46<00:00,  1.55s/it]

2023-04-12 14:42:27,281 Evaluating as a multi-label problem: False
2023-04-12 14:42:27,287 DEV : loss 1.5489709377288818 - f1-score (micro avg)  0.098
2023-04-12 14:42:27,287 ----------------------------------------------------------------------------------------------------





2023-04-12 14:43:18,945 ----------------------------------------------------------------------------------------------------
2023-04-12 14:43:18,945 Exiting from training early.
2023-04-12 14:43:18,945 Saving model ...
2023-04-12 14:43:19,730 Done.
2023-04-12 14:43:19,746 ----------------------------------------------------------------------------------------------------
2023-04-12 14:43:19,747 Testing using last state of model ...


100%|██████████████████████████████████████████████████████████████████████████████| 77/77 [02:14<00:00,  1.75s/it]

2023-04-12 14:45:34,450 Evaluating as a multi-label problem: False
2023-04-12 14:45:34,457 0.6782	0.6164	0.6458	0.4769
2023-04-12 14:45:34,458 
Results:
- F-score (micro) 0.6458
- F-score (macro) 0.1941
- Accuracy 0.4769

By class:
              precision    recall  f1-score   support

         PER     0.6782    0.9074    0.7762       216
         ORG     0.0000    0.0000    0.0000        56
         LOC     0.0000    0.0000    0.0000        39
        MISC     0.0000    0.0000    0.0000         7

   micro avg     0.6782    0.6164    0.6458       318
   macro avg     0.1696    0.2269    0.1941       318
weighted avg     0.4607    0.6164    0.5273       318

2023-04-12 14:45:34,458 ----------------------------------------------------------------------------------------------------





{'test_score': 0.645799011532125,
 'dev_score_history': [0.09795191451469279],
 'train_loss_history': [0.5053509612491174],
 'dev_loss_history': [1.5489709377288818]}