In [4]:
import os
import warnings
from typing import List

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import (
    BertEmbeddings,
    CharacterEmbeddings,
    FlairEmbeddings,
    OpenAIGPT2Embeddings,
    StackedEmbeddings,
    TokenEmbeddings,
    WordEmbeddings,
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


In [5]:
# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: 'text', 1: 'ner'}

# Folder in which the train, test and dev files reside.
# Set the NER_DATA_DIR environment variable to point the notebook at another
# location; when it is unset, the directory of the original run is used.
data_folder = os.environ.get('NER_DATA_DIR', '/home/raghavan/projects/transformers')

# 1. init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')

2019-11-25 20:26:26,955 Reading data from /home/raghavan/projects/transformers
2019-11-25 20:26:26,957 Train: /home/raghavan/projects/transformers/train.txt
2019-11-25 20:26:26,957 Dev: /home/raghavan/projects/transformers/dev.txt
2019-11-25 20:26:26,958 Test: /home/raghavan/projects/transformers/test.txt


In [6]:
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# Sanity check before the expensive steps below (model downloads, training).
# An empty label in the dictionary presumably means that some rows of the data
# files are missing their tag column -- verify against the files. Surface it
# here instead of silently training with an extra blank class.
if b'' in tag_dictionary.idx2item:
    warnings.warn(
        "The tag dictionary contains an empty label (b''); "
        "check train/dev/test files for rows without a '%s' tag." % tag_type
    )

# 4. initialize the embeddings that are stacked into one token representation
embedding_types: List[TokenEmbeddings] = [

    # classic word embeddings
    WordEmbeddings('glove'),

    # character-level embeddings (remove this line to disable them)
    CharacterEmbeddings(),

    # contextual string embeddings, forward and backward (remove to disable)
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),

    # transformer-based embeddings; their pretrained weights are downloaded
    # on first use
    BertEmbeddings('bert-large-uncased'),
    OpenAIGPT2Embeddings()
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# NOTE(review): the saved output of this cell ends with
# "IndexError: index 0 is out of bounds for dimension 0 with size 0".
# The traceback is not part of the saved output, so the cause is unconfirmed;
# possibly a token for which one of the embeddings above yields no pieces.
# TODO: confirm against the data files before re-running the full training.
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=6,
              max_epochs=150)

Corpus: 11911 train + 2883 dev + 3051 test sentences
[b'<unk>', b'O', b'I', b'', b'<START>', b'<STOP>']


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 1042301/1042301 [00:01<00:00, 716780.71B/s]
100%|██████████| 456318/456318 [00:00<00:00, 472216.61B/s]
100%|██████████| 293/293 [00:00<00:00, 207519.60B/s]
100%|██████████| 1520013706/1520013706 [02:20<00:00, 10848677.19B/s]


2019-11-25 20:29:18,209 ----------------------------------------------------------------------------------------------------
2019-11-25 20:29:18,214 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_3): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_4): BertEmbeddings(
      (model): 

IndexError: index 0 is out of bounds for dimension 0 with size 0