In [1]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Map column indices of the data files to the annotation layer each one holds:
# column 0 -> token text, column 2 -> 'pos', column 3 -> 'ner'.
# Column 1 is left unmapped.
columns = {
    0: 'text',
    2: 'pos',
    3: 'ner',
}

# Directory that holds the train / dev / test split files.
data_folder = 'ftb/'

# Build the corpus from the three column-formatted split files.
corpus: Corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file='ftb6_train.conll',
    dev_file='ftb6_dev.conll',
    test_file='ftb6_test.conll',
)

2021-12-01 23:11:34,522 Reading data from ftb
2021-12-01 23:11:34,523 Train: ftb/ftb6_train.conll
2021-12-01 23:11:34,523 Dev: ftb/ftb6_dev.conll
2021-12-01 23:11:34,523 Test: ftb/ftb6_test.conll


In [2]:
# Sanity check: number of items in the training split of the loaded corpus.
len(corpus.train)

9881

In [3]:
# Print the first training item rendered by to_tagged_string for the 'ner'
# label type, as a quick visual check that the data was read as expected.
print(corpus.train[0].to_tagged_string('ner'))

Certes , rien ne dit qu' une seconde motion de censure sur son projet de loi , reprenant l' accord du 10 avril , n' aurait pas été la bonne mais cette probabilité , reconnaissent les socialistes , n' était pas la plus plausible .


In [None]:
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Fine-tune a 'camembert-base' sequence tagger on the corpus built in the
# first cell. Loading that corpus is "step 1", which is why the numbering
# below starts at 2.

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. initialize fine-tuneable transformer embeddings WITH document context
# NOTE(review): the corpus is built without a document separator token, so
# use_context=True presumably draws context from neighbouring sentences
# regardless of document boundaries -- confirm this is intended.
embeddings = TransformerWordEmbeddings(model='camembert-base',
                                       layers="-1",
                                       subtoken_pooling="mean",
                                       fine_tune=True,
                                       use_context=True,
                                       )

# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection)
# tag_type reuses label_type so the tagger and the label dictionary built in
# step 3 always refer to the same annotation layer.
# NOTE(review): hidden_size is presumably unused when use_rnn=False -- confirm.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=False,
                        use_rnn=False,
                        reproject_embeddings=False,
                        )

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# 7. run fine-tuning
# The first argument is the output location for this run (presumably where
# checkpoints and training logs are written -- verify against the trainer docs).
trainer.fine_tune('resources/taggers/ner-transformer-ftb',
                  learning_rate=5.0e-6,
                  mini_batch_size=4,
                  )

2021-12-01 23:14:05,277 Computing label dictionary. Progress:


100%|██████████| 9881/9881 [00:00<00:00, 17116.41it/s]

2021-12-01 23:14:05,857 Corpus contains the labels: pos (#278083), ner (#278083)
2021-12-01 23:14:05,857 Created (for label 'ner') Dictionary with 16 tags: <unk>, O, B-Organization, I-Organization, B-Person, I-Person, B-Location, B-Company, I-Company, B-FictionCharacter, B-Product, I-Location, B-POI, I-POI, I-Product, I-FictionCharacter





Dictionary with 16 tags: <unk>, O, B-Organization, I-Organization, B-Person, I-Person, B-Location, B-Company, I-Company, B-FictionCharacter, B-Product, I-Location, B-POI, I-POI, I-Product, I-FictionCharacter
2021-12-01 23:14:10,427 ----------------------------------------------------------------------------------------------------
2021-12-01 23:14:10,429 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): CamembertModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(32005, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (