# Walkthrough of the Flair Framework using Conll_03 as dataset + different embeddings

## Embeddings:


*   FastText
*   BytePair
*   Flair
*   PooledFlair
*   BERT base, Multilingual, German (deepset.ai), German (dbmdz)
*   One Hot
*   XLM German+English (CLM/-)
*   DistilBERT German, Multilingual






## 1. Install and Imports


In [0]:
# install flair
pip install flair

In [0]:
# Check that flair is installed: `pip show` prints the package metadata
# (name, version, location) when the package is present.
!pip show flair

In [0]:
# imports
from typing import List

from flair.data import Corpus
from flair.datasets import CONLL_03_GERMAN, ColumnCorpus, GERMEVAL
from flair.embeddings import (
    BertEmbeddings,
    BytePairEmbeddings,
    FlairEmbeddings,
    OneHotEmbeddings,
    PooledFlairEmbeddings,
    StackedEmbeddings,
    TokenEmbeddings,
    WordEmbeddings,
    XLMEmbeddings,
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.visual.training_curves import Plotter

## 2. Set the corpus


### Conll_03
#### Formatted in BIOES-Format



In [0]:
# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: 'text', 1: 'ner'}
data_folder = 'resources/tasks/conll_03_german'

# In CoNLL-03, `testa` is the development split and `testb` is the held-out
# test split. They were previously passed the other way round, which means the
# official test set would be used for model selection (and for training when
# train_with_dev is enabled).
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='deuBIOES.train',
                              dev_file='deuBIOES.testa',
                              test_file='deuBIOES.testb')

### GermEval_14
#### Formatted in BIOES-Format



In [0]:
# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: "text", 1: "ner"}
data_folder = "resources/tasks/germeval"

# Load the train / dev / test files of the BIOES-formatted GermEval data.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file="trainBIOES.train",
    dev_file="devBIOES.dev",
    test_file="testBIOES.test",
)

### Own Corpus


In [0]:
# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: 'text', 1: 'ner'}
# Corpus files stored on the mounted Google Drive.
data_folder = '/content/drive/My Drive/resources/tasks/conll_03_german'

# In CoNLL-03, `testa` is the development split and `testb` is the held-out
# test split. They were previously passed the other way round, which means the
# official test set would be used for model selection (and for training when
# train_with_dev is enabled).
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='deuBIOES.train',
                              dev_file='deuBIOES.testa',
                              test_file='deuBIOES.testb')

### Define Task

In [0]:
# The label type the tagger will predict.
tag_type = "ner"

# Ask the corpus to build the tag dictionary for that label type, then display it.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

## 3. Set Embeddings

### Word (fasttext) + BytePair Embeddings
#### BytePair for out-of-vocabulary (OOV) handling


In [0]:
# German word embeddings ('de') combined with German byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BytePairEmbeddings("de"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext over crawl) + BytePair Embeddings



In [0]:
# German crawl word embeddings ('de-crawl') combined with German byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de-crawl"),
    BytePairEmbeddings("de"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair Embeddings

In [0]:
# German word embeddings plus the forward and backward German Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    FlairEmbeddings("de-forward"),
    FlairEmbeddings("de-backward"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair (de-history) Embeddings 
#### Hamburger Anzeiger

In [0]:
# German word embeddings plus the forward and backward historic-German Flair
# embeddings ('de-historic-ha-*', Hamburger Anzeiger).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    FlairEmbeddings("de-historic-ha-forward"),
    FlairEmbeddings("de-historic-ha-backward"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair (de-history) Embeddings 
#### Wiener Zeitung

In [0]:
# German word embeddings plus the forward and backward historic-German Flair
# embeddings ('de-historic-wz-*', Wiener Zeitung).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    FlairEmbeddings("de-historic-wz-forward"),
    FlairEmbeddings("de-historic-wz-backward"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + PooledFlair Embeddings
#### Best known configuration

In [0]:
# German word embeddings plus the forward and backward pooled Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    PooledFlairEmbeddings("german-forward"),
    PooledFlairEmbeddings("german-backward"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### PooledFlair Embeddings + dbmdz-BERT-german-cased Embeddings


In [0]:
# Forward and backward pooled Flair embeddings plus the cased German dbmdz BERT model.
embedding_types: List[TokenEmbeddings] = [
    PooledFlairEmbeddings("german-forward"),
    PooledFlairEmbeddings("german-backward"),
    BertEmbeddings("bert-base-german-dbmdz-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + dbmdz-BERT-german-cased Embeddings


In [0]:
# German word embeddings plus the cased German dbmdz BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-german-dbmdz-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + dbmdz-BERT-german-uncased Embeddings


In [0]:
# German word embeddings plus the uncased German dbmdz BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-german-dbmdz-uncased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-cased Embeddings

In [0]:
# German word embeddings plus the 'bert-base-cased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-uncased Embeddings

In [0]:
# German word embeddings plus the 'bert-base-uncased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-uncased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-multilingual-cased Embeddings

In [0]:
# German word embeddings plus the cased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-multilingual-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-multilingual-uncased Embeddings

In [0]:
# German word embeddings plus the uncased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-multilingual-uncased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-german-cased


In [0]:
# German word embeddings plus the 'bert-base-german-cased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("bert-base-german-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + distilbert-base-german-cased

In [0]:
# German word embeddings plus the 'distilbert-base-german-cased' model,
# loaded through BertEmbeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("distilbert-base-german-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + distilbert-base-multilingual-cased

In [0]:
# German word embeddings plus the 'distilbert-base-multilingual-cased' model,
# loaded through BertEmbeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    BertEmbeddings("distilbert-base-multilingual-cased"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### One Hot Embedding

In [0]:
# Reuse the `corpus` object loaded in section 2. It was previously overwritten
# here with the placeholder string "Own Corpus", which is not a Corpus and also
# broke the later ModelTrainer(tagger, corpus) cell.

# embed NER tags
# NOTE(review): field='ner' one-hot encodes the gold NER tags, i.e. the labels
# the tagger is trained to predict - confirm this is intended; the token text
# ('text') may be the field actually wanted here.
embeddings = OneHotEmbeddings(corpus=corpus, field='ner')

### Word (fasttext) + XLM (german + english)
#### XLM English-German model trained on the concatenation of English and German wikipedia < 6-layer, 1024-hidden, 8-heads >


In [0]:
# German word embeddings plus the English-German XLM model 'xlm-mlm-ende-1024'.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    XLMEmbeddings("xlm-mlm-ende-1024"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + XLM (german + english)
#### XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia < 6-layer, 1024-hidden, 8-heads >


In [0]:
# German word embeddings plus the English-German XLM model trained with
# causal language modeling, 'xlm-clm-ende-1024'.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings("de"),
    XLMEmbeddings("xlm-clm-ende-1024"),
]
# Wrap the list in a single StackedEmbeddings object used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

## 4. Set the remaining params and training

In [0]:
# Build the sequence tagger from the currently selected `embeddings`, the tag
# dictionary and the tag type; a CRF layer is enabled via use_crf.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=tag_type,
    tag_dictionary=tag_dictionary,
    use_crf=True,
)

In [0]:
# Create the trainer for `tagger` on the loaded `corpus`.
trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)


In [0]:
# Start training; results are written below 'resources/taggers/example-ner'.
# train_with_dev is now a real boolean: the string 'True' only had the intended
# effect because any non-empty string is truthy (so 'False' would have enabled
# it as well).
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=40,
              train_with_dev=True,
              checkpoint=True,
              embeddings_storage_mode='gpu')

2020-04-17 11:12:33,145 ----------------------------------------------------------------------------------------------------
2020-04-17 11:12:33,150 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('de')
    (list_embedding_1): BertEmbeddings(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(31102, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0): BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)

In [0]:
# Plot the weight traces from the weights file in the training output folder.
plotter = Plotter()
plotter.plot_weights("resources/taggers/example-ner/weights.txt")