<a href="https://colab.research.google.com/github/pascalhuszar/ner-benchmark/blob/master/Flair%20trained%20on%20multiple%20embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Walkthrough of the Flair Framework using Conll_03 and GermEval_14 as datasets + different embeddings

## Embeddings:


*   FastText
*   BytePair
*   Flair
*   PooledFlair
*   BERT base, Multilingual, German (deepset.ai), German (dbmdz)
*   DistilBERT German, Multilingual






## 1. Install and Imports


In [None]:
# Mount Google Drive so that data stored there is reachable from the Colab
# runtime under /content/drive (only needed if the data is on Drive).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the flair framework into the current runtime.
# NOTE(review): the version is not pinned; calls used further down in this
# notebook (e.g. make_tag_dictionary, ModelTrainer.load_checkpoint) may differ
# between flair releases -- consider pinning the version this was tested with.
!pip install flair

In [None]:
# Print the installed flair package metadata to confirm that the installation
# succeeded.
!pip show flair

In [None]:
# imports
from flair.data import Corpus
from flair.datasets import CONLL_03_GERMAN, ColumnCorpus 
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, OneHotEmbeddings, TransformerEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, CharacterEmbeddings
from flair.visual.training_curves import Plotter
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from typing import List

## 2. Set the corpus


### GermEval_14
#### Formatted in BIOES-Format



In [None]:
# Column layout of the data files: token in column 0, NER tag in column 1.
columns = {
    0: 'text',
    1: 'ner',
}
# Adjust this path to wherever the BIOES-formatted GermEval files are stored.
data_folder = '/content/drive/My Drive/resources/tasks/germeval/BIOES'

# Load the train/dev/test splits into a single corpus object.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train_bioes.train',
    dev_file='dev_bioes.dev',
    test_file='test_bioes.test',
)

### Conll_03
#### Remove the `tag_to_bioes` argument if the BIO/IOB format is desired


In [None]:
# Column layout of the data files: token in column 0, POS tag in column 2,
# NER tag in column 4 (the remaining columns are not read).
columns = {
    0: 'text',
    2: 'pos',
    4: 'ner',
}
# Adjust this path to wherever the CoNLL-03 files are stored.
data_folder = '/content/drive/My Drive/resources/tasks/conll_03_german/BIO + POS'

# Load the splits; tag_to_bioes='ner' requests conversion of the NER tags to
# BIOES -- drop that argument to keep the BIO/IOB scheme of the files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.train',
    dev_file='dev.dev',
    test_file='testa.test',
    tag_to_bioes='ner',
)

## 3. Define Task

In [None]:
# The task is NER: collect every tag of that type occurring in the corpus
# into a dictionary and print it for inspection.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)
# To look at one tagged training sentence, uncomment the next line:
# print(corpus.train[2].to_tagged_string('ner'))

## 4. Set Embeddings

### Word (fastText) + BytePair Embeddings
#### BytePair provides coverage for out-of-vocabulary (OOV) words


In [None]:
# Stack 'de' word embeddings with 'de' byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(BytePairEmbeddings('de'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fastText trained on Common Crawl) + BytePair Embeddings




In [None]:
# Stack 'de-crawl' word embeddings with 'de' byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de-crawl')]
embedding_types.append(BytePairEmbeddings('de'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair Embeddings

In [None]:
# Word embeddings first, followed by the forward and backward Flair
# language-model embeddings (same order as listed).
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')] + [
    FlairEmbeddings(model_name)
    for model_name in ('de-forward', 'de-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair (de-historic) Embeddings
#### Hamburger Anzeiger

In [None]:
# Word embeddings plus the historic-German Flair models (Hamburger Anzeiger),
# forward then backward.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.extend(
    FlairEmbeddings(model_name)
    for model_name in ('de-historic-ha-forward', 'de-historic-ha-backward')
)
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Flair (de-historic) Embeddings
#### Wiener Zeitung

In [None]:
# Word embeddings plus the historic-German Flair models (Wiener Zeitung),
# forward then backward.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.extend(
    FlairEmbeddings(model_name)
    for model_name in ('de-historic-wz-forward', 'de-historic-wz-backward')
)
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + Pooled Flair Embeddings
#### Best known configuration

In [None]:
# Word embeddings plus pooled Flair embeddings, forward then backward.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')] + [
    PooledFlairEmbeddings(model_name)
    for model_name in ('german-forward', 'german-backward')
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + CharacterEmbeddings + Flair Embeddings



In [None]:
# Word and character embeddings, followed by the forward and backward Flair
# embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharacterEmbeddings(),
]
embedding_types.extend(
    FlairEmbeddings(model_name)
    for model_name in ('de-forward', 'de-backward')
)
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + CharacterEmbeddings + Pooled Flair Embeddings 




In [None]:
# Word and character embeddings, followed by the forward and backward pooled
# Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharacterEmbeddings(),
]
embedding_types.extend(
    PooledFlairEmbeddings(model_name)
    for model_name in ('german-forward', 'german-backward')
)
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### PooledFlair Embeddings + dbmdz-BERT-german-cased Embeddings


In [None]:
# Pooled Flair embeddings (forward, backward) first, then the dbmdz cased
# German BERT transformer embeddings.
embedding_types: List[TokenEmbeddings] = [
    PooledFlairEmbeddings(model_name)
    for model_name in ('german-forward', 'german-backward')
]
embedding_types.append(TransformerEmbeddings('bert-base-german-dbmdz-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + dbmdz-BERT-german-cased Embeddings


In [None]:
# Word embeddings stacked with the dbmdz cased German BERT model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-german-dbmdz-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + dbmdz-BERT-german-uncased Embeddings


In [None]:
# Word embeddings stacked with the dbmdz uncased German BERT model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-german-dbmdz-uncased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-cased Embeddings

In [None]:
# Word embeddings stacked with the 'bert-base-cased' transformer model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-uncased Embeddings

In [None]:
# Word embeddings stacked with the 'bert-base-uncased' transformer model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-uncased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-multilingual-cased Embeddings

In [None]:
# Word embeddings stacked with the cased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-multilingual-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-multilingual-uncased Embeddings

In [None]:
# Word embeddings stacked with the uncased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-multilingual-uncased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-german-cased


In [None]:
# Word embeddings stacked with the 'bert-base-german-cased' transformer model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('bert-base-german-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + distilbert-base-german-cased

In [None]:
# Word embeddings stacked with the German DistilBERT transformer model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('distilbert-base-german-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + distilbert-base-multilingual-cased

In [None]:
# Word embeddings stacked with the multilingual DistilBERT transformer model.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de')]
embedding_types.append(TransformerEmbeddings('distilbert-base-multilingual-cased'))
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

## 5. Set the remaining params and training

In [None]:
# Build the sequence tagger on top of whichever stacked embeddings were
# selected in section 4, with the CRF layer enabled.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=tag_type,
    tag_dictionary=tag_dictionary,
    use_crf=True,
)

In [None]:
# Bind the tagger to the corpus it will be trained on.
trainer: ModelTrainer = ModelTrainer(model=tagger, corpus=corpus)


In [None]:
# Start training; all output is written below 'resources/taggers/example-ner'.
# Uncomment `checkpoint=True` if training should be pausable/resumable.
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150,
              #checkpoint=True,
              # Must be a real boolean. The former string 'True' only worked
              # because any non-empty string is truthy (so would 'False').
              train_with_dev=True,
              embeddings_storage_mode='gpu')

In [None]:
# Resume an interrupted training run from a previously saved checkpoint.
from pathlib import Path
# Change this path to the location of your checkpoint file.
checkpoint = '/content/drive/My Drive/resources/checkpoint.pt'
trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              # Must be a real boolean; the string 'True' was only truthy by
              # accident (the string 'False' would have been truthy as well).
              train_with_dev=True,
              # Previously the placeholder `?`, which is a syntax error and
              # made the cell unrunnable. Defaults to the epoch budget of the
              # initial run; adjust to the total number of epochs you want.
              max_epochs=150,
              checkpoint=True,
              embeddings_storage_mode='gpu')

In [None]:
# Plot the weight traces recorded in the training output folder.
# NOTE(review): depending on the flair version, weights.txt may only be
# written when training is run with weight logging enabled -- confirm that the
# file exists before running this cell.
plotter = Plotter()
plotter.plot_weights('resources/taggers/example-ner/weights.txt')