<a href="https://colab.research.google.com/github/pascalhuszar/flair_notebook/blob/master/Flair_fintuned_Conll_03_different_embedd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Walkthrough: Flair Framework using Conll_03 and GermEval_14 as datasets + different embeddings

## Embeddings:


*   FastText
*   BytePair
*   Flair
*   PooledFlair
*   BERT base, Multilingual, German (deepset.ai), GERMAN (dbmdz)
*   One Hot
*   XLM German+English (MLM/CLM)
*   DistilBERT German, Multilingual






## 1. Install and Imports


In [None]:
# Only needed when the datasets live on Google Drive: mounts the Drive
# at /content/drive so the data_folder paths used below can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# install flair
!pip install flair

In [None]:
# Sanity check: print the metadata (including the version) of the installed flair package.
!pip show flair

In [None]:
# imports
from flair.data import Corpus
from flair.datasets import CONLL_03_GERMAN, ColumnCorpus 
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, OneHotEmbeddings, BertEmbeddings, XLMEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, CharacterEmbeddings
from flair.visual.training_curves import Plotter
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
from typing import List

## 2. Set the corpus


### GermEval_14
#### Formatted in BIOES-Format



In [None]:
# Folder on the mounted Drive that holds the BIOES-formatted files, and the
# column layout of those files (column 0 = token text, column 1 = NER tag).
# NOTE(review): the folder name says conll_03_german although this section is
# titled GermEval_14 — confirm the path points at the intended dataset.
data_folder = '/content/drive/My Drive/resources/tasks/conll_03_german/BIOES'
columns = {0: 'text', 1: 'ner'}

# Read the train/dev/test splits into a single flair Corpus.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='trainBIOES.train',
    dev_file='devBIOES.dev',
    test_file='testBIOES.test',
)

### Conll_03
#### Remove the `tag_to_bioes` argument if the original BIO/IOB format should be kept


In [None]:
# Folder with the BIO + POS files and their column layout
# (column 0 = token text, column 2 = POS tag, column 4 = NER tag).
data_folder = '/content/drive/My Drive/resources/tasks/conll_03_german/BIO + POS'
columns = {0: 'text', 2: 'pos', 4: 'ner'}

# tag_to_bioes='ner' asks ColumnCorpus to convert the NER column to BIOES on load.
# NOTE(review): in the official CoNLL-03 distribution `deu.testa` is the development
# split — confirm that the file names on Drive match the intended dev/test roles.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='deu.train',
    dev_file='deu.dev',
    test_file='deu.testa',
    tag_to_bioes='ner',
)

## 3. Define Task

In [None]:
# The tag layer to predict, and the dictionary of all tags of that layer
# collected from the corpus selected above.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Show the collected tag inventory.
print(tag_dictionary)
# Optional sanity check of a single training sentence:
# print(corpus.train[2].to_tagged_string('ner'))

## 4. Set Embeddings

### ~ Word (fasttext) + BytePairEmbeddings Embeddings
#### BytePair for oov-functionality 


In [None]:
# Stack: German word embeddings ('de') + German byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de'), BytePairEmbeddings('de')]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext over crawl) + BytePairEmbeddings Embeddings




In [None]:
# Stack: German crawl word embeddings ('de-crawl') + German byte-pair embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('de-crawl'), BytePairEmbeddings('de')]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + Flair Embeddings

In [None]:
# Stack: German word embeddings + forward and backward German Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    FlairEmbeddings('de-forward'),
    FlairEmbeddings('de-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + Flair (de-history) Embeddings 
#### Hamburger Anzeiger

In [None]:
# Stack: German word embeddings + the 'de-historic-ha' Flair models
# (forward and backward; the Hamburger Anzeiger variant named in the heading).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    FlairEmbeddings('de-historic-ha-forward'),
    FlairEmbeddings('de-historic-ha-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + Flair (de-history) Embeddings 
#### Wiener Zeitung

In [None]:
# Stack: German word embeddings + the 'de-historic-wz' Flair models
# (forward and backward; the Wiener Zeitung variant named in the heading).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    FlairEmbeddings('de-historic-wz-forward'),
    FlairEmbeddings('de-historic-wz-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

2020-04-21 08:54:39,136 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/de-wiki-fasttext-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpq7wvpjql


100%|██████████| 1199998928/1199998928 [00:16<00:00, 71006749.88B/s]

2020-04-21 08:54:56,296 copying /tmp/tmpq7wvpjql to cache at /root/.flair/embeddings/de-wiki-fasttext-300d-1M.vectors.npy





2020-04-21 08:55:07,906 removing temp file /tmp/tmpq7wvpjql
2020-04-21 08:55:08,091 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/de-wiki-fasttext-300d-1M not found in cache, downloading to /tmp/tmpehremsz6


100%|██████████| 42184395/42184395 [00:04<00:00, 9377676.64B/s] 

2020-04-21 08:55:12,738 copying /tmp/tmpehremsz6 to cache at /root/.flair/embeddings/de-wiki-fasttext-300d-1M
2020-04-21 08:55:12,823 removing temp file /tmp/tmpehremsz6



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2020-04-21 08:55:19,803 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-stefan-it/lm-historic-wiener-zeitung-forward-v0.1.pt not found in cache, downloading to /tmp/tmpn1othbny


100%|██████████| 71605380/71605380 [00:01<00:00, 39296596.61B/s]

2020-04-21 08:55:21,788 copying /tmp/tmpn1othbny to cache at /root/.flair/embeddings/lm-historic-wiener-zeitung-forward-v0.1.pt
2020-04-21 08:55:21,851 removing temp file /tmp/tmpn1othbny





2020-04-21 08:55:40,726 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-stefan-it/lm-historic-wiener-zeitung-backward-v0.1.pt not found in cache, downloading to /tmp/tmpw5_syvgz


100%|██████████| 71605376/71605376 [00:01<00:00, 40011142.01B/s]

2020-04-21 08:55:42,693 copying /tmp/tmpw5_syvgz to cache at /root/.flair/embeddings/lm-historic-wiener-zeitung-backward-v0.1.pt





2020-04-21 08:55:42,899 removing temp file /tmp/tmpw5_syvgz


### ~ Word (fasttext) + Pooled Flair Embeddings
#### Best known configuration

In [None]:
# Stack: German word embeddings + pooled Flair embeddings in both directions
# (the setup this notebook labels as the best known configuration).
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    PooledFlairEmbeddings('german-forward'),
    PooledFlairEmbeddings('german-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2020-04-21 16:04:26,417 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt not found in cache, downloading to /tmp/tmp7tsfxq1d


100%|██████████| 72818995/72818995 [00:01<00:00, 53299890.48B/s]

2020-04-21 16:04:27,957 copying /tmp/tmp7tsfxq1d to cache at /root/.flair/embeddings/lm-mix-german-forward-v0.2rc.pt





2020-04-21 16:04:28,024 removing temp file /tmp/tmp7tsfxq1d
2020-04-21 16:04:46,213 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt not found in cache, downloading to /tmp/tmphbu4ir5t


100%|██████████| 72818995/72818995 [00:01<00:00, 65624749.98B/s]

2020-04-21 16:04:47,501 copying /tmp/tmphbu4ir5t to cache at /root/.flair/embeddings/lm-mix-german-backward-v0.2rc.pt





2020-04-21 16:04:47,604 removing temp file /tmp/tmphbu4ir5t


### Word (fasttext) + CharacterEmbeddings + Flair Embeddings



In [None]:
# Stack: German word embeddings + character embeddings (default settings)
# + forward and backward German Flair embeddings.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharacterEmbeddings(),
    FlairEmbeddings('de-forward'),
    FlairEmbeddings('de-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Word (fasttext) + CharacterEmbeddings + Pooled Flair Embeddings 




In [None]:
# Stack: German word embeddings + character embeddings (default settings)
# + pooled Flair embeddings in both directions.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    CharacterEmbeddings(),
    PooledFlairEmbeddings('german-forward'),
    PooledFlairEmbeddings('german-backward'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### PooledFlair Embeddings + dbmdz-BERT-german-cased Embeddings


In [None]:
# Stack: pooled Flair embeddings in both directions + the cased German dbmdz BERT model
# (no classic word embeddings in this variant).
embedding_types: List[TokenEmbeddings] = [
    PooledFlairEmbeddings('german-forward'),
    PooledFlairEmbeddings('german-backward'),
    BertEmbeddings('bert-base-german-dbmdz-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + dbmdz-BERT-german-cased Embeddings


In [None]:
# Stack: German word embeddings + the cased German dbmdz BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-german-dbmdz-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + dbmdz-BERT-german-uncased Embeddings


In [None]:
# Stack: German word embeddings + the uncased German dbmdz BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-german-dbmdz-uncased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + BERT-base-cased Embeddings

In [None]:
# Stack: German word embeddings + the 'bert-base-cased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-base-uncased Embeddings

In [None]:
# Stack: German word embeddings + the 'bert-base-uncased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-uncased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + BERT-multilingual-cased Embeddings

In [None]:
# Stack: German word embeddings + the cased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-multilingual-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + BERT-multilingual-uncased Embeddings

In [None]:
# Stack: German word embeddings + the uncased multilingual BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-multilingual-uncased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + BERT-base-german-cased


In [None]:
# Stack: German word embeddings + the 'bert-base-german-cased' BERT model.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('bert-base-german-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### ~ Word (fasttext) + distilbert-base-german-cased

In [None]:
# Stack: German word embeddings + the German DistilBERT model,
# loaded through the BertEmbeddings class.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('distilbert-base-german-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + distilbert-base-multilingual-cased

In [None]:
# Stack: German word embeddings + the multilingual DistilBERT model,
# loaded through the BertEmbeddings class.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    BertEmbeddings('distilbert-base-multilingual-cased'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### One Hot Embedding

In [None]:
# One-hot embeddings are built from a field of an already loaded flair Corpus,
# so this cell reuses the `corpus` object created in section 2. Rebinding
# `corpus` to a placeholder string here would make OneHotEmbeddings fail and
# would also break the ModelTrainer(tagger, corpus) cell further down.
#
# NOTE(review): field='ner' feeds the NER tags themselves in as an input feature
# while the tagger is trained to predict tag_type 'ner' — this looks like label
# leakage. Consider another field (e.g. 'pos' when the corpus has a POS column);
# confirm the intent before relying on results from this configuration.
embeddings = OneHotEmbeddings(corpus=corpus, field='ner')

### Word (fasttext) + XLM (german + english)
#### XLM English-German model trained on the concatenation of English and German wikipedia < 6-layer, 1024-hidden, 8-heads >


In [None]:
# Stack: German word embeddings + the English-German XLM model 'xlm-mlm-ende-1024'.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('de'),
    XLMEmbeddings('xlm-mlm-ende-1024'),
]

# Combine the individual embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

### Word (fasttext) + XLM (german + english)
#### XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia < 6-layer, 1024-hidden, 8-heads >


In [None]:
# initialize embeddings 
embedding_types: List[TokenEmbeddings] = [
                                         WordEmbeddings('de'),

                                         XLMEmbeddings('xlm-clm-ende-1024'),
                                                                                            
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

## 5. Set the remaining params and training

In [None]:
# Build the sequence tagger on top of whichever `embeddings` object was created
# last in section 4; it predicts `tag_type` labels drawn from `tag_dictionary`,
# with the CRF option switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=tag_type,
    tag_dictionary=tag_dictionary,
    use_crf=True,
)

In [None]:
# Create the trainer that fits `tagger` on the splits of the selected `corpus`.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)


In [None]:
# Launch training; results are written below 'resources/taggers/example-ner'.
# checkpoint=True asks the trainer to save checkpoints (used by the restart cell),
# and embeddings_storage_mode='gpu' selects GPU storage for computed embeddings.
trainer.train(
    'resources/taggers/example-ner',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
    embeddings_storage_mode='gpu',
    checkpoint=True,
)

In [None]:
# Restart training at any time from a previously saved checkpoint.
from pathlib import Path

# Checkpoint file to resume from (here: a copy stored on the mounted Drive).
checkpoint = '/content/drive/My Drive/resources/checkpoint.pt'
trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

# train_with_dev must be a real boolean: a non-empty string such as 'True'
# (or 'False') is always truthy, so the string form hides the actual setting.
# max_epochs must be a concrete integer (the former `?` placeholder was a
# syntax error); 150 mirrors the initial training cell — adjust as needed.
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              train_with_dev=True,
              max_epochs=150,
              checkpoint=True,
              embeddings_storage_mode='gpu')

In [None]:
# Plot the weight traces from the weights.txt file in the training output
# folder used by the training cells ('resources/taggers/example-ner').
plotter = Plotter()
plotter.plot_weights('resources/taggers/example-ner/weights.txt')