In [34]:
import os
import sys


from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

from flair.datasets import CONLL_03
from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer


@dataclass
class RunArguments:
    """
    Settings for one training run in this notebook.

    All three fields are required (none has a default).

    Attributes:
        dataset_name: Name of the corpus class that is looked up in the
            `flair.datasets` package and instantiated to obtain the corpus.
        model_name_or_path: Transformer model identifier or local checkpoint
            directory handed to the Flair transformer embeddings.
        output_path: Directory handed to the trainer as the location for the
            training artifacts of this run.
    """
    dataset_name: str = field(
        metadata={"help": "Name of the corpus class in `flair.datasets` to load (e.g. 'NER_GERMAN_GERMEVAL')."}
    )
    model_name_or_path: str = field(
        metadata={"help": "Model identifier or local path of the transformer checkpoint to embed with."}
    )

    output_path: str = field(
        metadata={"help": "Directory the trainer writes the artifacts of this run to."}
    )
    



# Classification

In [None]:
# Run configuration for the GermEval 2018 offensive-language classification task.
run_args = RunArguments(
    # Name of the corpus class that is looked up in `flair.datasets`.
    dataset_name='GERMEVAL_2018_OFFENSIVE_LANGUAGE',
    # Local checkpoint directory of the transformer to embed with.
    # Alternative checkpoint (plain GPT-2):
    #model_name_or_path='/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2',
    model_name_or_path='/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2-wechsel-german',
    # Task-specific output directory, so the artifacts of this run are not
    # overwritten by other training runs in this notebook.
    output_path='./data/gpt2_wechsel_german_germeval_2018_offensive',
)


In [35]:
# Resolve the corpus class by name from the `flair.datasets` package.
package = "flair.datasets"
name = run_args.dataset_name

# A non-empty `fromlist` makes `__import__` return the `flair.datasets`
# module itself instead of the top-level `flair` package.
datasets_module = __import__(package, fromlist=[name])
ds_class = getattr(datasets_module, name)

# 1. get the corpus (instantiated with the class defaults)
corpus = ds_class()
print(corpus)


2022-11-27 20:20:11,146 Reading data from /netscratch/mostendorff/datasets/flair_cache/datasets/germeval_2018_offensive_language/coarse_grained
2022-11-27 20:20:11,147 Train: /netscratch/mostendorff/datasets/flair_cache/datasets/germeval_2018_offensive_language/coarse_grained/train.txt
2022-11-27 20:20:11,148 Dev: None
2022-11-27 20:20:11,148 Test: /netscratch/mostendorff/datasets/flair_cache/datasets/germeval_2018_offensive_language/coarse_grained/test.txt
2022-11-27 20:20:14,387 Initialized corpus /netscratch/mostendorff/datasets/flair_cache/datasets/germeval_2018_offensive_language/coarse_grained (label type name is 'class')
Corpus: 4508 train + 501 dev + 3532 test sentences


In [37]:
# 2. label type to predict ('class' is the label type name the corpus
#    reported when it was initialized)
label_type = "class"

# 3. build the label dictionary from the corpus for that label type
label_dict = corpus.make_label_dictionary(label_type=label_type)

2022-11-27 20:20:24,421 Computing label dictionary. Progress:


4508it [00:00, 69208.83it/s]

2022-11-27 20:20:24,494 Dictionary created for label 'class' with 3 values: OTHER (seen 2991 times), OFFENSE (seen 1517 times)





In [12]:
# 4. initialize transformer document embeddings (many models are available)
#document_embeddings = TransformerDocumentEmbeddings(fine_tune=False)

2022-11-27 20:01:11,672 Could not determine the begin offset of the tokenizer for transformer model transformer-/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2, assuming 0


In [42]:
from flair.embeddings.base import TransformerEmbedding

# Settings for the document-level embedding: transformer weights are not
# fine-tuned (fine_tune=False) and 'mean' is used as the pooling mode.
document_embedding_kwargs = dict(
    model=run_args.model_name_or_path,
    is_document_embedding=True,
    fine_tune=False,
    cls_pooling='mean',
)
document_embeddings = TransformerEmbedding(**document_embedding_kwargs)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token - confirm.
doc_tokenizer = document_embeddings.tokenizer
doc_tokenizer.pad_token = doc_tokenizer.eos_token

2022-11-27 20:24:34,581 Could not determine the begin offset of the tokenizer for transformer model transformer-/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2-wechsel-german, assuming 0


In [None]:
# 5. create the text classifier on top of the document embeddings
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
    label_type=label_type,
)

# 6. initialize trainer
trainer = ModelTrainer(classifier, corpus)

# 7. run training with the trainer's fine-tuning routine (a single epoch)
# NOTE(review): the document embeddings are created with fine_tune=False -
# confirm that keeping the transformer weights fixed here is intended.
classification_train_kwargs = dict(
    learning_rate=5.0e-5,
    mini_batch_size=16,
    max_epochs=1,
)
fine_tune_res = trainer.fine_tune(run_args.output_path, **classification_train_kwargs)

# Show the returned result as the cell output.
fine_tune_res

# NER

In [46]:
# Run configuration for the GermEval NER task.
run_args = RunArguments(
    # Name of the corpus class that is looked up in `flair.datasets`.
    dataset_name='NER_GERMAN_GERMEVAL',
    # Local checkpoint directory of the transformer to embed with.
    model_name_or_path='/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2-wechsel-german',
    # Task-specific output directory, so the artifacts of this run are not
    # overwritten by other training runs in this notebook.
    output_path='./data/gpt2_wechsel_german_ner_germeval',
)




In [48]:
# Resolve the corpus class by name from the `flair.datasets` package.
package = "flair.datasets"
name = run_args.dataset_name

# A non-empty `fromlist` makes `__import__` return the `flair.datasets`
# module itself instead of the top-level `flair` package.
datasets_module = __import__(package, fromlist=[name])
ds_class = getattr(datasets_module, name)

# 1. get the corpus (instantiated with the class defaults)
corpus = ds_class()
print(corpus)


Downloading...
From: https://drive.google.com/uc?id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P
To: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/train.tsv
100%|████████████████████████████████████████████████████████████| 7.88M/7.88M [00:00<00:00, 76.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH
To: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/test.tsv
100%|████████████████████████████████████████████████████████████| 1.68M/1.68M [00:00<00:00, 35.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm
To: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/dev.tsv
100%|██████████████████████████████████████████████████████████████| 724k/724k [00:00<00:00, 20.6MB/s]

2022-11-27 20:32:00,639 Reading data from /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval
2022-11-27 20:32:00,640 Train: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/train.tsv
2022-11-27 20:32:00,640 Dev: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/dev.tsv
2022-11-27 20:32:00,642 Test: /netscratch/mostendorff/datasets/flair_cache/datasets/ner_german_germeval/test.tsv





Corpus: 24000 train + 2200 dev + 5100 test sentences


In [50]:
# 2. label type to predict
label_type = "ner"

# 3. build the label dictionary from the corpus and print it for inspection
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)


2022-11-27 20:32:30,123 Computing label dictionary. Progress:


24000it [00:00, 71766.46it/s]

2022-11-27 20:32:30,467 Dictionary created for label 'ner' with 13 values: LOC (seen 8281 times), PER (seen 7679 times), ORG (seen 5255 times), OTH (seen 3024 times), LOCderiv (seen 2808 times), ORGpart (seen 805 times), LOCpart (seen 513 times), OTHderiv (seen 236 times), OTHpart (seen 190 times), PERpart (seen 184 times), PERderiv (seen 62 times), ORGderiv (seen 41 times)
Dictionary with 13 tags: <unk>, LOC, PER, ORG, OTH, LOCderiv, ORGpart, LOCpart, OTHderiv, OTHpart, PERpart, PERderiv, ORGderiv





In [51]:
# 4. initialize fine-tuneable transformer embeddings WITH document context:
#    last layer only, "first_last" subtoken pooling, context enabled
word_embedding_kwargs = dict(
    model=run_args.model_name_or_path,
    layers="-1",
    subtoken_pooling="first_last",
    fine_tune=True,
    use_context=True,
)
embeddings = TransformerWordEmbeddings(**word_embedding_kwargs)

2022-11-27 20:33:21,886 Could not determine the begin offset of the tokenizer for transformer model transformer-/netscratch/mostendorff/datasets/huggingface_transformers/pytorch/gpt2-wechsel-german, assuming 0


In [53]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token - confirm.
ner_tokenizer = embeddings.tokenizer
ner_tokenizer.pad_token = ner_tokenizer.eos_token

In [54]:

# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection)
tagger_kwargs = dict(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type='ner',
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)
tagger = SequenceTagger(**tagger_kwargs)

2022-11-27 20:34:09,462 SequenceTagger predicts: Dictionary with 49 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG, S-OTH, B-OTH, E-OTH, I-OTH, S-LOCderiv, B-LOCderiv, E-LOCderiv, I-LOCderiv, S-ORGpart, B-ORGpart, E-ORGpart, I-ORGpart, S-LOCpart, B-LOCpart, E-LOCpart, I-LOCpart, S-OTHderiv, B-OTHderiv, E-OTHderiv, I-OTHderiv, S-OTHpart, B-OTHpart, E-OTHpart, I-OTHpart, S-PERpart, B-PERpart, E-PERpart, I-PERpart, S-PERderiv, B-PERderiv, E-PERderiv, I-PERderiv, S-ORGderiv, B-ORGderiv, E-ORGderiv, I-ORGderiv


In [55]:

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# 7. run fine-tuning
ner_train_kwargs = dict(
    learning_rate=5.0e-6,
    mini_batch_size=4,
    #mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
)
fine_tune_res = trainer.fine_tune(run_args.output_path, **ner_train_kwargs)

# Show the returned result as the cell output.
fine_tune_res

2022-11-27 20:35:23,433 ----------------------------------------------------------------------------------------------------
2022-11-27 20:35:23,435 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0): GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): GPT2Block(
        

2022-11-27 20:35:23,440  - patience: "3"
2022-11-27 20:35:23,441  - anneal_factor: "0.5"
2022-11-27 20:35:23,441  - max_epochs: "10"
2022-11-27 20:35:23,442  - shuffle: "True"
2022-11-27 20:35:23,443  - train_with_dev: "False"
2022-11-27 20:35:23,443  - batch_growth_annealing: "False"
2022-11-27 20:35:23,444 ----------------------------------------------------------------------------------------------------
2022-11-27 20:35:23,445 Model training base path: "data/gpt2_conll_03"
2022-11-27 20:35:23,445 ----------------------------------------------------------------------------------------------------
2022-11-27 20:35:23,446 Device: cuda:0
2022-11-27 20:35:23,447 ----------------------------------------------------------------------------------------------------
2022-11-27 20:35:23,447 Embeddings storage mode: none
2022-11-27 20:35:23,448 ----------------------------------------------------------------------------------------------------
2022-11-27 20:36:37,592 --------------------------

100%|█████████████████████████████████████████████████████████████| 1275/1275 [02:24<00:00,  8.80it/s]


2022-11-27 20:39:03,922 Evaluating as a multi-label problem: False
2022-11-27 20:39:04,305 0.0041	0.0474	0.0076	0.0039
2022-11-27 20:39:04,306 
Results:
- F-score (micro) 0.0076
- F-score (macro) 0.0041
- Accuracy 0.0039

By class:
              precision    recall  f1-score   support

         ORG     0.0061    0.2052    0.0119      1150
     ORGpart     0.0010    0.1279    0.0019       172
    LOCderiv     0.0050    0.0517    0.0091       561
         PER     0.0000    0.0000    0.0000      1639
         LOC     0.0000    0.0000    0.0000      1706
         OTH     0.0000    0.0000    0.0000       697
     LOCpart     0.0171    0.0550    0.0261       109
     PERpart     0.0000    0.0000    0.0000        44
     OTHpart     0.0000    0.0000    0.0000        42
    OTHderiv     0.0000    0.0000    0.0000        39
    PERderiv     0.0000    0.0000    0.0000        11
    ORGderiv     0.0000    0.0000    0.0000         8

   micro avg     0.0041    0.0474    0.0076      6178
   macro a

{'test_score': 0.007561192758803112,
 'dev_score_history': [],
 'train_loss_history': [],
 'dev_loss_history': []}