# Tutorial 1: NLP Base Types

In [4]:
# Import the Flair packages
from flair.data import Sentence 
from flair.models import SequenceTagger

## Creating a Sentence

In [5]:
# A Sentence object holds the text that we may want to embed or tag.
# `Sentence` is already imported in the first cell of this notebook, so the
# import is not repeated here.

# Make a Sentence object by passing a whitespace-tokenized string
sentence = Sentence('The grass is green .')

# Print the object to see what's in there
print(sentence)

Sentence: "The grass is green ." - 5 Tokens


In [6]:
# Look up a token by its id; ids are 1-based, so id 3 is "is"
print(sentence.get_token(3))
# Index the sentence directly; indices are 0-based, so [3] is "green"
print(sentence[3])

Token: 3 is
Token: 4 green


In [7]:
# Iterating over a Sentence yields its tokens in order; print each one
for token in sentence:
    print(token)

Token: 1 The
Token: 2 grass
Token: 3 is
Token: 4 green
Token: 5 .


## Tokenization

In [8]:
# Make a Sentence object from an untokenized string; with the 'use_tokenizer'
# flag the text is tokenized on construction (the final '.' becomes its own
# token, so the sentence has 5 tokens)
sentence = Sentence('The grass is green.', use_tokenizer=True)

# Print the object to see what's in there
print(sentence)

Sentence: "The grass is green ." - 5 Tokens


## Adding Tags to Tokens

In [9]:
# Add a tag of type 'ner' with value 'color' to the token at index 3 ("green")
sentence[3].add_tag('ner', 'color')

# Print the sentence with its tags shown inline after the tagged tokens
print(sentence.to_tagged_string())

The grass is green <color> .


In [10]:
from flair.data import Label

# Read back the 'ner' tag of the token at index 3 (annotated here as a Label)
tag: Label = sentence[3].get_tag('ner')

# Show the tag's text (.value) and its confidence (.score); the manually added
# tag from the previous cell is reported with a score of 1.0
print(f'"{sentence[3]}" is tagged as "{tag.value}" with confidence score "{tag.score}"')

"Token: 4 green" is tagged as "color" with confidence score "1.0"


## Adding Labels to Sentences

In [11]:
# Start from a sentence that has no labels yet
sentence = Sentence('France is the current world cup winner.')

# Add a single label to the sentence
sentence.add_label('sports')

# A sentence can also belong to multiple classes
sentence.add_labels(['sports', 'world cup'])

# You can also set the labels while initializing the sentence
sentence = Sentence('France is the current world cup winner.', labels=['sports', 'world cup'])

In [12]:
# Create a sentence that carries two labels from the start
sentence = Sentence('France is the current world cup winner.', labels=['sports', 'world cup'])

# Print the sentence, then each of its labels (each is shown with its score)
print(sentence)
for label in sentence.labels:
    print(label)

Sentence: "France is the current world cup winner." - 7 Tokens
sports (1.0)
world cup (1.0)


# Tutorial 2: Tagging your text

In [13]:
# Load the pre-trained 'ner' sequence tagger. `SequenceTagger` is already
# imported in the first cell of this notebook, so the import is not repeated
# here.
tagger = SequenceTagger.load('ner')

In [17]:
# Make a sentence from a whitespace-tokenized string
sentence = Sentence('George Washington went to Washington .')

# Predict NER tags; the predictions are attached to the sentence itself
tagger.predict(sentence)

# Print the sentence with the predicted tags
print(sentence.to_tagged_string())

George <B-PER> Washington <E-PER> went to Washington <S-LOC> .


### Relevant für Ausgabe der erkannten Tokens für den späteren Austausch

In [20]:
# Iterate over the spans found for the 'ner' tag type and print each one
for entity in sentence.get_spans('ner'):
    print(entity)

PER-span [1,2]: "George Washington"
LOC-span [5]: "Washington"


Jede Spanne hat ihren eigenen Text, der sich so ausgeben lässt:

In [21]:
# Export the sentence and its 'ner' entities as a dictionary and print it
print(sentence.to_dict(tag_type='ner'))

{'text': 'George Washington went to Washington .', 'labels': [], 'entities': [{'text': 'George Washington', 'start_pos': 0, 'end_pos': 17, 'type': 'PER', 'confidence': 0.9884235858917236}, {'text': 'Washington', 'start_pos': 26, 'end_pos': 36, 'type': 'LOC', 'confidence': 0.9521980881690979}]}


Dies gibt zusätzlich den Text, die gefundenen Entities, Labels und einen Confidence Score aus, der aussagt, wie sicher sich der Tagger bei diesem Tag ist.

## Deutsche Sätze taggen

In [23]:
# Load the German NER model ('de-ner'); this rebinds `tagger`
tagger = SequenceTagger.load('de-ner')

# Make a German sentence (already whitespace-tokenized)
sentence = Sentence('George Washington ging nach Washington .')

# Predict NER tags; the predictions are attached to the sentence itself
tagger.predict(sentence)

# Print the sentence with the predicted tags
print(sentence.to_tagged_string())

George <B-PER> Washington <E-PER> ging nach Washington <S-LOC> .


# Wichtig für uns - PoS Tagging!
Dafür laden wir das deutsche PoS-Modell: 'de-pos'

In [None]:
# Load the German PoS model ('de-pos'); this rebinds `tagger`
tagger = SequenceTagger.load('de-pos')

# Make a German sentence (already whitespace-tokenized)
sentence = Sentence('George Washington ging nach Washington .')

# Predict PoS tags (the tagger loaded above is the PoS model, not the NER one)
tagger.predict(sentence)

# Print the sentence with the predicted tags
print(sentence.to_tagged_string())

2019-01-11 14:28:00,482 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.2/UPOS-udgerman--h256-l1-b8-%2Bgerman-forward%2Bgerman-backward--v0.2/de-pos-ud-v0.2.pt not found in cache, downloading to /var/folders/_1/sr6gl6wj7g9_111_kbc1d_fr0000gn/T/tmpd7tuqbe8


 87%|████████▋ | 215371776/248505449 [07:44<01:47, 307457.17B/s]