**Разметка частей речи**

Евгений Борисов <esborisov@sevsu.ru>

библиотека NLTK

Сравнение и создание морфологических анализаторов в NLTK.   https://habr.com/ru/post/340404/

In [1]:
import nltk

# nltk.download('brown')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_ru')

**Brown Corpus**  
The Brown University Standard Corpus of Present-Day American English (or just Brown Corpus) is an electronic collection of text samples of American English, the first major structured corpus of varied genres. This corpus first set the bar for the scientific study of the frequency and distribution of word categories in everyday language use.


https://en.wikipedia.org/wiki/Brown_Corpus

---

In [2]:
nltk.corpus.brown.tagged_sents(categories='news')

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [3]:
training_count = int(len(nltk.corpus.brown.tagged_sents(categories='news')) * .9)
training_count

4160

In [4]:
# учебные размеченные данные
training_sents = nltk.corpus.brown.tagged_sents(categories='news')[:training_count]

# тестовые размеченные данные
testing_sents = nltk.corpus.brown.tagged_sents(categories='news')[training_count+1:]

# тестовые НЕразмеченные данные
testing_sents_notags = nltk.corpus.brown.sents(categories='news')[training_count+1:]

In [5]:
# самый часто используемый тег - NN(noun, имя существительное) 
tags = [tag for (word, tag) in 
        nltk.corpus.brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()

'NN'

---

In [6]:
# примитивный (вырожденный) тагер - всем присваивает один таг
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(testing_sents_notags[5])

[('Holmes', 'NN'),
 ('went', 'NN'),
 ('to', 'NN'),
 ("Atlanta's", 'NN'),
 ('Morehouse', 'NN'),
 ('(', 'NN'),
 ('Negro', 'NN'),
 (')', 'NN'),
 ('College', 'NN'),
 (',', 'NN'),
 ('where', 'NN'),
 ('he', 'NN'),
 ('is', 'NN'),
 ('a', 'NN'),
 ('B', 'NN'),
 ('student', 'NN'),
 ('and', 'NN'),
 ('star', 'NN'),
 ('halfback', 'NN'),
 ('.', 'NN')]

In [7]:
# доля правильных тагов
default_tagger.accuracy(testing_sents)

0.12639776357827476

---

In [8]:
# тагер на простых правилах
patterns = [
     (r'.*ing$', 'VBG'),               # gerunds
     (r'.*ed$', 'VBD'),                # simple past
     (r'.*es$', 'VBZ'),                # 3rd singular present
     (r'.*ould$', 'MD'),               # modals
     (r'.*\'s$', 'NN$'),               # possessive nouns
     (r'.*s$', 'NNS'),                 # plural nouns
     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN')                     # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(testing_sents_notags[5])

[('Holmes', 'VBZ'),
 ('went', 'NN'),
 ('to', 'NN'),
 ("Atlanta's", 'NN$'),
 ('Morehouse', 'NN'),
 ('(', 'NN'),
 ('Negro', 'NN'),
 (')', 'NN'),
 ('College', 'NN'),
 (',', 'NN'),
 ('where', 'NN'),
 ('he', 'NN'),
 ('is', 'NNS'),
 ('a', 'NN'),
 ('B', 'NN'),
 ('student', 'NN'),
 ('and', 'NN'),
 ('star', 'NN'),
 ('halfback', 'NN'),
 ('.', 'NN')]

In [9]:
# доля правильных тагов
regexp_tagger.accuracy(testing_sents)

0.20467252396166133

---

In [10]:
# комбинации из разных тагеров (backoff)

default_tagger = nltk.DefaultTagger('NN')
unigram_tagger = nltk.UnigramTagger(training_sents, 
                                    backoff=default_tagger)
bigram_tagger = nltk.BigramTagger(training_sents,
                                  backoff=unigram_tagger)

bigram_tagger.tag(testing_sents_notags[5])

[('Holmes', 'NN'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ("Atlanta's", 'NP$'),
 ('Morehouse', 'NN'),
 ('(', '('),
 ('Negro', 'NP'),
 (')', ')'),
 ('College', 'NN-TL'),
 (',', ','),
 ('where', 'WRB'),
 ('he', 'PPS'),
 ('is', 'BEZ'),
 ('a', 'AT'),
 ('B', 'NN'),
 ('student', 'NN'),
 ('and', 'CC'),
 ('star', 'NN'),
 ('halfback', 'NN'),
 ('.', '.')]

In [11]:
bigram_tagger.accuracy(testing_sents) # доля правильных тагов

0.8452476038338658

---

In [12]:
# nltk.download('averaged_perceptron_tagger')

In [13]:
sentence = 'Today morning, Arthur felt very good.'

# предобученный тагер (английский)
nltk.pos_tag(  nltk.word_tokenize(sentence), lang='eng')

[('Today', 'NN'),
 ('morning', 'NN'),
 (',', ','),
 ('Arthur', 'NNP'),
 ('felt', 'VBD'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]

In [16]:
sentence = 'Кроме того, иногда одно и то же слово может иметь несколько разных лемм.'
# предобученный тагер (русский)
nltk.pos_tag(  nltk.word_tokenize(sentence), lang='rus')

[('Кроме', 'PR'),
 ('того', 'S-PRO'),
 (',', 'NONLEX'),
 ('иногда', 'ADV'),
 ('одно', 'A-PRO=n'),
 ('и', 'CONJ'),
 ('то', 'S-PRO'),
 ('же', 'PART'),
 ('слово', 'S'),
 ('может', 'V'),
 ('иметь', 'V'),
 ('несколько', 'NUM=acc'),
 ('разных', 'A=pl'),
 ('лемм', 'S'),
 ('.', 'NONLEX')]

---

---