In [None]:
!pip install numpy pandas scikit-surprise sklearn seaborn matplotlib spacy nltk navec slovnet natasha

Collecting natasha
[?25l  Downloading https://files.pythonhosted.org/packages/51/8e/ab0745100be276750fb6b8858c6180a1756696572295a74eb5aea77f3bbd/natasha-1.4.0-py3-none-any.whl (34.4MB)
[K     |████████████████████████████████| 34.4MB 109kB/s 
Collecting ipymarkup>=0.8.0
  Downloading https://files.pythonhosted.org/packages/bf/9b/bf54c98d50735a4a7c84c71e92c5361730c878ebfe903d2c2d196ef66055/ipymarkup-0.9.0-py3-none-any.whl
Collecting yargy>=0.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/d3/46/bc1a17200a55f4b0608f39ac64f1840fd4a52f9eeea462d9afecbf71246b/yargy-0.15.0-py3-none-any.whl (41kB)
[K     |████████████████████████████████| 51kB 5.6MB/s 
[?25hCollecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 6.4MB/s 
Collecting intervaltree>=3
  Downloading https://files.pythonhosted.org/pack

# **Токенизация. NLTK**

In [None]:
import nltk
nltk.download('punkt')
text1 = 'Предобработка данных в XML файле.'
text2 = 'Меня зовут Бонд. Джеймс Бонд'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk import tokenize
dir(tokenize)[:18]

['BlanklineTokenizer',
 'LineTokenizer',
 'MWETokenizer',
 'PunktSentenceTokenizer',
 'RegexpTokenizer',
 'ReppTokenizer',
 'SExprTokenizer',
 'SpaceTokenizer',
 'StanfordSegmenter',
 'TabTokenizer',
 'TextTilingTokenizer',
 'ToktokTokenizer',
 'TreebankWordTokenizer',
 'TweetTokenizer',
 'WhitespaceTokenizer',
 'WordPunctTokenizer',
 '__builtins__',
 '__cached__']

In [None]:
nltk_tk_1 = nltk.WordPunctTokenizer()
nltk_word = nltk_tk_1.tokenize(text1)
print(nltk_word)

['Предобработка', 'данных', 'в', 'XML', 'файле', '.']


In [None]:
# Токенизация по предложениям
nltk_tk_sents = nltk.tokenize.sent_tokenize(text1)
print(len(nltk_tk_sents))
nltk_tk_sents

1


['Предобработка данных в XML файле.']

# **Частеречная разметка. Natasha**

In [None]:
from navec import Navec
from slovnet import Morph

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
navec = Navec.load('/content/gdrive/My Drive/MMO/navec_news_v1_1B_250K_300d_100q.tar')
n_morph = Morph.load('/content/gdrive/My Drive/MMO/slovnet_morph_news_v1.tar', batch_size=4)

In [None]:
morph_res = n_morph.navec(navec)

In [None]:
def print_pos(markup):
    for token in markup.tokens:
        print('{} - {}'.format(token.text, token.tag))

In [None]:
n_text1_markup = list(_ for _ in n_morph.map(nltk_tk_sents))
[print_pos(x) for x in n_text1_markup]

П - PROPN|Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
р - NOUN
е - X|Foreign=Yes
д - NOUN
о - X|Foreign=Yes
б - NOUN|Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing
р - X|Foreign=Yes
а - CCONJ
б - PROPN
о - NOUN|Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
т - PRON|Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing
к - ADP
а - X|Foreign=Yes
  - PUNCT
д - NOUN|Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing
а - CCONJ
н - X|Foreign=Yes
н - X|Foreign=Yes
ы - X|Foreign=Yes
х - X|Foreign=Yes
  - PUNCT
в - X|Foreign=Yes
  - PUNCT
X - X|Foreign=Yes
M - PROPN|Foreign=Yes
L - X|Foreign=Yes
  - PUNCT
ф - X|Foreign=Yes
а - CCONJ
й - ADJ|Case=Nom|Degree=Pos|Gender=Masc|Number=Sing
л - X|Foreign=Yes
е - NOUN|Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing
. - PUNCT


[None]

# **Лемматизация. Natasha**

In [None]:
from natasha import Doc, Segmenter, NewsEmbedding, NewsMorphTagger, MorphVocab

In [None]:
def n_lemmatize(text):
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return doc

In [None]:
n_doc1 = n_lemmatize(text1)
{_.text: _.lemma for _ in n_doc1.tokens}

{'.': '.',
 'XML': 'xml',
 'Предобработка': 'предобработка',
 'в': 'в',
 'данных': 'данные',
 'файле': 'файл'}

In [None]:
n_doc2 = n_lemmatize(text2)
{_.text: _.lemma for _ in n_doc2.tokens}

{'.': '.', 'Бонд': 'бонд', 'Джеймс': 'джеймс', 'Меня': 'я', 'зовут': 'звать'}

# **Выделение (распознавание) именованных сущностей. Natasha**

In [None]:
from slovnet import NER
from ipymarkup import show_span_ascii_markup as show_markup

In [None]:
ner = NER.load('/content/gdrive/My Drive/MMO/slovnet_ner_news_v1.tar')

In [None]:
ner_res = ner.navec(navec)

In [None]:
markup_ner2 = ner(text2)

In [None]:
markup_ner2

SpanMarkup(
    text='Меня зовут Бонд. Джеймс Бонд',
    spans=[Span(
         start=11,
         stop=15,
         type='PER'
     ), Span(
         start=17,
         stop=28,
         type='PER'
     )]
)

In [None]:
show_markup(markup_ner2.text, markup_ner2.spans)

Меня зовут Бонд. Джеймс Бонд
           PER─  PER────────


# **Разбор предложения. Natasha**

In [None]:
from natasha import NewsSyntaxParser

In [None]:
emb = NewsEmbedding()
syntax_parser = NewsSyntaxParser(emb)

In [None]:
n_doc1.parse_syntax(syntax_parser)
n_doc1.sents[0].syntax.print()

┌► Предобработка amod
└─ данных        
┌► в             case
└─ XML           
   файле         
   .             


In [None]:
n_doc2.parse_syntax(syntax_parser)
n_doc2.sents[0].syntax.print()

    ┌► Меня  obj
┌─┌─└─ зовут 
│ └──► Бонд  xcomp
└────► .     punct
