<a href="https://colab.research.google.com/github/lovgon/stepik-fast-start_to_AI/blob/main/3_1_NLP_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Предобработка текста

In [1]:
import nltk
import spacy
import re

### Токенизация

In [2]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens)

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


### Удаление неинформативных слов

#### N-граммы

<img src="https://res.cloudinary.com/practicaldev/image/fetch/s--466CQV1q--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_66%2Cw_880/https://thepracticaldev.s3.amazonaws.com/i/78nf1vryed8h1tz05fim.gif" height=400>

In [4]:
unigram = list(nltk.ngrams(tokens, 1))
bigram = list(nltk.ngrams(tokens, 2))
print(unigram[:5])
print(bigram[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]


In [5]:
from nltk import FreqDist
print('Популярные униграммы: ', FreqDist(unigram).most_common(5))
print('Популярные биграммы: ', FreqDist(bigram).most_common(5))

Популярные униграммы:  [(('all',), 2), (('work',), 2), (('and',), 2), (('no',), 2), (('play',), 2)]
Популярные биграммы:  [(('all', 'work'), 2), (('work', 'and'), 2), (('and', 'no'), 2), (('no', 'play'), 2), (('play', 'makes'), 1)]


#### Стоп-слова

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
stopWords = set(stopwords.words('english'))
print(stopWords)

{'with', 'hers', 'does', 'can', 'against', 'during', 'the', 'out', 'didn', 'hasn', 'below', "won't", 'too', "didn't", 'why', 'whom', 'has', 'they', 'while', 'by', 'mustn', 'under', 'shan', "mustn't", 'off', 'be', 'so', 'more', 'if', 'but', 'y', 'no', 'o', 'shouldn', 'my', "shan't", 'them', 'because', 'most', "wouldn't", 'me', 'being', 'don', 'before', "wasn't", 'your', 'wouldn', 'yours', 'there', 't', 'their', 'both', 'were', 'own', 'how', 're', 'should', 'isn', 'in', 'needn', 'you', 'wasn', 'then', 'herself', 'hadn', 'was', 'where', 'and', 'have', "hadn't", 'yourself', 'aren', "that'll", 'doesn', 'this', 'nor', 'which', 'we', 'once', 'i', "you're", 'each', 'weren', 'ain', "she's", 'as', 'd', "haven't", 'himself', 'now', "don't", 'it', 'of', 'over', 'm', 'those', "hasn't", 'he', 'after', 'will', 'haven', 'few', 'down', 'just', 'for', 'she', 'same', 'into', 'had', "doesn't", "weren't", "needn't", 'than', 'themselves', 'to', 'not', 've', 'up', "you'd", "should've", 'ourselves', 'having',

In [8]:
print([word for word in tokens if word not in stopWords])

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


### Стемминг
* процесс нахождения основы слова для заданного исходного слова

In [9]:
from nltk.stem import PorterStemmer, SnowballStemmer
words = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ['корова', 'мальчики', 'мужчины', 'столом', 'убежала']

In [10]:
ps = PorterStemmer()
list(map(ps.stem, words))

['game', 'game', 'game', 'game', 'compact']

In [11]:
ss = SnowballStemmer(language='russian')
list(map(ss.stem, words_ru))

['коров', 'мальчик', 'мужчин', 'стол', 'убежа']

### Лематизация
* процесс приведения словоформы к лемме — её нормальной (словарной) форме

In [12]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [13]:
nlp = spacy.load('en')
doc = nlp(raw)
print(' '.join([token.lemma_ for token in doc]))

denni : listen , strange woman lie in pond distribute sword 
 be no basis for a system of government .   Supreme executive power derive from 
 a mandate from the masse , not from some farcical aquatic ceremony .


In [14]:
[(token.lemma_, token.pos_) for token in doc[:7]]

[('denni', 'NOUN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

### Поиск шаблонов

#### Регулярные выражения

Исчерпывающий пост https://habr.com/ru/post/349860/

In [15]:
re.findall('\d+', 'There is some numbers: 49 and 432')

['49', '432']

In [16]:
re.sub('[,\.?!]',' ','How, to? split. text!').split()

['How', 'to', 'split', 'text']

In [17]:
re.sub('[^A-z]',' ','I 123 can 45 play 67 football').split()

['I', 'can', 'play', 'football']