### Обработка текста с помощью библиотеки spaCy

In [1]:
import spacy  # NLP library that provides the processing pipeline used below

# Load the English pipeline. 'en_core_web_lg' is the name of a model package
# that has to be downloaded and installed beforehand.
nlp = spacy.load('en_core_web_lg')

# Text to analyse. Feel free to substitute your own (English) text.
text = """London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Calling the loaded pipeline on the text runs the whole processing chain
# and returns the processed document.
doc = nlp(text)

# `doc` now holds the processed version of the text.
# As an example, list every named entity that was detected (doc.ents):
# the entity text (a word or phrase) followed by its label in parentheses.
for entity in doc.ents:
    print("{} ({})".format(entity.text, entity.label_))

ModuleNotFoundError: No module named 'spacy'

### Подключаем библиотеку NLTK и скачиваем данные токенизатора punkt

In [3]:
import nltk
# Download the 'punkt' data package (used by NLTK's sentence tokenizer).
# When the package is already present the call only reports that it is up to date.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Ivan
[nltk_data]     Maltsev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Токенизация по предложениям

In [4]:
# Sample text with a few tricky spots for a sentence splitter: a dotted name
# ("NLTK.Sent_tokenize"), an ellipsis, and several kinds of end punctuation.
text = "We try to implement NLTK.Sent_tokenize. It is very hard to produce good tokens. Our approach is model-based one! And they has already train a good model for tokenizing. Really? Yes ... try. Hard words: vice president, half sister"

# Split the text into a list of sentences.
sentences = nltk.sent_tokenize(text)

# Print every sentence followed by one empty line.
for sentence in sentences:
    print(sentence, end="\n\n")

We try to implement NLTK.Sent_tokenize.

It is very hard to produce good tokens.

Our approach is model-based one!

And they has already train a good model for tokenizing.

Really?

Yes ... try.

Hard words: vice president, half sister



### Токенизация по словам

In [5]:
# Split each previously extracted sentence into word/punctuation tokens and
# print the token list followed by one empty line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words, end="\n\n")

['We', 'try', 'to', 'implement', 'NLTK.Sent_tokenize', '.']

['It', 'is', 'very', 'hard', 'to', 'produce', 'good', 'tokens', '.']

['Our', 'approach', 'is', 'model-based', 'one', '!']

['And', 'they', 'has', 'already', 'train', 'a', 'good', 'model', 'for', 'tokenizing', '.']

['Really', '?']

['Yes', '...', 'try', '.']

['Hard', 'words', ':', 'vice', 'president', ',', 'half', 'sister']



### Лемматизация и стемминг текста

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download the WordNet corpus data; the WordNet-based lemmatizer created
# later in this cell looks words up in it.
nltk.download('wordnet')
# `wordnet` supplies the part-of-speech constants (wordnet.VERB, wordnet.NOUN)
# passed to the lemmatizer below.
from nltk.corpus import wordnet

def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """
    Print what a stemmer and a lemmatizer each produce for the same word.

    Args:
        stemmer: object exposing ``stem(word)``.
        lemmatizer: object exposing ``lemmatize(word, pos)``.
        word: the word to reduce.
        pos: part-of-speech tag forwarded to the lemmatizer.

    Returns:
        None. Prints a "Stemmer:" line, a "Lemmatizer:" line and then an
        empty separator line.
    """
    # The stem is computed and printed before the lemmatizer is invoked.
    stemmed = stemmer.stem(word)
    print("Stemmer:", stemmed)

    lemma = lemmatizer.lemmatize(word, pos)
    # The extra newline in `end` produces the blank separator line.
    print("Lemmatizer:", lemma, end="\n\n")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# (word, part of speech) pairs to compare. "meeting" is listed twice, once as
# a noun and once as a verb, so the effect of the POS tag on the lemma is visible.
examples = [
    ("seeking", wordnet.VERB),
    ("drove", wordnet.VERB),
    ("meeting", wordnet.NOUN),
    ("meeting", wordnet.VERB),
]
for example_word, example_pos in examples:
    compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word=example_word, pos=example_pos)

[nltk_data] Downloading package wordnet to C:\Users\Ivan
[nltk_data]     Maltsev\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Stemmer: seek
Lemmatizer: seek

Stemmer: drove
Lemmatizer: drive

Stemmer: meet
Lemmatizer: meeting

Stemmer: meet
Lemmatizer: meet



### Стоп-слова

In [7]:
# Download the stop-word corpus data.
nltk.download("stopwords")
from nltk.corpus import stopwords
# Print the complete list of English stop words.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to C:\Users\Ivan
[nltk_data]     Maltsev\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Убираем стоп-слова

In [8]:
# Fetch the English stop words and turn them into a set so that the
# membership test in the filter below is fast.
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."  # sample input

words = nltk.word_tokenize(sentence)  # split the sentence into word tokens

# Keep only the tokens that are not stop words.
# NLTK's English stop-word list is lowercase, so the lowercased token is
# compared; otherwise a capitalized stop word (for example a sentence-initial
# "The") would slip through. Kept tokens retain their original casing.
without_stop_words = [word for word in words if word.lower() not in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']


### Чистка текста

In [10]:
# Display the current value of `text` as the cell output.
text

'We try to implement NLTK.Sent_tokenize. It is very hard to produce good tokens. Our approach is model-based one! And they has already train a good model for tokenizing. Really? Yes ... try. Hard words: vice president, half sister'

In [None]:
# str.replace returns a new string and leaves `text` itself unchanged; the
# result of this call is discarded. The replacement is the empty string, so
# every occurrence of 'это' would be removed (not replaced with a space).
# NOTE(review): the English `text` shown in the previous cell contains no 'это',
# so for that value both calls are no-ops — confirm the intended input.
text.replace('это', '')
# To actually change `text`, assign the result back to it.
text = text.replace('это', '')