In [None]:
# Загрузка датасета из репозитория гит
!git clone https://github.com/nikitosl/spbu-nlp-2020.git
import sys
sys.path.append('./spbu-nlp-2020')

In [None]:
# Импорты
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from string import punctuation
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Download the NLTK resources used by the tokenization and stop-word steps of this notebook.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' — confirm
# against the installed version.
nltk.download('punkt')
nltk.download("stopwords")

# # For lemmatization (optional): uncomment the lines below before running the
# # lemmatization section, which uses `stanza` and `StanzaLanguage`.

# !pip install stanza
# !pip install spacy_stanza
# !pip install pymorphy2==0.8
# import stanza
# from spacy_stanza import StanzaLanguage
# stanza.download('ru')

### Чтение файла Война и мир

In [None]:
# Load the whole novel into memory; the source file is cp1251-encoded.
book_path = './spbu-nlp-2020/text_preprocessing/war_and_peace.txt'
with open(book_path, encoding='cp1251') as book_file:
    text = book_file.read()

In [None]:
# Preview only the beginning: echoing the entire novel floods the cell output.
text[:1000]

### Токенизация

#### Токенизация с помощью регулярных выражений

In [None]:
# Regex tokenization: lower-case the text, then collect every maximal run of word characters.
word_pattern = re.compile(r'\w+')
re_tokenized_text = word_pattern.findall(text.lower())

In [None]:
# Show a sample of the tokens instead of dumping the full list into the output.
re_tokenized_text[:30]

#### Частоты слов после токенизации RE

In [None]:
# Count how many times each regex-produced token occurs.
cntr = Counter(re_tokenized_text)

In [None]:
# Frequency table of the regex tokens, most frequent first.
cntr_df = (
    pd.DataFrame(cntr.items(), columns=['Word', 'Number'])
    .sort_values(by='Number', ascending=False)
)

In [None]:
# Display the first 30 rows of the sorted frequency table.
cntr_df.head(30)

#### Токенизация с помощью NLTK

In [None]:
# Tokenize with the Russian Punkt model (word_tokenize defaults to English) and on the
# original casing, because sentence-boundary detection relies on capitalization cues;
# tokens are lower-cased afterwards.
nltk_tokenized_text = [
    token.lower() for token in nltk.word_tokenize(text, language='russian')
]

In [None]:
# Show a sample of the tokens instead of dumping the full list into the output.
nltk_tokenized_text[:30]

#### Частоты слов после токенизации NLTK

In [None]:
# Count how many times each NLTK-produced token occurs.
cntr = Counter(nltk_tokenized_text)

In [None]:
# Frequency table of the NLTK tokens, most frequent first.
cntr_df = (
    pd.DataFrame(cntr.items(), columns=['Word', 'Number'])
    .sort_values(by='Number', ascending=False)
)

In [None]:
# Display the first 30 rows of the sorted frequency table.
cntr_df.head(30)

### Удаление стоп-слов

#### Удаление стоп-слов

In [None]:
# Russian stop-word list from the NLTK "stopwords" corpus.
russian_stopwords = stopwords.words("russian")

In [None]:
# Characters treated as punctuation: ASCII punctuation plus extra typographic marks.
# A separate name is used instead of rebinding the imported `punctuation`, so re-running
# this cell always produces the same state.
punctuation_chars = set(punctuation) | set('–»«`’')
# `russian_stopwords` is a list; a set makes each membership test O(1).
stopword_set = set(russian_stopwords)
# Drop stop words and tokens made up solely of punctuation characters. Checking every
# character (rather than `token in <string>`, which is a substring test) also removes
# multi-character punctuation tokens such as '...' or '--'.
clear_tokenized_text = [
    token for token in nltk_tokenized_text
    if token not in stopword_set
    and not all(char in punctuation_chars for char in token)
]

In [None]:
# Show a sample of the filtered tokens instead of dumping the full list into the output.
clear_tokenized_text[:30]

#### Частоты слов после удаления стоп-слов

In [None]:
# Count token occurrences after stop-word and punctuation removal.
cntr = Counter(clear_tokenized_text)

In [None]:
# Frequency table of the filtered tokens, most frequent first.
cntr_df = (
    pd.DataFrame(cntr.items(), columns=['Word', 'Number'])
    .sort_values(by='Number', ascending=False)
)

In [None]:
# Display the first 30 rows of the sorted frequency table.
cntr_df.head(30)

### Стемминг

#### Стемминг

In [None]:
# Reduce every remaining token to its stem with the Russian Snowball stemmer.
stemmer = SnowballStemmer("russian")
stemm_text = list(map(stemmer.stem, clear_tokenized_text))

In [None]:
# Show a sample of the stems instead of dumping the full list into the output.
stemm_text[:30]

#### Частоты слов после стемминга

In [None]:
# Count how many times each stem occurs.
cntr = Counter(stemm_text)

In [None]:
# Frequency table of the stems, most frequent first.
cntr_df = (
    pd.DataFrame(cntr.items(), columns=['Word', 'Number'])
    .sort_values(by='Number', ascending=False)
)

In [None]:
# Display the first 30 rows of the sorted frequency table.
cntr_df.head(30)

### Лемматизация

#### Лемматизация (может занять много времени)

In [None]:
%%time
snlp = stanza.Pipeline(lang="ru")
nlp = StanzaLanguage(snlp)

# Разбиваем на два куска, так как максимальная длина входа 1000000 символов
gap = len(nltk_tokenized_text) // 2
subtext1 = ' '.join(nltk_tokenized_text[:gap])
subtext2 = ' '.join(nltk_tokenized_text[gap:])

In [None]:
# Run the pipeline on each half, then gather the lemmas in their original order.
doc1, doc2 = nlp(subtext1), nlp(subtext2)

lemm_text = [token.lemma_ for doc in (doc1, doc2) for token in doc]

In [None]:
# Show a sample of the lemmas instead of dumping the full list into the output.
lemm_text[:30]

#### Частоты слов после лемматизации

In [None]:
# Count how many times each lemma occurs.
cntr = Counter(lemm_text)

In [None]:
# Frequency table of the lemmas, most frequent first.
cntr_df = (
    pd.DataFrame(cntr.items(), columns=['Word', 'Number'])
    .sort_values(by='Number', ascending=False)
)

In [None]:
# Display the first 30 rows of the sorted frequency table.
cntr_df.head(30)

### Мешок слов

In [None]:
# Split the stemmed token stream into `parts_number` consecutive, equally sized "documents";
# tokens left over after the last full chunk are not used.
parts_number = 5
part_size = len(stemm_text) // parts_number
chunk_starts = range(0, len(stemm_text), part_size)
text_parts = [stemm_text[start:start + part_size] for start in chunk_starts][:parts_number]

# Vocabulary over the whole stemmed text, in order of first appearance.
unique_words = Counter(stemm_text).keys()
print(f'Всего уникальных слов в тексте: {len(unique_words)}')

# Map each stem to its position in the bag-of-words vectors.
word2num = {word: num for num, word in enumerate(unique_words)}

In [None]:
def get_bag_of_words(text, vocabulary=None):
    """Build a bag-of-words count vector for a sequence of tokens.

    Args:
        text: iterable of tokens (words) to count.
        vocabulary: mapping from token to its index in the output vector.
            When omitted, the notebook-level `word2num` mapping is used, which
            keeps existing single-argument calls working.

    Returns:
        A 1-D float numpy array of length `len(vocabulary)` in which position
        `vocabulary[token]` holds the number of occurrences of `token` in `text`.

    Raises:
        KeyError: if `text` contains a token that is missing from the vocabulary.
    """
    if vocabulary is None:
        vocabulary = word2num
    res = np.zeros(len(vocabulary))
    # Aggregate counts first so each distinct token touches the array only once.
    for word, count in Counter(text).items():
        res[vocabulary[word]] += count
    return res
# One count vector per text part, in part order.
bags_of_words = list(map(get_bag_of_words, text_parts))

In [None]:
# One row per text part (numbered from 1), one column per vocabulary stem.
bow_df = pd.DataFrame(
    bags_of_words,
    index=range(1, parts_number + 1),
    columns=list(word2num),
)

In [None]:
# Display the bag-of-words table (rows: text parts, columns: stems).
bow_df

### TF-IDF

In [None]:
tf = bow_df / part_size  # Term frequency: token count relative to the part length
# Number of parts containing each token. The vocabulary is built from the whole text, while
# the parts omit the leftover tail, so a token may occur in no part at all. Clipping to 1
# avoids log(N / 0) = inf, which would turn that token's TF-IDF into NaN (0 * inf) instead of 0.
doc_freq = (bow_df != 0).sum(axis=0).clip(lower=1)
idf = np.log(parts_number / doc_freq)  # Inverse document frequency across all parts
tf_idf = tf * idf

In [None]:
# Display the TF-IDF table (rows: text parts, columns: stems).
tf_idf

In [None]:
# Keep only the stems whose TF-IDF summed over all parts exceeds 0.002 (the author's threshold).
column_scores = tf_idf.sum(axis=0)
important_features = tf_idf.loc[:, column_scores > 0.002]
# Mark with 1 the parts where a stem reaches at least half of its own maximum TF-IDF.
half_of_peak = important_features.max(axis=0) / 2
(important_features >= half_of_peak).astype(int)