In [2]:
import multiprocessing as mp
import re

import gensim as gs
import gensim.parsing.preprocessing as gspp
import numpy as np
import pandas as pd
import pymystem3 as pms
import tqdm
import tqdm.notebook

# Lemmatizer initialization

In [3]:
# Single Mystem wrapper instance; grammar_info=False is passed so that
# (per pymystem3's option name) grammatical annotations are not requested.
ms = pms.Mystem(grammar_info=False)
# Short alias for the bound lemmatize method; it is called from lemmatize().
lem = ms.lemmatize

In [4]:
def lemmatize(s):
    """Clean a raw text and turn it into a list of lemmas.

    The text is passed through gensim's preprocessing helpers
    (non-alphanumeric characters, digits and punctuation are stripped,
    whitespace runs are collapsed, tokens shorter than 2 characters are
    dropped) and then through the module-level Mystem lemmatizer ``lem``.

    Parameters
    ----------
    s : str or bytes
        Raw document text. Any other value (for example ``None`` or the
        float ``NaN`` that pandas uses for missing cells) is treated as an
        empty document.

    Returns
    -------
    list of str
        Lemmas with surrounding whitespace removed and empty entries
        discarded; ``[]`` for missing or empty input.
    """
    # Missing values would otherwise raise inside the gensim helpers and
    # abort the whole multiprocessing map.
    if not isinstance(s, (str, bytes)):
        return []
    s = gspp.strip_non_alphanum(s)
    s = gspp.strip_numeric(s)
    s = gspp.strip_punctuation(s)
    s = gspp.strip_multiple_whitespaces(s)
    s = gspp.strip_short(s, minsize=2)
    # The lemmatizer output is stripped and filtered because it can contain
    # whitespace-only or empty items.
    return [token for token in map(str.strip, lem(s)) if token]

# Dataset

### Import

In [44]:
# Load the gzip-compressed CSV with the news articles into a DataFrame.
df = pd.read_csv('dataset/lenta.csv.gz')
# Preview the first rows (columns: url, title, text, topic, tags, date).
df.head()

Unnamed: 0,url,title,text,topic,tags,date
0,https://lenta.ru/news/2014/01/13/kylie/,Кайли Миноуг анонсировала новый альбом,Кайли Миноуг выпустит новый альбом. Об этом со...,Культура,Музыка,2014/01/13
1,https://lenta.ru/news/2008/10/08/westbank/,Израиль заблокирует Западный берег на время пр...,"В среду, 8 октября, в связи с празднованием Дн...",Мир,Все,2008/10/08
2,https://lenta.ru/news/2006/12/07/motorazr/,Motorola судится с создателями сайта motorazr.com,Представители одной из крупнейших в мире компа...,Интернет и СМИ,Все,2006/12/07
3,https://lenta.ru/news/2010/05/22/noexcuse/,Жириновский опроверг сообщения о попытке прими...,Председатель ЛДПР Владимир Жириновский опровер...,Россия,Все,2010/05/22
4,https://lenta.ru/news/2007/02/05/proc/,Читинская милиция окружила прокуратуру в ожида...,"В Чите, где в понедельник должны предъявить об...",Россия,Все,2007/02/05


### Variables & settings

In [6]:
# Raw and lemmatized texts columns
texts_col = 'text'
texts_col_lem = 'text_lem'

# Number of documents and how many of them are lemmatized per progress step
size = df[texts_col].size
chunk = 1000
# Ceiling division: exactly as many cycles as there are (possibly partial)
# chunks, without an extra empty cycle when size is a multiple of chunk.
cycles = (size + chunk - 1) // chunk

# Lemmatization

In [7]:
# Add the output column only when it is absent, so re-running this cell
# does not wipe lemmas that were already computed.
if texts_col_lem not in df.columns:
    df[texts_col_lem] = ""

In [8]:
# One worker pool is shared by all chunks instead of being re-created on
# every iteration.
with mp.Pool() as pool:
    for i in tqdm.notebook.trange(cycles):
        # Row labels of the i-th chunk only. The slice needs an upper bound;
        # without it every iteration would re-process all remaining rows.
        chunk_index = df.index[i * chunk:(i + 1) * chunk]
        if chunk_index.empty:
            continue

        texts = df.loc[chunk_index, texts_col].values.tolist()
        result = pool.map(lemmatize, texts)

        # Wrapping the token lists in a Series stores each list as a single
        # cell value instead of letting pandas coerce a ragged nested list.
        df.loc[chunk_index, texts_col_lem] = pd.Series(result, index=chunk_index)

  for i in tqdm.tnrange(cycles):


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




### Filtering empty lemmatized documents

In [9]:
# Keep only the documents whose lemma list is non-empty.
has_lemmas = df[texts_col_lem].map(len) > 0
df = df[has_lemmas]

# Word frequencies
### Creating dictionary

In [9]:
# Lemmatized documents as a plain list of token lists
texts_lem = df[texts_col_lem].values.tolist()
# Token <-> integer id mapping built over all lemmatized documents
corp_dict = gs.corpora.Dictionary(documents=texts_lem)

### Word frequency calculation

In [10]:
# Map every dictionary token to the count stored for its id in corp_dict.cfs.
words_freqs = {
    word: corp_dict.cfs.get(token_id)
    for word, token_id in corp_dict.token2id.items()
}

### Exporting word frequencies

In [11]:
# Build the (word, count) table directly from the dict items. This avoids
# creating a one-row frame with one column per word and transposing it, and
# it also works when words_freqs is empty.
df_words_freqs = (
    pd.DataFrame(list(words_freqs.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)
)
# Most frequent words first; the positional index is not written to the file.
df_words_freqs.to_excel('results/word_freqs.xlsx', index=False)

# Stop words filtering

### Loading stop words

In [12]:
# Stop-word list; the filtering step reads its 'word' column.
df_stop_words = pd.read_csv('dataset/stop-words.csv')

### Filtering

In [13]:
# Ids of tokens that occur fewer than `min_count` times in the corpus.
# The membership test keeps the cell re-runnable after tokens have already
# been removed from the dictionary.
min_count = 4
less_freq_words = [
    corp_dict.token2id[w]
    for w, count in words_freqs.items()
    if count < min_count and w in corp_dict.token2id
]
# Ids of the stop words. Entries of the external list that never occur in
# the corpus have no id and are skipped instead of raising KeyError.
stop_words = [
    corp_dict.token2id[w]
    for w in df_stop_words['word'].values.tolist()
    if w in corp_dict.token2id
]

In [14]:
# Remove stop words and rare words from the dictionary, then reassign ids
# so that they are contiguous again.
ids_to_drop = stop_words + less_freq_words
corp_dict.filter_tokens(bad_ids=ids_to_drop)
corp_dict.compactify()

In [15]:
# Convert every lemmatized document to its bag-of-words representation,
# distributing the documents over a pool of worker processes.
with mp.Pool() as workers:
    corpus = workers.map(corp_dict.doc2bow, texts_lem)
    workers.close()
    workers.join()

# LDA

In [16]:
# Number of worker processes passed to LdaMulticore
num_workers = 4
# Number of LDA topics; also the number of topic columns in the docs-vs-topics matrix
num_topics = 50

In [17]:
# Train the topic model on the bag-of-words corpus.
# A fixed random_state seeds the model so that repeated runs are comparable.
# NOTE(review): with several workers the result may still vary slightly
# between runs - confirm against the gensim documentation.
lda = gs.models.ldamulticore.LdaMulticore(
    corpus,
    num_topics=num_topics,
    id2word=corp_dict,
    workers=num_workers,
    iterations=200,
    random_state=42,
)

## Words vs topics matrix

In [18]:
# Request every topic explicitly: show_topics() returns only 10 topics by
# default, which would silently drop most of the model from the matrix.
# Each item is (topic_id, [(word, weight), ...]).
topics = lda.show_topics(num_topics=lda.num_topics, formatted=False)
# One two-column frame per topic: the word and its weight, with the weight
# column named after the topic id.
topics_dfs = [pd.DataFrame(t[1], columns=['words', t[0]]) for t in topics]

In [19]:
# Outer-join the per-topic frames on the word, giving one row per word and
# one weight column per topic (NaN where a word is absent from a topic).
df_wt = topics_dfs[0]
for topic_frame in topics_dfs[1:]:
    df_wt = df_wt.merge(topic_frame, how='outer', on='words')

In [20]:
# Missing word/topic combinations are written as 0.
df_wt.fillna(0).to_excel('results/words-vs-topics-matrix.xlsx')

## Docs vs topics matrix

### Creating matrix

In [21]:
# Topic distribution of every document, as returned by get_document_topics
# for each bag-of-words vector.
docs_topics = [lda.get_document_topics(bow) for bow in corpus]

In [22]:
# Zero-filled matrix with one row per document and one column per topic id.
topic_columns = np.arange(num_topics).tolist()
df_docs_topics = pd.DataFrame(
    np.zeros((len(docs_topics), num_topics)),
    columns=topic_columns,
)
# Insert the text column as a string column from the start, rather than
# creating it as float zeros and overwriting it with "" afterwards.
df_docs_topics.insert(0, 'Documents', "")
df_docs_topics.head()

Unnamed: 0,Documents,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Collect the probabilities in a NumPy matrix and write them to the frame in
# one assignment; setting single cells through .loc inside the loop is far
# slower. Topics not returned for a document keep the value 0.
topic_columns = list(range(num_topics))
topic_matrix = np.zeros((len(docs_topics), num_topics))
for doc_id, doc_topic in enumerate(tqdm.notebook.tqdm(docs_topics)):
    for topic_id, topic_prob in doc_topic:
        topic_matrix[doc_id, topic_id] = topic_prob
df_docs_topics[topic_columns] = topic_matrix

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




### Adding texts

In [40]:
# Attach the raw texts positionally; the frame was built with one row per
# document in the same order as df.
raw_texts = df[texts_col].values
df_docs_topics['Documents'] = raw_texts
df_docs_topics.head()

Unnamed: 0,Documents,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,Кайли Миноуг выпустит новый альбом. Об этом со...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"В среду, 8 октября, в связи с празднованием Дн...",0.0,0.709019,0.0,0.02533,0.0,0.0,0.0,0.0,0.024413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Представители одной из крупнейших в мире компа...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Председатель ЛДПР Владимир Жириновский опровер...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06987,...,0.49906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"В Чите, где в понедельник должны предъявить об...",0.0,0.0,0.0,0.226584,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Exporting matrix

In [39]:
# Write the documents-vs-topics matrix (texts plus one column per topic) to Excel.
df_docs_topics.to_excel('results/docs-vs-topics-matrix.xlsx')