In [None]:
import os
import re

import gensim as gs
import gensim.parsing.preprocessing as gspp
import joblib as jl
import numpy as np
import pandas as pd
import pymystem3 as pms
import tqdm
import tqdm.auto

# Lemmatizer initialization

In [None]:
# One shared Mystem instance, created with grammar_info=False.
ms = pms.Mystem(
    grammar_info=False,
)
# Short alias for the bound lemmatize method; used by the text-cleaning helper.
lem = ms.lemmatize

In [None]:
def lemmatize(s):
    """Clean a raw text and return its lemmas as a list of non-empty strings.

    The text is passed through gensim's ``strip_non_alphanum``,
    ``strip_numeric``, ``strip_punctuation`` and ``strip_multiple_whitespaces``
    filters, then ``strip_short`` with ``minsize=2``. The cleaned string is
    handed to the module-level Mystem lemmatizer ``lem``; every token it
    returns is whitespace-stripped and tokens that end up empty are dropped.

    Args:
        s: Raw document text (``str`` or ``bytes``). Any other value, such as
            ``None`` or the NaN pandas produces for a missing CSV cell, is
            treated as an empty document.

    Returns:
        list[str]: The lemmas of the document; ``[]`` when nothing is left.
    """
    # Non-text input would crash inside the gensim filters; report it as an
    # empty document so it can be dropped together with other empty results.
    if not isinstance(s, (str, bytes)):
        return []

    for clean in (
        gspp.strip_non_alphanum,
        gspp.strip_numeric,
        gspp.strip_punctuation,
        gspp.strip_multiple_whitespaces,
    ):
        s = clean(s)
    s = gspp.strip_short(s, minsize=2)

    # The lemmatizer output may contain whitespace-only tokens; strip and drop them.
    return [token for token in map(str.strip, lem(s)) if token]

# Dataset

### Import

In [None]:
# Read the compressed CSV dataset into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='dataset/lenta.csv.gz')
df.head()

### Variables & settings

In [None]:
# Column holding the raw document text
texts_col = 'text'
# Column that receives the list of lemmas for each document
texts_col_lem = 'text_lem'
# Words whose corpus frequency is below this limit are removed from the dictionary
words_low_freq_lim = 4

# Lemmatization

In [None]:
# Create the lemma column as an empty-string placeholder if it is not there yet.
if texts_col_lem not in df.columns:
    df[texts_col_lem] = ""

In [None]:
# Lemmatize every document in parallel (one job per CPU, tasks dispatched in
# batches of 100) and store the resulting token lists next to the raw texts.
lem_results = jl.Parallel(n_jobs=jl.cpu_count(), verbose=1, batch_size=100)(
    jl.delayed(lemmatize)(text) for text in df[texts_col]
)
df[texts_col_lem] = lem_results

### Filtering empty lemmatized documents

In [None]:
# Keep only the documents whose lemma list is non-empty.
df = df.loc[df[texts_col_lem].map(len).gt(0)]

# Word frequencies
### Creating dictionary

In [None]:
# Token lists of all remaining documents, and the gensim dictionary built from them.
texts_lem = df[texts_col_lem].tolist()
corp_dict = gs.corpora.Dictionary(documents=texts_lem)

### Word frequency calculation

In [None]:
# Map every dictionary word to its collection frequency (looked up by token id).
words_freqs = {
    word: corp_dict.cfs.get(word_id)
    for word, word_id in corp_dict.token2id.items()
}

### Exporting word frequencies

In [None]:
os.makedirs('results', exist_ok=True)
# Build the two-column (word, count) table straight from the mapping. This
# avoids creating a one-row frame with one column per vocabulary word and
# transposing it, which is very slow for a large vocabulary.
df_words_freqs = (
    pd.Series(words_freqs, name='count')
    .rename_axis('word')
    .reset_index()
    .sort_values('count', ascending=False)
)
# Most frequent words first; the positional index is not written to the sheet.
df_words_freqs.to_excel('results/word_freqs.xlsx', index=False)

# Stop words filtering

### Loading stop words

In [None]:
# Stop-word table; its 'word' column is what the filtering step reads.
df_stop_words = pd.read_csv(filepath_or_buffer='dataset/stop-words.csv')

### Filtering

In [None]:
# Collect the token ids to remove from the dictionary.
# Both lookups are guarded by a membership test: a stop word that never occurs
# in the corpus (or a word already removed by a previous run of the filtering
# cell) has no id, and an unguarded token2id[w] would raise KeyError.
less_freq_words = [
    corp_dict.token2id[w]
    for w, count in words_freqs.items()
    if count < words_low_freq_lim and w in corp_dict.token2id
]
stop_words = [
    corp_dict.token2id[w]
    for w in df_stop_words['word'].values.tolist()
    if w in corp_dict.token2id
]

In [None]:
# Remove stop words and low-frequency words from the dictionary by id,
# then compact the dictionary.
corp_dict.filter_tokens(bad_ids=stop_words + less_freq_words)
corp_dict.compactify()

In [None]:
# Convert every tokenized document into its bag-of-words representation.
# Done with a plain comprehension: the notebook imports no `mp` module, so a
# multiprocessing pool is not available here, and doc2bow is an independent
# per-document call that needs no shared state.
corpus = [corp_dict.doc2bow(doc) for doc in texts_lem]

# LDA

In [None]:
# LDA settings: worker count passed to the multicore trainer, and number of topics to fit.
num_workers, num_topics = 4, 100

In [None]:
# Fit the multicore LDA model on the bag-of-words corpus.
lda = gs.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    num_topics=num_topics,
    id2word=corp_dict,
    workers=num_workers,
    iterations=200,
)

## Words vs topics matrix

In [None]:
# show_topics() returns only 10 topics unless num_topics is given, which would
# leave most of the model out of the matrix; ask for every topic explicitly.
topics = lda.show_topics(num_topics=lda.num_topics, formatted=False)
# One small table per topic: a 'words' column plus a weight column named after the topic id.
topics_dfs = [
    pd.DataFrame(word_weights, columns=['words', topic_id])
    for topic_id, word_weights in topics
]

In [None]:
# Outer-join the per-topic tables on the 'words' column, one topic at a time,
# so every word that appears in any topic gets a row.
df_wt = topics_dfs[0]
for topic_df in topics_dfs[1:]:
    df_wt = df_wt.merge(topic_df, how='outer', on='words')

In [None]:
# A word absent from a topic's table is exported with weight 0.
df_wt.fillna(value=0).to_excel(excel_writer='results/words-vs-topics-matrix.xlsx')

## Docs vs topics matrix

### Creating matrix

In [None]:
# Topic distribution of every document, as a list of (topic_id, probability) pairs.
docs_topics = [lda.get_document_topics(bow) for bow in corpus]

In [None]:
# One float column per topic, initialised to 0 for every document.
df_docs_topics = pd.DataFrame(
    np.zeros((len(docs_topics), num_topics)),
    columns=np.arange(num_topics).tolist(),
)
# Add the text column up front as its own string column. Writing "" into a
# float64 column afterwards is an incompatible-dtype assignment that recent
# pandas versions deprecate.
df_docs_topics.insert(0, 'Documents', "")
df_docs_topics.head()

In [None]:
# Fill a dense (documents x topics) NumPy array first and write it into the
# frame in a single assignment; setting cells one by one through df.loc is
# far slower for a corpus of this size.
topic_probs = np.zeros((len(docs_topics), num_topics))
# tqdm.auto is imported explicitly at the top of the notebook; a bare
# `import tqdm` does not load the notebook/auto submodules.
for doc_id, doc_topic in enumerate(tqdm.auto.tqdm(docs_topics)):
    for topic_id, topic_prob in doc_topic:
        topic_probs[doc_id, topic_id] = topic_prob
# Topic columns are labelled 0 .. num_topics - 1.
df_docs_topics[np.arange(num_topics).tolist()] = topic_probs

### Adding texts

In [None]:
# Rows of df are in the same order as the corpus, so the raw texts are attached
# positionally; .values drops df's index so no label alignment takes place.
df_docs_topics['Documents'] = df[texts_col].values
df_docs_topics.head(5)

### Exporting matrix

In [None]:
# Write the documents-vs-topics matrix (row index included) to the results folder.
df_docs_topics.to_excel(excel_writer='results/docs-vs-topics-matrix.xlsx')