In [None]:
import os

import gensim as gs
import gensim.parsing.preprocessing as gspp
import joblib as jl
import numpy as np
import pandas as pd
import pymystem3 as pms
import tomotopy as to
import tqdm
import tqdm.notebook

# Lemmatizer initialization

In [None]:
# Single Mystem instance, created with grammar_info=False; its bound
# `lemmatize` method is aliased as `lem` so the `lemmatize` helper can call it.
ms = pms.Mystem(grammar_info=False)
lem = ms.lemmatize

In [None]:
def lemmatize(s):
    """Clean a raw text and return its lemmas as a list of non-empty strings.

    The text is passed through the gensim preprocessing filters
    (non-alphanumeric, numeric, punctuation, repeated whitespace, then tokens
    shorter than 2 characters), handed to the module-level `lem` callable,
    and every returned item is stripped; items that end up empty are dropped.
    """
    text_filters = (
        gspp.strip_non_alphanum,
        gspp.strip_numeric,
        gspp.strip_punctuation,
        gspp.strip_multiple_whitespaces,
    )
    cleaned = s
    for apply_filter in text_filters:
        cleaned = apply_filter(cleaned)
    cleaned = gspp.strip_short(cleaned, minsize=2)

    stripped_lemmas = (str.strip(lemma) for lemma in lem(cleaned))
    return [lemma for lemma in stripped_lemmas if lemma]

# Dataset

In [None]:
# Load the gzip-compressed CSV dataset into a DataFrame and preview its first rows.
df = pd.read_csv('dataset/lenta.csv.gz')
df.head()

### Variables & settings

In [None]:
# Names of the raw-text column and of the column that holds the lemmatized tokens
texts_col = 'text'
texts_col_lem = 'text_lem'
# Frequency threshold; passed as `min_cf` when the LDA model is created
words_low_freq_lim = 4

# Lemmatization

In [None]:
# Create the lemma column with empty strings when the frame does not have it yet.
lem_column_missing = texts_col_lem not in df.columns
if lem_column_missing:
    df[texts_col_lem] = ""

In [None]:
# Lemmatize every document in parallel (n_jobs = CPU count, batches of 100)
# and store the resulting token lists in the lemma column.
delayed_lemmatize = jl.delayed(lemmatize)
parallel_runner = jl.Parallel(n_jobs=jl.cpu_count(), verbose=1, batch_size=100)
lem_results = parallel_runner(delayed_lemmatize(raw_text) for raw_text in df[texts_col])
df[texts_col_lem] = lem_results

### Filtering empty lemmatized documents

In [None]:
# Keep only the rows whose lemmatized token list is non-empty.
has_tokens = df[texts_col_lem].map(len) > 0
df = df[has_tokens]

# Word frequencies
### Creating dictionary

In [None]:
# Build a gensim Dictionary from the lemmatized documents.
texts_lem = list(df[texts_col_lem])
corp_dict = gs.corpora.Dictionary(documents=texts_lem)

### Word frequency calculation

In [None]:
# Map every dictionary token to the count stored for its id in `corp_dict.cfs`.
words_freqs = {
    token: corp_dict.cfs.get(token_id)
    for token, token_id in corp_dict.token2id.items()
}

#low_freq_words = list(filter(None, map(lambda x: x[0] if x[1] < words_low_freq_lim else None, words_freqs.items())))

### Exporting word frequencies

In [None]:
# Export the word/count table to Excel, most frequent words first.
os.makedirs('results', exist_ok=True)
# Build the two-column frame straight from the (word, count) pairs. The former
# approach created a one-row frame with one column per vocabulary word and
# transposed it, which is very slow for a large vocabulary and fails on an
# empty dictionary because the 'count' column never appears.
df_words_freqs = (
    pd.DataFrame(list(words_freqs.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)
)
df_words_freqs.to_excel('results/word-freqs.xlsx', index=False)

# Stop words and low-freq words filtering

In [None]:
# Read the stop-word list from the `word` column of the CSV file.
stopwords_frame = pd.read_csv('dataset/stop-words.csv')
stopwords = stopwords_frame['word'].tolist()
#filter_words = set(stopwords + low_freq_words)

In [None]:
# Remove stop words from every lemmatized document.
# Testing membership against the `stopwords` list scans it for each token; a
# frozenset gives constant-time lookups and removes the same string tokens.
stopwords_lookup = frozenset(stopwords)
df[texts_col_lem] = df[texts_col_lem].map(
    lambda tokens: [token for token in tokens if token not in stopwords_lookup]
)

# LDA

### Initialization

In [None]:
# Topics number
k = 100
# Fixed random seed so that repeated runs start from the same random state
# NOTE(review): exact reproducibility may also depend on the number of
# training workers - confirm against the tomotopy documentation.
lda_seed = 42

# Words with a collection frequency below `words_low_freq_lim` are excluded via `min_cf`.
lda = to.LDAModel(k=k, min_cf=words_low_freq_lim, alpha=0.1, eta=0.1, seed=lda_seed)

### Loading documents

In [None]:
# Add every lemmatized document to the model, showing a notebook progress bar.
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
for doc in tqdm.notebook.tqdm(df[texts_col_lem]):
    lda.add_doc(doc)

### LDA training

In [None]:
# Train the model for a fixed number of iterations.
n_train_iterations = 200
lda.train(iter=n_train_iterations)

# Words vs topics matrix 

### Getting unique words

In [None]:
# Sorted array of the distinct tokens that occur in the filtered documents.
all_tokens = np.concatenate(df[texts_col_lem].values)
uniq_words = np.unique(all_tokens)
# For each topic, request as many (word, probability) pairs as there are unique tokens.
words_topics_distr = [
    lda.get_topic_words(topic_id, uniq_words.size) for topic_id in range(k)
]

### Creating matrix

In [None]:
# Zero-filled frame: one row per unique word, one column per topic (0..k-1),
# indexed by the word itself.
zero_block = np.zeros((uniq_words.size, k + 1))
df_words_probs = pd.DataFrame(zero_block)
topic_labels = df_words_probs.columns.to_list()[:-1]
df_words_probs.columns = ['words'] + topic_labels
df_words_probs['words'] = uniq_words
df_words_probs = df_words_probs.set_index('words')

### Filling our matrix with the probabilities

In [None]:
def process_words_probs(x):
    """Split a sequence of pairs into two parallel lists.

    Args:
        x: iterable of indexable items; element 0 is taken as the word and
            element 1 as its probability.

    Returns:
        A `(words, probs)` tuple of lists, in the input order.
    """
    words = []
    probs = []
    for pair in x:
        words.append(pair[0])
        probs.append(pair[1])
    return words, probs

# Write each topic's word probabilities into the column labelled with the topic id.
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
for i, words_topics_col in enumerate(tqdm.notebook.tqdm(words_topics_distr)):
    words, probs = process_words_probs(words_topics_col)
    df_words_probs.loc[words, i] = probs

### Inspecting the results

In [None]:
# Preview the first rows, rounded to two decimals.
df_words_probs.head().round(2)

### Exporting

In [None]:
# Write the words-vs-topics matrix (index included) to an Excel file.
df_words_probs.to_excel('results/words-vs-topics-matrix.xlsx')

# Documents vs topics matrix

### Creating matrix

In [None]:
# Topic distribution of every document held by the model, one row per document.
docs_vs_topics = [doc.get_topic_dist() for doc in lda.docs]
df_docs_topics = pd.DataFrame(docs_vs_topics)
# `df` keeps its original row labels after the empty documents were filtered
# out, while `df_docs_topics` is numbered 0..n-1. Inserting the Series itself
# would align on those labels and attach texts to the wrong rows (or NaN), so
# the texts are inserted positionally, in the order the documents were added
# to the model. `insert` raises ValueError if the two lengths differ.
df_docs_topics.insert(0, 'documents', df[texts_col].to_numpy())

### Inspecting matrix

In [None]:
# Preview the first rows, rounded to two decimals.
df_docs_topics.head().round(2)

### Exporting

In [None]:
# Write the documents-vs-topics matrix (index included) to an Excel file.
df_docs_topics.to_excel('results/docs-vs-topics-matrix.xlsx')