In [2]:
import tomotopy as to
import pandas as pd
import numpy as np
import tqdm
import multiprocessing as mp
import pymystem3 as pms
import gensim.parsing.preprocessing as gspp
import gensim as gs

# Lemmatizer initialization

In [3]:
ms = pms.Mystem(grammar_info=False)
lem = ms.lemmatize

In [4]:
def lemmatize(s):
    s = gspp.strip_non_alphanum(s)
    s = gspp.strip_numeric(s)
    s = gspp.strip_punctuation(s)
    s = gspp.strip_multiple_whitespaces(s)
    s = gspp.strip_short(s, minsize=2)
    s = lem(s)
    s = list(map(str.strip, s))
    s = list(filter(None, s))
    return s

# Dataset

In [5]:
df = pd.read_csv('dataset/lenta.csv.gz')
df.head()

Unnamed: 0,url,title,text,topic,tags,date
0,https://lenta.ru/news/2014/01/13/kylie/,Кайли Миноуг анонсировала новый альбом,Кайли Миноуг выпустит новый альбом. Об этом со...,Культура,Музыка,2014/01/13
1,https://lenta.ru/news/2008/10/08/westbank/,Израиль заблокирует Западный берег на время пр...,"В среду, 8 октября, в связи с празднованием Дн...",Мир,Все,2008/10/08
2,https://lenta.ru/news/2006/12/07/motorazr/,Motorola судится с создателями сайта motorazr.com,Представители одной из крупнейших в мире компа...,Интернет и СМИ,Все,2006/12/07
3,https://lenta.ru/news/2010/05/22/noexcuse/,Жириновский опроверг сообщения о попытке прими...,Председатель ЛДПР Владимир Жириновский опровер...,Россия,Все,2010/05/22
4,https://lenta.ru/news/2007/02/05/proc/,Читинская милиция окружила прокуратуру в ожида...,"В Чите, где в понедельник должны предъявить об...",Россия,Все,2007/02/05


### Variables & settings

In [6]:
# Raw and lemmatized texts columns
texts_col = 'text'
texts_col_lem = 'text_lem'

size = df[texts_col].size
chunk = 1000
cycles = (size // chunk) + 1

# Lemmatization

In [7]:
if texts_col_lem not in df:
    df[texts_col_lem] = ""

In [8]:
for i in tqdm.tnrange(cycles):
    texts = df.loc[df.index[i * chunk:], texts_col].values.tolist()
    
    with mp.Pool() as pool:
        result = pool.map(lemmatize, texts)
        pool.close()
        pool.join()
        
    df.loc[df.index[i * chunk:], texts_col_lem] = result

  for i in tqdm.tnrange(cycles):


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




### Filtering empty lemmatized documents

In [9]:
df = df[df[texts_col_lem].map(len) > 0]

# Words frequencies
### Creating dictionary

In [10]:
texts_lem = df[texts_col_lem].values.tolist()
corp_dict = gs.corpora.Dictionary(texts_lem)

### Words frequencies calculation

In [11]:
words_freqs = {}
for w, wid in corp_dict.token2id.items():
    words_freqs.update({w: corp_dict.cfs.get(wid)})

### Exporting words frequencies

In [12]:
df_words_freqs = pd.DataFrame(words_freqs, index=[0]).T.reset_index().rename(columns={'index': 'word', 0: 'count'}).sort_values('count', ascending=False)
df_words_freqs.to_excel('results/word-freqs.xlsx', index=False)

# Stop words filtering

In [13]:
stopwords = pd.read_csv('dataset/stop-words.csv').word.tolist()

In [20]:
df[texts_col_lem] = df[texts_col_lem].map(lambda x: list(filter(lambda y: y not in stopwords, x)))

# LDA

### Initialization

In [21]:
# Topics number
k = 100

lda = to.LDAModel(k=k)

### Loading documents

In [22]:
for doc in tqdm.tqdm_notebook(df[texts_col_lem]):
    lda.add_doc(doc)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm.tqdm_notebook(df[texts_col_lem]):


HBox(children=(FloatProgress(value=0.0, max=9999.0), HTML(value='')))




### LDA training

In [23]:
lda.train(iter=200)

# Words vs topics matrix 

### Getting unique words

In [24]:
uniq_words = np.unique(np.concatenate(df[texts_col_lem].values))
words_topics_distr = list(map(lambda x: lda.get_topic_words(x, uniq_words.size), range(k)))

### Creating matrix

In [25]:
df_words_probs = pd.DataFrame(np.zeros((uniq_words.size, k + 1)))
df_words_probs.columns = ['words'] + df_words_probs.columns.to_list()[:-1]
df_words_probs['words'] = uniq_words
df_words_probs.set_index('words', inplace=True)

### Filling our matrix with the probabilities

In [26]:
def process_words_probs(x):
    words = list(map(lambda y: y[0], x))
    probs = list(map(lambda y: y[1], x))
    return words, probs

for i, words_topics_col in enumerate(tqdm.tqdm_notebook(words_topics_distr)):
    words, probs = process_words_probs(words_topics_col)
    df_words_probs.loc[words, i] = probs

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, words_topics_col in enumerate(tqdm.tqdm_notebook(words_topics_distr)):


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




### Inspecting the results

In [31]:
df_words_probs.round(2).head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Exporting

In [32]:
df_words_probs.to_excel('results/words-vs-topics-matrix.xlsx')

# Documents vs topics matrix

### Creating matrix

In [33]:
docs_vs_topics = list(map(lambda doc: doc.get_topic_dist(), lda.docs))
df_docs_topics = pd.DataFrame(docs_vs_topics)
df_docs_topics.insert(0, 'documents', df[texts_col])

### Inspecting matrix

In [34]:
df_docs_topics.round(2).head()

Unnamed: 0,documents,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,Кайли Миноуг выпустит новый альбом. Об этом со...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
1,"В среду, 8 октября, в связи с празднованием Дн...",0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
2,Представители одной из крупнейших в мире компа...,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.21,0.0
3,Председатель ЛДПР Владимир Жириновский опровер...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.21,0.0
4,"В Чите, где в понедельник должны предъявить об...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.02


### Exporting

In [113]:
df_docs_topics.to_excel('results/docs-vs-topics-matrix.xlsx')