## Environment setup (working directory and PYTHONPATH)

In [1]:
from os import getenv, environ, getcwd

In [2]:
# Move the working directory one level up and display the resulting path.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# directory each time — run it exactly once per kernel session.
from os import chdir
chdir('../')
getcwd()

'/home/mkls/code/coursework'

In [3]:
# Display the PYTHONPATH environment variable (None when it is not set).
getenv('PYTHONPATH')

'/home/mkls/code/coursework/:/home/mkls/code/coursework/src/pkg/'

## Necessary imports

In [1]:
import numpy as np
import pandas as pd
import spacy
import pickle
from itertools import tee
from preproc import Preprocessor, LangEnum
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from models import Top2VecW

## Preprocessing

In [2]:
import re

# Load the pre-collected texts from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code — only use a trusted file.
with open('../loaded_texts', 'rb') as f:
    texts = pickle.load(f)

# Strip every "<digits>:<digits>" fragment (e.g. "12:30") from each text,
# writing the cleaned string back into the same position.
colon_number_re = re.compile(r'[0-9]+:[0-9]+')
for idx, raw in enumerate(texts):
    texts[idx] = colon_number_re.sub('', raw)

In [3]:
# Run the project preprocessor over the cleaned texts. It returns an iterable
# of processed documents plus a dictionary that is later passed to
# CoherenceModel.
preprocessor = Preprocessor(language=LangEnum.RU, tokenize_ents=False)
doc_iter, dictionary = preprocessor.preprocess_texts(texts)
# Materialise the documents so they can be traversed more than once.
preprocessed_texts = list(doc_iter)

## Fit Top2Vec model

In [4]:
from gensim.utils import simple_preprocess

def crutch_for_top2vec(doc):
    """Tokenizer passed to Top2Vec: split an already-preprocessed document.

    Args:
        doc: Object exposing ``split()`` (a string in this notebook).

    Returns:
        The result of ``doc.split()`` — for a string, the list of
        whitespace-separated tokens (empty list for an empty string).
    """
    tokens = doc.split()
    return tokens

In [5]:
# Fit Top2Vec on the preprocessed documents. Every document is converted to a
# string here and split back into tokens by `crutch_for_top2vec`.
documents = [str(doc) for doc in preprocessed_texts]
top2vec = Top2VecW(documents=documents,
                   embedding_model='doc2vec',
                   workers=12,  # NOTE(review): hard-coded worker count; adjust to the machine
                   tokenizer=crutch_for_top2vec)

# Materialise the topics once instead of tee-ing the iterator three ways and
# leaving a name bound to an exhausted iterator.
# Each record is indexed as record[0] -> id, record[1][0] -> words,
# record[1][1] -> scores (presumably parallel arrays — confirm in Top2VecW).
raw_topics = list(top2vec.get_topics(num_topics=10))

ids = [record[0] for record in raw_topics]
words = [record[1][0] for record in raw_topics]
scores = [record[1][1] for record in raw_topics]

2021-04-26 09:12:32,757 - top2vec - INFO - Pre-processing documents for training
2021-04-26 09:12:33,723 - top2vec - INFO - Creating joint document/word embedding
2021-04-26 09:20:37,508 - top2vec - INFO - Creating lower dimension embedding of documents
2021-04-26 09:21:08,854 - top2vec - INFO - Finding dense areas of documents
2021-04-26 09:21:10,939 - top2vec - INFO - Finding topics


In [6]:
# Turn each topic's word array into a plain list of Python strings.
topics = [list(map(str, topic_terms.tolist())) for topic_terms in words]

In [8]:
# Display the word lists of all topics.
topics

[['самолёт',
  'росавиации',
  'авиационный',
  'авиакомпания',
  'разбиться',
  'взлётно-посадочный',
  'ту-204',
  'аэропорт',
  'внуково',
  'посадка',
  'росавиация',
  'воздушный',
  'экипаж',
  'як-42',
  'крушение',
  'red_wings',
  'авиакатастрофа',
  'боинг',
  'рейс',
  'ssj-100',
  'борт',
  'авиадиспетчерский',
  'катастрофа',
  'взлёт',
  'самописец',
  'boeing',
  'диспетчер',
  'пассажир',
  'полёт',
  'sukhoi',
  'аэродром',
  'мак',
  'пассажирский',
  'atr-72',
  'шасси',
  'лётный',
  'лайнер',
  'неполадка',
  'бортовый',
  'кабина',
  'чартерный',
  'двигатель',
  'приземлиться',
  'авиаперевозчик',
  'superjet-100',
  'эксплуатант',
  'минтранс',
  'приземление',
  'пилот',
  'авиаузел'],
 ['возгорание',
  'пожар',
  'пожарный',
  'гореть',
  'тушение',
  'локализовать',
  'горение',
  'потушить',
  'кровля',
  'евгений_бобылев',
  'кв',
  'тушить',
  'квадратный',
  'загореться',
  'этаж',
  'ликвидировать',
  'ангар',
  'мчс',
  'пламя',
  'жилой',
  'эвакуирова

## Wordcloud visualization

In [None]:
# Draw one word cloud per Top2Vec topic, with words sized by their scores.
# The data comes from the `words` / `scores` lists built when the model was
# fitted, so this cell no longer needs an LDA model or a stop-word list and does
# not overwrite `topics`, which the coherence and export cells rely on.
# NOTE(review): an earlier version of this cell reportedly killed the kernel;
# the cause was never identified, so watch memory use when running it.
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
flat_axes = axes.flatten()

# Hide every axis up front so panels without a topic stay blank.
for ax in flat_axes:
    ax.axis('off')
    ax.margins(x=0, y=0)

# zip() stops at the shortest input, so at most len(flat_axes) topics are drawn.
for i, (ax, topic_terms, topic_scores) in enumerate(zip(flat_axes, words, scores)):
    # WordCloud expects a {word: weight} mapping.
    # NOTE(review): assumes the scores of the top words are positive — confirm.
    topic_words = {str(term): float(score)
                   for term, score in zip(topic_terms, topic_scores)}
    color = cols[i % len(cols)]
    # `stopwords` and `colormap` are omitted: the former is ignored by
    # generate_from_frequencies, the latter is ignored when color_func is set.
    cloud = WordCloud(background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      # Bind the colour as a default so it does not depend on
                      # the loop variable at call time.
                      color_func=lambda *args, _color=color, **kwargs: _color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))

fig.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.show()

## Topic coherence metric value

In [51]:
# dictionary = Dictionary([[str(token) for token in text] for text in preprocessed_texts[:1000]], prune_at=20000000000000)

In [23]:
# corpus = [dictionary.doc2bow(text) for text in [[str(token) for token in text] for text in preprocessed_texts]]

In [9]:
# Topic coherence (c_v) of the Top2Vec topics, using the top 10 words of each.
# A u_mass variant (same arguments, texts limited to the first 1000 documents,
# coherence='u_mass') was tried earlier and is currently disabled.
tokenized_texts = [[str(token) for token in text] for text in preprocessed_texts]
coherence_model = CoherenceModel(topics=topics,
                                 texts=tokenized_texts,
                                 topn=10,
                                 dictionary=dictionary,
                                 coherence='c_v')
c_v = coherence_model.get_coherence()

In [10]:
# Display the c_v coherence score (the u_mass variant is not computed).
c_v

0.8654223728829367

In [18]:
# Save the first 10 words of every topic, one space-separated topic per line.
# The words are Cyrillic, so the encoding is set explicitly instead of relying
# on the platform default (which is not UTF-8 everywhere).
with open('top2vec_doc2vec_1.txt', 'w', encoding='utf-8') as f:
    f.writelines(" ".join(topic[:10]) + "\n" for topic in topics)

In [19]:
# Save the fitted model through the wrapper's `__model__` attribute.
# NOTE(review): presumably `__model__` is the underlying Top2Vec instance —
# confirm in models.Top2VecW.
top2vec.__model__.save('top2vec_doc2vec_interfax_1')