<a href="https://colab.research.google.com/github/marriamaslova/compling_nlp_hse_course/blob/master/notebooks/topic_modelling/homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 4 классификаторами - SGDClassifier, KNeighborsClassifier,  RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться как минимум 8 моделей (два разложения на каждый классификатор). Используйте одни и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [None]:
!pip install gensim pymorphy2 seaborn pyLDAvis razdel

In [None]:
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")

In [None]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
import pyLDAvis.gensim_models
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
from matplotlib import pyplot as plt
import seaborn as sns
morph = MorphAnalyzer()
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
def normalize(text):
    """Tokenize, clean and lemmatize *text*; return the lemmas joined by spaces.

    Every razdel token is stripped of leading/trailing punctuation. Tokens that
    end up empty or are 20 or more characters long are dropped; the remaining
    ones are lowercased and replaced by the normal form of their first
    pymorphy2 parse.
    """
    lemmas = []
    for token in razdel_tokenize(text):
        stripped = token.text.strip(punctuation)
        # Skip punctuation-only tokens and overly long strings.
        if not stripped or len(stripped) >= 20:
            continue
        lemmas.append(morph.parse(stripped.lower())[0].normal_form)
    return ' '.join(lemmas)

In [None]:
def _select_rows(data, positions):
    """Return the rows of *data* located at integer *positions*.

    pandas objects are indexed through ``.iloc`` so the fold indices are always
    interpreted as positions, even when the index is not ``0..n-1`` (for
    example after filtering or subsampling). Lists and tuples are converted to
    numpy arrays first; anything else is indexed with plain ``[]``.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    return data[positions]


def eval_table(X, y, pipeline, N=6, random_state=None):
    """Cross-validate *pipeline* and summarise per-class quality.

    Parameters
    ----------
    X : indexable collection of samples (pandas Series, numpy array, list).
    y : indexable collection of class labels aligned with ``X``.
    pipeline : estimator exposing ``fit`` and ``predict``; it is refitted on
        every fold, so the object passed in is left fitted on the last fold.
    N : number of stratified folds.
    random_state : seed forwarded to ``StratifiedKFold`` to make the shuffled
        split reproducible; ``None`` keeps the split random.

    Returns
    -------
    result : DataFrame indexed by class label with the mean and standard
        deviation over folds of precision, recall and F1 (rounded to 2
        digits), plus a final ``'mean'`` row averaging every column.
    errors : ``(n_classes, n_classes)`` array with the row-normalised
        confusion matrix averaged over the folds.
    """
    # Fix the class order: first-appearance order is identical on every run,
    # unlike iterating over a set of strings.
    labels = list(dict.fromkeys(y))
    scorers = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}

    # Per-fold metrics live in one table: a column per (metric, fold) pair.
    fold_metrics = pd.DataFrame(index=labels)
    # Accumulator for the per-fold confusion matrices.
    errors = np.zeros((len(labels), len(labels)))

    # Shuffling matters because the data may be ordered by class.
    kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=random_state)

    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        pipeline.fit(_select_rows(X, train_index), _select_rows(y, train_index))
        y_true = _select_rows(y, test_index)
        preds = pipeline.predict(_select_rows(X, test_index))

        for name, scorer in scorers.items():
            fold_metrics[f'{name}_{i}'] = scorer(y_true, preds, labels=labels, average=None)
        errors += confusion_matrix(y_true, preds, labels=labels, normalize='true')

    # Besides the mean we keep the standard deviation over folds to see how
    # much the scores vary from split to split.
    result = pd.DataFrame(index=labels)
    for name in scorers:
        per_fold = fold_metrics[[f'{name}_{i}' for i in range(N)]]
        result[name] = per_fold.mean(axis=1).round(2)
        result[f'{name}_std'] = per_fold.std(axis=1).round(2)

    # Extra row with the average of every column over all classes.
    result.loc['mean'] = result.mean().round(2)
    # The summed confusion matrices are averaged over the folds.
    errors /= N

    return result, errors

In [None]:
# Load the Avito ads and add a lemmatized copy of every description.
data = pd.read_csv('avito_category_classification.csv')
data['description_norm'] = [normalize(text) for text in data['description']]

RandomForest

In [None]:
# SVD + random forest: uni/bigram counts -> 500 SVD components -> forest.
pipeline_svd_rf = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.4,
        ),
    ),
    ('svd', TruncatedSVD(n_components=500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10)),
])

In [None]:
metrics_svd_rf, errors_svd_rf = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_svd_rf,
)

In [None]:
# NMF + random forest: unigram counts -> tf-idf -> 100 NMF components -> forest.
pipeline_nmf_rf = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            min_df=3,
            max_df=0.3,
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=100)),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=6)),
])

In [None]:
metrics_nmf_rf, errors_nmf_rf = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_nmf_rf,
)

SGD

In [None]:
# SVD + SGD: uni/bigram counts -> 500 SVD components -> linear model.
pipeline_svd_sgd = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.4,
        ),
    ),
    ('svd', TruncatedSVD(n_components=500)),
    ('clf', SGDClassifier(max_iter=900, tol=1e-3)),
])

In [None]:
metrics_svd_sgd, errors_svd_sgd = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_svd_sgd,
)

In [None]:
# NMF + SGD: unigram counts -> tf-idf -> 90 NMF components -> linear model.
pipeline_nmf_sgd = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            min_df=3,
            max_df=0.3,
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=90)),
    ('clf', SGDClassifier(max_iter=900, tol=1e-3)),
])

In [None]:
metrics_nmf_sgd, errors_nmf_sgd = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_nmf_sgd,
)

KNeighbors

In [None]:
# SVD + kNN: uni/bigram counts -> 500 SVD components -> 7 nearest neighbours.
pipeline_svd_kn = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.4,
        ),
    ),
    ('svd', TruncatedSVD(n_components=500)),
    ('clf', KNeighborsClassifier(n_neighbors=7)),
])

In [None]:
metrics_svd_kn, errors_svd_kn = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_svd_kn,
)

In [None]:
# NMF + kNN: unigram counts -> tf-idf -> 90 NMF components -> 7 nearest neighbours.
pipeline_nmf_kn = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            min_df=3,
            max_df=0.3,
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=90)),
    ('clf', KNeighborsClassifier(n_neighbors=7)),
])

In [None]:
metrics_nmf_kn, errors_nmf_kn = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_nmf_kn,
)

ExtraTrees

In [None]:
# SVD + extra trees: uni/bigram counts -> 500 SVD components -> extra-trees ensemble.
pipeline_svd_et = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.4,
        ),
    ),
    ('svd', TruncatedSVD(n_components=500)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0)),
])

In [None]:
metrics_svd_et, errors_svd_et = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_svd_et,
)

In [None]:
# NMF + extra trees: unigram counts -> tf-idf -> 90 NMF components -> extra-trees ensemble.
pipeline_nmf_et = Pipeline(steps=[
    (
        'bow',
        CountVectorizer(
            tokenizer=str.split,  # split on whitespace only
            min_df=3,
            max_df=0.3,
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('decomposition', NMF(n_components=90)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0)),
])

In [None]:
metrics_nmf_et, errors_nmf_et = eval_table(
    X=data['description_norm'],
    y=data['category_name'],
    pipeline=pipeline_nmf_et,
)

Метрики

RandomForest

In [None]:
metrics_svd_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.85,0.2,0.06,0.04,0.1,0.07
Товары для детей и игрушки,0.84,0.08,0.17,0.02,0.28,0.03
Ремонт и строительство,0.66,0.14,0.09,0.04,0.15,0.06
Мебель и интерьер,0.79,0.25,0.03,0.01,0.05,0.02
Квартиры,0.87,0.03,0.89,0.02,0.88,0.02
Автомобили,0.88,0.02,0.6,0.05,0.71,0.03
Предложение услуг,0.72,0.05,0.49,0.05,0.58,0.02
"Одежда, обувь, аксессуары",0.47,0.02,0.76,0.03,0.58,0.02
Детская одежда и обувь,0.45,0.02,0.69,0.03,0.54,0.02
Телефоны,0.95,0.04,0.37,0.04,0.53,0.04


In [None]:
metrics_nmf_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,1.0,0.0,0.03,0.02,0.06,0.03
Товары для детей и игрушки,0.87,0.06,0.35,0.05,0.49,0.06
Ремонт и строительство,0.73,0.22,0.05,0.03,0.1,0.05
Мебель и интерьер,0.78,0.11,0.18,0.05,0.29,0.07
Квартиры,0.86,0.03,0.95,0.03,0.9,0.02
Автомобили,0.9,0.05,0.77,0.04,0.83,0.04
Предложение услуг,0.7,0.05,0.5,0.06,0.58,0.03
"Одежда, обувь, аксессуары",0.44,0.02,0.81,0.04,0.57,0.02
Детская одежда и обувь,0.62,0.04,0.7,0.04,0.66,0.02
Телефоны,0.94,0.07,0.33,0.04,0.49,0.05


SGD

In [None]:
metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.65,0.09,0.51,0.06,0.57,0.03
Товары для детей и игрушки,0.78,0.05,0.63,0.04,0.69,0.01
Ремонт и строительство,0.61,0.12,0.46,0.07,0.52,0.05
Мебель и интерьер,0.69,0.08,0.63,0.07,0.65,0.04
Квартиры,0.96,0.02,0.96,0.01,0.96,0.01
Автомобили,0.88,0.02,0.91,0.03,0.89,0.01
Предложение услуг,0.79,0.04,0.75,0.09,0.77,0.06
"Одежда, обувь, аксессуары",0.72,0.04,0.78,0.03,0.74,0.02
Детская одежда и обувь,0.73,0.03,0.79,0.04,0.76,0.02
Телефоны,0.82,0.05,0.81,0.04,0.81,0.03


In [None]:
metrics_nmf_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.45,0.36,0.11,0.09,0.14,0.1
Товары для детей и игрушки,0.69,0.25,0.44,0.17,0.48,0.08
Ремонт и строительство,0.17,0.16,0.07,0.16,0.08,0.16
Мебель и интерьер,0.62,0.12,0.27,0.19,0.33,0.2
Квартиры,0.74,0.23,0.94,0.06,0.81,0.15
Автомобили,0.74,0.11,0.73,0.18,0.72,0.09
Предложение услуг,0.68,0.18,0.32,0.22,0.37,0.09
"Одежда, обувь, аксессуары",0.64,0.06,0.68,0.05,0.66,0.03
Детская одежда и обувь,0.57,0.12,0.74,0.1,0.63,0.04
Телефоны,0.68,0.08,0.53,0.14,0.58,0.08


KNeighbors

In [None]:
metrics_svd_kn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.32,0.07,0.25,0.07,0.28,0.06
Товары для детей и игрушки,0.57,0.09,0.25,0.03,0.35,0.04
Ремонт и строительство,0.3,0.02,0.16,0.04,0.21,0.04
Мебель и интерьер,0.37,0.06,0.26,0.06,0.31,0.05
Квартиры,0.94,0.03,0.72,0.05,0.82,0.03
Автомобили,0.56,0.05,0.58,0.06,0.57,0.05
Предложение услуг,0.58,0.06,0.58,0.08,0.58,0.07
"Одежда, обувь, аксессуары",0.47,0.02,0.59,0.04,0.52,0.03
Детская одежда и обувь,0.46,0.01,0.63,0.02,0.53,0.01
Телефоны,0.79,0.02,0.3,0.04,0.43,0.05


In [None]:
metrics_nmf_kn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.21,0.02,0.23,0.05,0.22,0.03
Товары для детей и игрушки,0.52,0.06,0.33,0.04,0.4,0.04
Ремонт и строительство,0.27,0.03,0.27,0.06,0.27,0.04
Мебель и интерьер,0.36,0.04,0.27,0.05,0.3,0.04
Квартиры,0.85,0.07,0.84,0.05,0.85,0.06
Автомобили,0.58,0.09,0.7,0.07,0.63,0.08
Предложение услуг,0.43,0.04,0.69,0.04,0.53,0.04
"Одежда, обувь, аксессуары",0.54,0.01,0.58,0.03,0.56,0.01
Детская одежда и обувь,0.55,0.03,0.54,0.03,0.55,0.02
Телефоны,0.68,0.04,0.44,0.03,0.53,0.03


ExtraTrees

In [None]:
metrics_svd_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.53,0.05,0.13,0.03,0.21,0.05
Товары для детей и игрушки,0.66,0.08,0.15,0.04,0.25,0.05
Ремонт и строительство,0.58,0.14,0.09,0.03,0.15,0.05
Мебель и интерьер,0.83,0.13,0.12,0.02,0.21,0.03
Квартиры,0.74,0.02,0.81,0.02,0.77,0.01
Автомобили,0.79,0.07,0.46,0.05,0.58,0.04
Предложение услуг,0.89,0.01,0.27,0.04,0.42,0.04
"Одежда, обувь, аксессуары",0.46,0.01,0.72,0.02,0.56,0.01
Детская одежда и обувь,0.44,0.02,0.72,0.02,0.54,0.02
Телефоны,0.87,0.05,0.3,0.07,0.44,0.07


In [None]:
metrics_nmf_et

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Бытовая техника,0.67,0.12,0.3,0.1,0.41,0.11
Товары для детей и игрушки,0.76,0.03,0.61,0.04,0.68,0.04
Ремонт и строительство,0.57,0.06,0.44,0.05,0.5,0.05
Мебель и интерьер,0.69,0.05,0.52,0.03,0.59,0.04
Квартиры,0.92,0.04,0.96,0.02,0.94,0.02
Автомобили,0.85,0.03,0.91,0.02,0.88,0.02
Предложение услуг,0.69,0.04,0.75,0.03,0.72,0.03
"Одежда, обувь, аксессуары",0.69,0.02,0.8,0.03,0.74,0.02
Детская одежда и обувь,0.74,0.02,0.77,0.03,0.76,0.02
Телефоны,0.79,0.03,0.78,0.06,0.78,0.04


Выводы:

(перед собственно итогами надо отметить, что пришлось поэкспериментировать с KNeighborsClassifier, поскольку n_neighbors=3 дал плохие показатели по метрикам (значение по умолчанию в sklearn — n_neighbors=5). С увеличением параметра результаты улучшились, хотя разница между n_neighbors=5 и n_neighbors=7 получилась невелика).

Если смотреть метрики по отдельным тематическим категориям, то результаты сильно разнятся от комбинации к комбинации и не всегда коррелируют с показателями, относящимися к среднему. А вот в графе mean можно выделить лидеров.

В комбинации с svd-разложением лучший результат показал SGDClassifier с результатами (mean) precision 0.76, recall 0.72, F1 0.74

В комбинации с nmf-разложением лучший результат показал ExtraTreesClassifier с результатами (mean) precision 0.74, recall 0.68, F1 0.70

### Задание № 2 (6 баллов)

В Gensim тоже можно добавить нграммы и tfidf. Постройте 1 модель без них (как в семинаре) и еще 3 модели (1 с нграммами, 1 с tfidf и 1 с нграммами и с tfidf). Сравните качество с помощью метрик (перплексия, когерентность) и на глаз. Определите лучшую модель. Для каждой модели выберите 1 самую красивую на ваш взгляд тему.

Используйте данные википедии из семинара. Можете взять поменьше данных, если все обучается долго.

Важное требование - получившиеся модели не должны быть совсем плохими. Если хороших тем не получается, попробуйте настроить гиперпараметры, отфильтровать словарь по-другому. 

Нграммы добавляются вот так (перед созданием словаря)

In [None]:
# Illustrative snippet from the assignment text: `texts` is not defined before
# this cell in the notebook, so it presumably stands for the list of
# normalized document strings — confirm before running.
texts = [text.split() for text in texts]
ph = gensim.models.Phrases(texts, scoring='npmi', threshold=0.4) # threshold can be tuned
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts] 

# ! remember that ngrammed_texts must be used from this point on

!! В модели с нграммами вначале посмотрите, что получается после преобразования.
Если вы выведете несколько первых текстов в ngrammed_texts, то там должно быть что-то такое:

In [None]:
[text for text in ngrammed_texts[:3]]
>> [['новостройка',
  'нижегородский_область', # нграм
  'новостро́йка',
  '—',
  'сельский',
  'посёлок',
  'в',
  'дивеевский_район', # нграм
  'нижегородский_область', #нграмм
  'входить',
  'в',
  'состав_сатисский', #нграмм
  'сельсовет',
  'посёлок',
  'расположить',
  'в',
  '12,5',
  'километр',
....

Если вы не видите нграммов, то попробуйте изменить параметр threshold

Tfidf добавляется вот так (после векторизации и перед обучением lda)

In [None]:
# Illustrative snippet from the assignment text: `corpus` and `dictionary` are
# not defined before this cell; they presumably stand for the bag-of-words
# corpus and the Dictionary it was built with — confirm before running.
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary, )
# Indexing the model with a corpus yields the tf-idf weighted corpus.
corpus = tfidf[corpus]

Gensim без ngram и TfIdf

In [None]:
# Read the first 5000 lines of the Wikipedia dump and lemmatize each of them.
# The context manager closes the file handle, and the explicit encoding keeps
# the Russian text readable on platforms whose default encoding is not UTF-8.
with open('wiki_data.txt', encoding='utf-8') as wiki_file:
    texts = wiki_file.read().splitlines()[:5000]
texts = [normalize(text) for text in texts]

In [None]:
# Vocabulary of the baseline model, built from whitespace-split documents.
dictinary = gensim.corpora.Dictionary(map(str.split, texts))

In [None]:
# Drop tokens seen in fewer than 10 documents or in more than 50% of them.
dictinary.filter_extremes(no_below=10, no_above=0.5)
# Reassign token ids so that they are contiguous again.
dictinary.compactify()

In [None]:
# Bag-of-words representation of every document.
corpus = [dictinary.doc2bow(tokens) for tokens in map(str.split, texts)]

In [None]:
# Baseline LDA: 200 topics, 5 passes over the corpus.
lda = gensim.models.LdaModel(
    corpus=corpus,
    num_topics=200,
    id2word=dictinary,
    passes=5,
)

In [None]:
lda.print_topics()

[(86,
  '0.008*"общество" + 0.008*"развитие" + 0.008*"организация" + 0.007*"уровень" + 0.007*"цель" + 0.006*"состояться" + 0.006*"провести" + 0.006*"решение" + 0.006*"дальнейший" + 0.006*"результат"'),
 (12,
  '0.046*"бой" + 0.039*"сражение" + 0.036*"флотилия" + 0.025*"удар" + 0.023*"английский" + 0.022*"флот" + 0.017*"нога" + 0.017*"боец" + 0.014*"развивать" + 0.014*"фрегат"'),
 (199,
  '0.021*"император" + 0.019*"король" + 0.011*"фон" + 0.011*"против" + 0.009*"правление" + 0.008*"iii" + 0.008*"смерть" + 0.008*"власть" + 0.008*"франция" + 0.008*"сын"'),
 (35,
  '0.088*"карта" + 0.076*"нил" + 0.057*"процесс" + 0.030*"параметр" + 0.027*"грязь" + 0.024*"обработка" + 0.023*"вследствие" + 0.022*"классификация" + 0.019*"сочетание" + 0.017*"график"'),
 (13,
  '0.020*"брат" + 0.017*"маргарита" + 0.015*"мать" + 0.013*"павел" + 0.011*"дядя" + 0.011*"римский" + 0.010*"внук" + 0.010*"дед" + 0.010*"хабаровский" + 0.010*"краевой"'),
 (135,
  '0.114*"мир" + 0.078*"чемпионат" + 0.062*"чемпион" + 0.06

Gensim с ngram

In [None]:
# Tokenize the documents in place. Items that are already token lists are kept
# as they are, so re-running this cell no longer raises AttributeError.
texts = [text.split() if isinstance(text, str) else text for text in texts]

In [None]:
# Learn collocations with NPMI scoring; the threshold can be tuned.
ph = gensim.models.Phrases(sentences=texts, threshold=0.4, scoring='npmi')
# Wrap the trained model in a Phraser for applying it to texts.
p = gensim.models.phrases.Phraser(ph)

In [None]:
def make_phrases(texts):
    """Apply the module-level phraser ``p`` to each tokenized text; return a list."""
    phrased = []
    for tokens in texts:
        phrased.append(p[tokens])
    return phrased

In [None]:
# Token lists with detected collocations merged into single tokens.
ngrammed_texts = make_phrases(texts=texts)

In [None]:
# Vocabulary of the n-gram model.
dictinary_ng = gensim.corpora.Dictionary(documents=ngrammed_texts)

In [None]:
# Drop tokens seen in fewer than 10 documents or in more than 10% of them.
dictinary_ng.filter_extremes(no_below=10, no_above=0.1)
# Reassign token ids so that they are contiguous again.
dictinary_ng.compactify()

In [None]:
# Alias for the n-grammed token lists; no copy is made.
texts_ng = ngrammed_texts

In [None]:
# Bag-of-words representation of every n-grammed document.
corpus_ng = list(map(dictinary_ng.doc2bow, texts_ng))

In [None]:
# LDA over the n-gram corpus: 200 topics, 5 passes.
lda_ng = gensim.models.LdaModel(
    corpus=corpus_ng,
    num_topics=200,
    id2word=dictinary_ng,
    passes=5,
)

In [None]:
lda_ng.print_topics()

[(22,
  '0.116*"экспедиция" + 0.027*"виктория" + 0.027*"полярный" + 0.026*"южный_полюс" + 0.023*"судно" + 0.020*"1910" + 0.019*"поход" + 0.018*"1911" + 0.018*"скотт" + 0.018*"плавание"'),
 (78,
  '0.137*"строительный" + 0.105*"боливия" + 0.068*"всеобщий" + 0.061*"аргентинский" + 0.049*"аргентина" + 0.040*"голосование" + 0.037*"тайный" + 0.035*"местечко" + 0.034*"де" + 0.028*"акт"'),
 (177,
  '0.090*"профессор" + 0.088*"институт" + 0.055*"университет" + 0.027*"лаборатория" + 0.024*"академия_наука" + 0.023*"учёный" + 0.021*"избрать" + 0.020*"окончить" + 0.019*"сибирский" + 0.019*"кафедра"'),
 (20,
  '0.063*"команда" + 0.038*"капитан" + 0.036*"штат" + 0.026*"нью-йорк" + 0.026*"американский" + 0.019*"сша" + 0.014*"рейс" + 0.013*"джеймс" + 0.011*"судно" + 0.011*"лето"'),
 (0,
  '0.060*"бог" + 0.028*"герой" + 0.026*"мир" + 0.012*"богиня" + 0.012*"глаз" + 0.011*"душа" + 0.009*"враг" + 0.008*"книга" + 0.008*"человеческий" + 0.008*"медведь"'),
 (76,
  '0.202*"открытый_чемпионат" + 0.115*"пара" 

Gensim с TfIdf

In [None]:
# Vocabulary of the tf-idf model. An earlier cell rebinds `texts` to token
# lists, on which `.split()` raises AttributeError, so both raw strings and
# already tokenized documents are accepted here.
dictinary_tfidf = gensim.corpora.Dictionary(
    text.split() if isinstance(text, str) else text for text in texts
)

In [None]:
# Drop tokens seen in fewer than 10 documents or in more than 10% of them.
dictinary_tfidf.filter_extremes(no_below=10, no_above=0.1)
# Reassign token ids so that they are contiguous again.
dictinary_tfidf.compactify()

In [None]:
# Vectorize with `dictinary_tfidf`, the same dictionary that is passed as
# `id2word` to the tf-idf and LDA models below; using the baseline `dictinary`
# here would make the token ids point at the wrong words. Documents may be raw
# strings or token lists, depending on which earlier cells were run.
corpus = [
    dictinary_tfidf.doc2bow(text.split() if isinstance(text, str) else text)
    for text in texts
]

In [None]:
# Fit tf-idf weights on the bag-of-words corpus, then reweight that corpus.
tfidf = gensim.models.TfidfModel(corpus=corpus, id2word=dictinary_tfidf)
corpus_tfidf = tfidf[corpus]

In [None]:
# Train on the tf-idf weighted corpus. The original passed the raw `corpus`,
# which left `corpus_tfidf` unused and made this a plain bag-of-words model.
lda_tfidf = gensim.models.LdaModel(corpus_tfidf, 200, id2word=dictinary_tfidf, passes=5)

In [None]:
lda_tfidf.print_topics()

[(54,
  '0.161*"университет" + 0.055*"наука" + 0.046*"факультет" + 0.041*"профессор" + 0.027*"образование" + 0.027*"учебный" + 0.026*"кафедра" + 0.023*"институт" + 0.021*"студент" + 0.019*"преподаватель"'),
 (167,
  '0.086*"формула" + 0.076*"напряжение" + 0.055*"сопротивление" + 0.048*"ток" + 0.046*"таджикистан" + 0.037*"потеря" + 0.036*"узбекский" + 0.034*"коэффициент" + 0.024*"параметр" + 0.022*"комикс"'),
 (114,
  '0.118*"партия" + 0.037*"выборы" + 0.030*"политический" + 0.021*"движение" + 0.019*"депутат" + 0.017*"хвост" + 0.017*"власть" + 0.016*"коммунист" + 0.016*"член" + 0.016*"правительство"'),
 (176,
  '0.052*"европейский" + 0.052*"александрович" + 0.046*"мюнхен" + 0.044*"леонид" + 0.041*"европа" + 0.035*"конгресс" + 0.035*"ницца" + 0.035*"союз" + 0.033*"уродить" + 0.030*"архив"'),
 (79,
  '0.043*"итальянский" + 0.030*"“" + 0.029*"„" + 0.021*"исполнитель" + 0.017*"руб" + 0.017*"музыкант" + 0.011*"тираж" + 0.011*"участник" + 0.011*"экземпляр" + 0.009*"рим"'),
 (164,
  '0.077*"фе

Gensim с ngrams и TfIdf

In [None]:
# Learn collocations with NPMI scoring; the threshold can be tuned.
ph = gensim.models.Phrases(sentences=texts, threshold=0.4, scoring='npmi')
# Wrap the trained model in a Phraser for applying it to texts.
p = gensim.models.phrases.Phraser(ph)

In [None]:
def make_phrases(texts):
    """Apply the module-level phraser ``p`` to each tokenized text; return a list."""
    phrased = []
    for tokens in texts:
        phrased.append(p[tokens])
    return phrased

In [None]:
# Token lists with detected collocations merged into single tokens.
ngrammed_texts_2 = make_phrases(texts=texts)

In [None]:
# Vocabulary of the n-gram + tf-idf model.
dictinary_ngtfidf = gensim.corpora.Dictionary(documents=ngrammed_texts_2)

In [None]:
# Drop tokens seen in fewer than 10 documents or in more than 10% of them.
dictinary_ngtfidf.filter_extremes(no_below=10, no_above=0.1)
# Reassign token ids so that they are contiguous again.
dictinary_ngtfidf.compactify()

In [None]:
# Rebinds `texts_ng` (used earlier for the n-gram-only model) to the token
# lists produced for this model; no copy is made.
texts_ng = ngrammed_texts_2

In [None]:
# Bag-of-words corpus before tf-idf weighting.
corpus_pre = list(map(dictinary_ngtfidf.doc2bow, texts_ng))

In [None]:
# Fit tf-idf weights on the n-gram bag-of-words corpus, then reweight it.
tfidf = gensim.models.TfidfModel(corpus=corpus_pre, id2word=dictinary_ngtfidf)
corpus_ngtfidf = tfidf[corpus_pre]

In [None]:
# LDA over the n-gram + tf-idf corpus. NOTE: 500 topics here, whereas the other
# three models in this notebook use 200.
lda_ngtfidf = gensim.models.LdaModel(
    corpus=corpus_ngtfidf,
    num_topics=500,
    id2word=dictinary_ngtfidf,
    passes=5,
)

In [None]:
lda_ngtfidf.print_topics()

[(445,
  '0.081*"академик" + 0.049*"1843" + 0.040*"помешать" + 0.019*"петербург" + 0.008*"портрет" + 0.008*"серебряный_медаль" + 0.000*"внешний" + 0.000*"николай" + 0.000*"живопись" + 0.000*"академия"'),
 (82,
  '0.220*"игра_1994" + 0.126*"зимний_олимпийский" + 0.111*"лиллехамера_норвегия" + 0.036*"свидетельствовать" + 0.035*"пруссия" + 0.031*"офицерский" + 0.027*"базироваться" + 0.023*"датский" + 0.022*"долина" + 0.021*"исторически"'),
 (209,
  '0.240*"песня" + 0.086*"любовь" + 0.081*"певица" + 0.076*"записать" + 0.059*"релиз" + 0.042*"выпустить" + 0.039*"студия" + 0.037*"издать" + 0.024*"включить" + 0.023*"первоначальный"'),
 (117,
  '0.075*"фильм" + 0.034*"режиссёр" + 0.024*"сериал" + 0.020*"актёр" + 0.015*"деньга" + 0.014*"родитель" + 0.014*"пьеса" + 0.014*"нью-йорк" + 0.014*"роль" + 0.013*"я"'),
 (36,
  '0.108*"княжество" + 0.090*"преступление" + 0.070*"преступник" + 0.053*"раб" + 0.049*"лишение_свобода" + 0.035*"ночью" + 0.032*"серийный" + 0.027*"известно_что" + 0.027*"убийство" 

Примечание: эти темы мне, конечно, нравятся меньше, чем предыдущие. Но я меняла все возможные параметры, и это самый адекватный вариант из всего, что я увидела среди топиков. Здесь хотя бы немалая часть тем выглядит адекватной.