# Загрузка данных

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('bbc.csv')

In [3]:
data.loc[0]['description']

'The Ukrainian president says the country will not forgive or forget those who murder its civilians.'

In [4]:
texts = data['description']

In [5]:
texts

0        The Ukrainian president says the country will ...
1        Jeremy Bowen was on the frontline in Irpin, as...
2        One of the world's biggest fertiliser firms sa...
3        The parents of the Manchester Arena bombing's ...
4        Consumers are feeling the impact of higher ene...
                               ...                        
35855    Lauren Bell takes a career-best 5-37 as Englan...
35856    Andy Murray begins his Wimbledon farewell on T...
35857    Emma Raducanu completes a dominant 6-1 6-2 vic...
35858    John White did the League and Cup Double and w...
35859    The National Rally party is now a dominant for...
Name: description, Length: 35860, dtype: object

# Обработка данных и обучение LDA модели

### Я делю датасет на 10 тем

In [6]:
import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_string

In [30]:
tokenized_texts = [preprocess_string(text) for text in texts]

dictionary = Dictionary(tokenized_texts)

for text in tokenized_texts:
    if 'sai' in text:
        text.remove('sai')
    if 'peopl' in text:
        text.remove('peopl')
    if 'year' in text:
        text.remove('year')
    if 'bbc' in text:
        text.remove('bbc')
    if 'new' in text:
        text.remove('new')

corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)

print(lda_model.print_topics(num_words=3))

[(0, '0.007*"polic" + 0.007*"attack" + 0.006*"old"'), (1, '0.012*"elect" + 0.008*"parti" + 0.007*"leader"'), (2, '0.015*"england" + 0.014*"win" + 0.013*"world"')]


# Вывод самых популярных слов из каждой темы

In [31]:
for i, theme_words in enumerate(lda_model.print_topics(num_words=5)):
    print(f'Тема {i+1}: ')
    words = theme_words[1].split(' + ') 
    for word_with_weight in words:
        word, weight = word_with_weight.split('*')
        word = word.strip('"') 
        print(f'  {word} ({weight})')

Тема 1: 
  0.007 ("polic")
  0.007 ("attack")
  0.006 ("old")
  0.005 ("kill")
  0.005 ("di")
Тема 2: 
  0.012 ("elect")
  0.008 ("parti")
  0.007 ("leader")
  0.007 ("labour")
  0.006 ("gener")
Тема 3: 
  0.015 ("england")
  0.014 ("win")
  0.013 ("world")
  0.012 ("final")
  0.011 ("leagu")


# Метрика UMass

In [32]:
from gensim.models.coherencemodel import CoherenceModel
coherence_model_umass = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=dictionary, coherence='u_mass')
coherence_score_umass = coherence_model_umass.get_coherence()
print(f'Coherence score (UMass): {coherence_score_umass}')

Coherence score (UMass): -4.177533785442077


# Удобная интерактивная визуализация для LDA модели

In [33]:
import pyLDAvis
import pyLDAvis.gensim_models

In [34]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [13]:
pyLDAvis.save_html(vis, 'bbc_lda_visualization.html')
