# Topic Modeling
- Topic modeling is a type of statistical modeling for discovering the abstract "topics" that occur in a collection of documents.

In [None]:

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [None]:
# load the news articles dataset from a CSV file into a DataFrame
data = pd.read_csv('news_articles.csv')
# preview the first rows of the loaded data
data.head()

In [None]:
# print a summary of the DataFrame (columns, dtypes, non-null counts)
data.info()

In [None]:
# select the 'content' column, which is cleaned and tokenized below
articles = data['content']

In [None]:
# text cleanup
# after this cell, every entry of `articles` is a list of stemmed tokens

# lowercase and remove special characters (anything that is not a word
# character or whitespace); missing values become empty strings first so
# re.sub never receives a NaN float
articles = articles.fillna('').astype(str).str.lower()
articles = articles.apply(lambda x: re.sub(r'[^\w\s]', '', x))

# tokenization + stop words removal in a single pass
# (the text is tokenized once instead of tokenize -> join -> tokenize again;
# a set gives constant-time membership tests instead of scanning the list
# of stop words for every token)
en_stopwords = stopwords.words('english')
en_stopword_set = set(en_stopwords)
articles = articles.apply(
    lambda x: [word for word in word_tokenize(x) if word not in en_stopword_set]
)

# stemming (done for speed as we have a lot of data)
ps = PorterStemmer()
articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [None]:
# preview the first processed articles
articles.head()

In [None]:
# create dictionary of every unique word in our dataset
# (gensim's Dictionary maps each unique token to an integer id)
# imported here because this cell runs before the gensim import cell further down
import gensim.corpora as corpora

dictionary = corpora.Dictionary(articles)
print(dictionary)

In [None]:
# create document term matrix: one bag-of-words per article,
# i.e. a list of (token_id, count) pairs
doc_term_matrix = [dictionary.doc2bow(doc) for doc in articles]
# preview only the first few documents; printing the whole matrix floods
# the output on a large corpus
print(doc_term_matrix[:5])

## Latent Dirichlet Allocation (LDA)
- LDA is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.
- LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics.
- Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities.
- The topic probabilities are drawn from a Dirichlet distribution, and the words are drawn from the topics.
- LDA assumes that each word of a document is generated by first drawing a topic at random from that document's topic distribution and then drawing a word at random from that topic's word distribution.

In [None]:
import gensim
import gensim.corpora as corpora

In [None]:
# set number of topics we want to extract
# (passed as num_topics to the LDA and LSA models built below)
num_topics = 2

In [None]:
# create LDA model using gensim
# random_state fixes the seed so the extracted topics are reproducible
# between runs (LDA training is otherwise randomly initialised)
lda_model = gensim.models.LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
)


In [None]:
# show the 5 highest-weighted words of each LDA topic
lda_model.print_topics(num_topics=num_topics, num_words=5)

## LSA - Latent Semantic Analysis
- LSA is a technique in natural language processing, in particular in distributional semantics, for analyzing the relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms.
- LSA assumes that words that are close in meaning will occur in similar pieces of text.
- A matrix containing word counts per paragraph (rows represent unique words and columns represent each paragraph) is constructed from a large piece of text and a mathematical technique called singular value decomposition (SVD) is used to reduce the number of rows while preserving the similarity structure among columns.

In [None]:
from gensim.models import LsiModel

In [None]:
# create LSA model (gensim names it LsiModel) from the same corpus,
# dictionary and number of topics used for the LDA model
lsa_model = LsiModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=num_topics)

In [None]:
# show the 5 highest-weighted words of each LSA topic
lsa_model.print_topics(num_topics=num_topics, num_words=5)

In [None]:
# determining the optimal number of topics
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
# coherence score of each candidate model, in training order
coherence_values = []
# the trained models, parallel to coherence_values
model_list = []

# candidate topic counts are range(min_topics, max_topics),
# so max_topics itself is excluded
min_topics = 2
max_topics = 11

In [None]:
# start from empty lists so re-running this cell does not append duplicate
# entries (which would make the lists longer than the range of topic counts)
coherence_values.clear()
model_list.clear()

# train one LSA model per candidate number of topics and record its
# c_v coherence score
for num_topics_i in range(min_topics, max_topics):
    model = LsiModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=num_topics_i)
    model_list.append(model)
    coherence_model = CoherenceModel(model=model, texts=articles, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherence_model.get_coherence())

In [None]:
# plot coherence score (y axis) against number of topics (x axis)
plt.plot(range(min_topics, max_topics), coherence_values)

In [None]:
# number of topics chosen after inspecting the coherence plot above
# NOTE(review): 9 is hard-coded; re-check it against the plot whenever the
# data or the preprocessing changes
final_num_topics = 9
# retrain LSA with the chosen number of topics and show the 10
# highest-weighted words of each topic
final_lsa_model = LsiModel(corpus=doc_term_matrix, id2word=dictionary, num_topics=final_num_topics)
final_lsa_model.print_topics(num_topics=final_num_topics, num_words=10)

> IMPORTANT
> - sometimes the most mathematically accurate number of topics is not the most valuable for the business
> - sometimes the most valuable number of topics is not the most mathematically accurate
> - sometimes it is worth checking manually how the topics look when you use different numbers of topics
> - use coherence scores as well as your intuition and knowledge of the project and the business you work in