# Topic Modelling

### Topic modelling finds patterns in large texts and defines what are the mains topics in the text
 - gensim, a library mainly dedicated to Topic Modelling

In [1]:
import pandas as pd
import numpy as np
import pickle
import textwrap
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
eng_stopwords = stopwords.words("english")
porterStemmer = PorterStemmer()

In [3]:
def wrap_text(text, max_cols=80):
    text = textwrap.fill(text, width=max_cols)
    print(text)

### dataset containing news of different topics

In [4]:
news_file = "../DATA/google_news.pkl"

with open(news_file, "rb") as f:
    news = pickle.load(f)

In [5]:
news.head()

Unnamed: 0,news
0,It's hard to find a word in the English langua...
1,"Fan Bingbing, the popular Chinese actress who ..."
2,"As CEO of Tesla, Elon Musk's choice of wheels ..."
3,China's People's Liberation Army on Thursday r...
4,"SEOUL, South Korea (AP) — North Korea said Sat..."


In [6]:
wrap_text(news.news[0])

It's hard to find a word in the English language that conjures a more visceral
reaction from investors than " bubble."    This market phenomenon has been
associated with every boom-and-bust cycle throughout history, and has
consistently preyed on the overzeal… [+2826 chars]David Rosenberg, chief
economist and strategist at Gluskin Sheff, sounds the alarm on a Federal
Reserve-induced bubble he sees expanding within the US economy. He points to
inflated corporate balance sheets and looming debt servicing payments in order
to bols…'Follow the bubble': Famed investor David Rosenberg breaks down the
unique way he thinks the next US recession will unfold


In [7]:
# text preprocessing 
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in eng_stopwords]
    
    return tokens

In [9]:
# add new col with tokens 
news["tokens"] = news.news.apply(clean_text)

In [10]:
# peek
news.head(3)

Unnamed: 0,news,tokens
0,It's hard to find a word in the English langua...,"[hard, find, word, english, language, conjures..."
1,"Fan Bingbing, the popular Chinese actress who ...","[fan, bingbing, popular, chinese, actress, mys..."
2,"As CEO of Tesla, Elon Musk's choice of wheels ...","[ceo, tesla, elon, musk, choice, wheels, wo, h..."


## 1. Latent Semantic Analysis (LSA)

Latent Semantic Analysis (LSA) analyzes relationships between a set of documents and the terms they contain by producing a set of concepts (= the topics) related to the documents and terms.
 
You can see it as a kind of PCA applied to your documents. Sometimes, it is also called Latent Semantic Indexing (LSI).


There are two steps in a LSA computation :

- TF-IDF matrix 
- Singular Value Decomposition (the same technique used in PCA)

⚠️  Like in a PCA, the topics don't have an actual meaning : they are more like a combination of words !

In [11]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary

In [12]:
# Compute the dictionary: this is a dictionary mapping words and their corresponding numbers for later visualisation
word2dict = Dictionary(news.tokens)

In [13]:
i = 1
for key,val in word2dict.items():
    print(f"key : {key}, value : {val}")
    if i == 5:
        break
    i +=1

key : 0, value : alarm
key : 1, value : associated
key : 2, value : balance
key : 3, value : breaks
key : 4, value : bubble


In [14]:
# corpus
corpus = news.tokens
# Create a BOW
bow = [word2dict.doc2bow(token) for token in corpus]
# bow for first doc
print(bow[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1)]


In [20]:
len(set(corpus[0]))

54

In [15]:
# word index = 0, appears 1 time in doc 0 | word index = 4, appears 3 times in doc 0
word2dict[0], word2dict[4]

('alarm', 'bubble')

In [22]:
# Compute a TF-IDF for each token in each document
tfidfModel = TfidfModel(bow)
tfidf = tfidfModel[bow]
# peek tf-idf for each token in doc 0
list(tfidf)[0]

[(0, 0.13086864839025095),
 (1, 0.13086864839025095),
 (2, 0.11328573264418323),
 (3, 0.13086864839025095),
 (4, 0.3926059451707528),
 (5, 0.00818772335862207),
 (6, 0.09004238232941891),
 (7, 0.13086864839025095),
 (8, 0.13086864839025095),
 (9, 0.11328573264418323),
 (10, 0.13086864839025095),
 (11, 0.22657146528836647),
 (12, 0.13086864839025095),
 (13, 0.13086864839025095),
 (14, 0.11328573264418323),
 (15, 0.11328573264418323),
 (16, 0.0957028168981155),
 (17, 0.13086864839025095),
 (18, 0.13086864839025095),
 (19, 0.11328573264418323),
 (20, 0.0957028168981155),
 (21, 0.13086864839025095),
 (22, 0.10300038627939405),
 (23, 0.10300038627939405),
 (24, 0.13086864839025095),
 (25, 0.11328573264418323),
 (26, 0.08541747053332634),
 (27, 0.13086864839025095),
 (28, 0.13086864839025095),
 (29, 0.07513212416853716),
 (30, 0.08150716332640358),
 (31, 0.13086864839025095),
 (32, 0.10300038627939405),
 (33, 0.13086864839025095),
 (34, 0.10300038627939405),
 (35, 0.13086864839025095),
 (36,

In [None]:
# LSA model compute topics
lsa = LsiModel(corpus=tfidf, num_topics=5, id2word=word2dict)

In [None]:
# main topics
lsa.print_topics()

[(0,
  '0.539*"news" + 0.323*"transfer" + 0.225*"latest" + 0.219*"live" + 0.197*"minute" + 0.197*"bulletin" + 0.181*"five" + 0.180*"bbc" + 0.143*"world" + 0.129*"gossip"'),
 (1,
  '-0.406*"bulletin" + -0.406*"minute" + -0.375*"bbc" + -0.372*"five" + -0.290*"world" + 0.225*"transfer" + -0.203*"gmt" + 0.152*"live" + -0.145*"latest" + 0.104*"news"'),
 (2,
  '-0.310*"era" + -0.210*"nl" + -0.188*"al" + -0.157*"probables" + -0.157*"article" + -0.155*"edt" + -0.152*"pitching" + -0.151*"saturday" + -0.142*"central" + -0.141*"sox"'),
 (3,
  '0.314*"assets" + 0.236*"coinbase" + 0.222*"crypto" + 0.157*"boasts" + 0.157*"mouth" + 0.157*"management" + 0.157*"approximately" + 0.157*"speak" + 0.157*"grayscale" + 0.157*"firm"'),
 (4,
  '-0.239*"show" + 0.226*"assets" + -0.221*"full" + 0.170*"coinbase" + -0.133*"facebook" + -0.130*"technology" + 0.124*"crypto" + -0.122*"amazon" + -0.116*"best" + 0.113*"firm"')]

## 2. Latent Dirichlet Allocation (LDA)


Latent Dirichlet Allocation (LDA) is, in a way, an improvement of the LSA. The problem with LSA is that it needs large corpuses of documents to be accurate enough.

LDA is a probabilistic model (Bayesian probabilities) that allows more flexibility on the size of the dataset.

👉🏻 LDA makes two main assumptions:

- **Mixture**: each document is a mixture of topics
- **Sparsity**: each document covers a small set of topics, and each uses only a small subset of words frequently

👉🏻 Then the LDA algorithm follows the following steps:

- Initialization: assign to each document a random (sparse) distribution of topics, and to each a random (sparse) distribution of words
- For each word in each document, compute the most likely topic (according to other words in that document)
- Repeat step 2 until convergence or iteration limit

<figure>
 <img src="../IMAGES/LDA_image.jpeg"   style="width:600px;height:300px;">
</figure>


👉🏻 Expected LDA Output:

- Topic A = 30% dog, 20% frog, 20& insect, 5% cute... = ANIMALS
- Topic B = 30% Olympics, 20% players, 20% beat, 10% corner, 10% Dota, 2% dog = SPORTS
- Topic C = 30% AI, 20% flying, 15% cars, 10% driven, 5% beat, Dota, players = TECH

In [31]:
from gensim.models.ldamodel import LdaModel

In [32]:
# LDA model compute topics
lda = LdaModel(corpus=tfidf, id2word=word2dict, num_topics=4, random_state=42, passes=10)

In [None]:
# main topics
lda.print_topics()

[(0,
  '0.003*"news" + 0.002*"bbc" + 0.002*"minute" + 0.002*"bulletin" + 0.002*"five" + 0.002*"world" + 0.002*"latest" + 0.001*"transfer" + 0.001*"robot" + 0.001*"gmt"'),
 (1,
  '0.002*"news" + 0.001*"technology" + 0.001*"transfer" + 0.001*"assets" + 0.001*"show" + 0.001*"crypto" + 0.001*"starlet" + 0.001*"companies" + 0.001*"live" + 0.001*"wright"'),
 (2,
  '0.002*"news" + 0.002*"transfer" + 0.001*"sign" + 0.001*"live" + 0.001*"cartoon" + 0.001*"gunners" + 0.001*"dow" + 0.001*"apple" + 0.001*"full" + 0.001*"ozil"'),
 (3,
  '0.001*"facebook" + 0.001*"one" + 0.001*"instagram" + 0.001*"amazon" + 0.001*"whatsapp" + 0.001*"new" + 0.001*"chicago" + 0.001*"alexa" + 0.001*"friday" + 0.001*"delivery"')]

In [34]:
# with sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [39]:
# compute tokens to tf-idf
tfidfVectorizer = TfidfVectorizer(analyzer= lambda x: x)
tfidf_sklearn = tfidfVectorizer.fit_transform(news.tokens)

In [40]:
pd.DataFrame(data=tfidf_sklearn.toarray(), columns=tfidfVectorizer.get_feature_names_out()).head()

Unnamed: 0,aaron,abandon,abandoning,ability,able,abrupt,abruptly,accept,access,accessed,...,yet,yichuan,york,youre,youtube,youve,zaha,zhong,zooming,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110189
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.075,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
from gensim.matutils import Sparse2Corpus

In [42]:
# Convert the sklearn sparse matrix to the needed input for Gensim
sparse2Corpus = Sparse2Corpus(sparse=tfidf_sklearn, documents_columns=False)

In [45]:
# LDA v2 compute topics
lda_v2 = LdaModel(corpus= sparse2Corpus, num_topics=5, id2word=word2dict, passes=10)

In [46]:
lda_v2.print_topics()

[(0,
  '0.009*"demonstrators" + 0.005*"roundup" + 0.004*"summertime" + 0.003*"gas" + 0.002*"latest" + 0.002*"app" + 0.002*"serious" + 0.002*"fiction" + 0.002*"components" + 0.002*"value"'),
 (1,
  '0.001*"help" + 0.001*"lucas" + 0.001*"said" + 0.001*"football" + 0.001*"eroded" + 0.001*"mostly" + 0.001*"account" + 0.001*"seriouslyjust" + 0.001*"ramping" + 0.001*"thanks"'),
 (2,
  '0.001*"kidney" + 0.001*"account" + 0.001*"writer" + 0.001*"today" + 0.001*"grayscale" + 0.001*"rented" + 0.001*"personal" + 0.001*"thanks" + 0.001*"pests" + 0.001*"response"'),
 (3,
  '0.002*"sky" + 0.002*"asbestos" + 0.002*"help" + 0.001*"bingbing" + 0.001*"need" + 0.001*"schmeichel" + 0.001*"central" + 0.001*"wan" + 0.001*"presidential" + 0.001*"abrupt"'),
 (4,
  '0.001*"easily" + 0.001*"help" + 0.001*"others" + 0.001*"pennsylvania" + 0.001*"swing" + 0.001*"sketch" + 0.001*"winter" + 0.001*"appeal" + 0.001*"carrier" + 0.001*"ftc"')]

⚠️ Careful : documents are represented in columns in gensim TD-IDF sparse matrix, while documents are represented in rows in sklearn TD-IDF sparse matrix. 

IfTo use sklearn TF-IDF with gensim LDA, you should set documents_columns=False

## LDA visualization

In [47]:
import pyLDAvis
from pyLDAvis import gensim
pyLDAvis.enable_notebook()

In [50]:
viz = gensim.prepare(topic_model=lda_v2, corpus=bow, dictionary=word2dict)

In [51]:
viz

⚠️ with pyLDAvis.gensim.prepare, the corpus is a BOW, not a TF-IDF