# Latent Dirichlet Allocation

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/quora_questions.csv')

In [None]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


## Preprocessing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [None]:
dtm = cv.fit_transform(df['Question'])

In [None]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2002912 stored elements and shape (404289, 38669)>

## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [None]:
# This can take awhile, we're dealing with a large amount of documents!
LDA.fit(dtm)

In [11]:
len(cv.get_feature_names_out())

38669

In [12]:
import random

In [13]:
for i in range(10):
    random_word_id = random.randint(0, len(cv.get_feature_names_out()) - 1)
    print(cv.get_feature_names_out()[random_word_id])

hydrate
deter
meeting
cryptocurrency
handsome
misspelt
isolation
frequent
brahmacharya
assembly


In [14]:
for i in range(10):
    random_word_id = random.randint(0, len(cv.get_feature_names_out()) - 1)
    print(cv.get_feature_names_out()[random_word_id])

glue
tonic
bayer
mp4
p15
optimizely
mona
adjusted
mosul
granger


### Showing Top Words Per Topic

In [15]:
len(LDA.components_)

7

In [16]:
LDA.components_

array([[ 0.14302751,  0.18117197,  0.14642241, ...,  0.14285724,
         2.14149267,  0.14285724],
       [ 0.14359789, 68.66894325,  0.14285793, ...,  0.14285728,
         0.14285797,  0.14285728],
       [ 5.30622659,  0.15008675,  0.14285791, ...,  2.14285648,
         0.14285796,  2.14285648],
       ...,
       [ 0.14325168,  0.14313396,  0.14285777, ...,  0.14285725,
         0.1428578 ,  0.14285725],
       [ 0.24759557,  0.14293736,  2.13928835, ...,  0.14285723,
         0.14421791,  0.14285723],
       [26.64869167,  0.14315453,  0.14285781, ...,  0.14285726,
         0.14285784,  0.14285726]])

In [17]:
len(LDA.components_[0])

38669

In [18]:
single_topic = LDA.components_[0]

In [19]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([ 6365, 34210, 23296, ..., 26057, 17507,  4632])

In [20]:
# Word least representative of this topic
single_topic[18302]

np.float64(0.14285731762694745)

In [21]:
# Word most representative of this topic
single_topic[single_topic.argsort()[-1]]

np.float64(11757.452048619132)

In [22]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([15073,  2957,  2675, 12200, 11005, 15060, 36480, 26057, 17507,
        4632])

In [23]:
top_word_indices = single_topic.argsort()[-10:]

In [24]:
for index in top_word_indices:
    print(cv.get_feature_names_out()[index])

google
app
android
engineering
does
good
use
phone
india
best


In [25]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['free', 'company', 'using', 'mobile', 'software', 'google', 'app', 'android', 'engineering', 'does', 'good', 'use', 'phone', 'india', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['ways', 'programming', 'stop', 'language', 'improve', '1000', 'notes', 'online', '500', 'english', 'make', 'way', 'learn', 'money', 'best']


THE TOP 15 WORDS FOR TOPIC #2
['safe', 'book', 'water', 'did', 'compare', 'travel', 'average', 'energy', 'india', 'books', 'best', 'good', 'time', 'does', 'life']


THE TOP 15 WORDS FOR TOPIC #3
['make', 'movies', 'thing', 'does', 'question', 'old', 'movie', 'year', 'things', 'questions', 'best', 'know', 'new', 'people', 'quora']


THE TOP 15 WORDS FOR TOPIC #4
['country', 'email', 'differences', 'rid', 'password', 'car', 'number', 'instagram', 'increase', 'india', 'job', 'does', 'difference', 'facebook', 'account']


THE TOP 15 WORDS FOR TOPIC #5
['girl', 'sex', 'long', 'donald', 'india', 'feel', 'love', 'think', 'mean', 'people', 'world', 'tr

**Attaching Discovered Question Labels to Original Question**

In [26]:
dtm

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2002912 stored elements and shape (404289, 38669)>

In [27]:
dtm.shape

(404289, 38669)

In [28]:
len(df)

404289

In [29]:
topic_results = LDA.transform(dtm)

In [30]:
topic_results.shape

(404289, 7)

In [31]:
topic_results[0]

array([0.01789911, 0.31645888, 0.59414638, 0.01786013, 0.0178879 ,
       0.01787348, 0.01787413])

In [32]:
topic_results[0].round(2)

array([0.02, 0.32, 0.59, 0.02, 0.02, 0.02, 0.02])

In [33]:
topic_results[0].argmax()

np.int64(2)

**This means that our model thinks that the first question belongs to topic #2.**

### Combining with Original Data

In [34]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [35]:
topic_results.argmax(axis=1)

array([2, 4, 0, ..., 0, 6, 5])

In [36]:
df['Question'] = topic_results.argmax(axis=1)

In [37]:
df.head(10)

Unnamed: 0,Question
0,2
1,4
2,0
3,5
4,2
5,2
6,4
7,2
8,0
9,0
