### 7.2.1 Implementation of LDA

In [120]:
# Toy corpus: three single-word documents (each word repeated 10 times) and a
# fourth document that mixes 10 x 'bread' with 10 x 'milk'.
corpus = [
    ' '.join(['bread'] * 10),
    ' '.join(['milk'] * 10),
    ' '.join(['pet'] * 10),
    ' '.join(['bread'] * 10 + ['milk'] * 10),
]

In [121]:
from sklearn.feature_extraction.text import CountVectorizer

In [122]:
vec = CountVectorizer()

In [123]:
matrix_x = vec.fit_transform(corpus)
matrix_x

<4x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [124]:
matrix_x.toarray()

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10],
       [10, 10,  0]])

In [125]:
from sklearn.decomposition import LatentDirichletAllocation

In [126]:
# n_components : number of topics
lda = LatentDirichletAllocation(n_components = 2)

In [127]:
lda.fit(matrix_x)

LatentDirichletAllocation(n_components=2)

In [128]:
# components_ holds what LDA has extracted, i.e. the topics
# array[0] : 1st topic, array[1] : 2nd topic
# values represent the relevance of each word to each topic
# soft clustering: every word belongs to every topic (no value is exactly 0)
# values are unnormalized weights, not probabilities; divide each row by its sum to get per-topic word probabilities
lda.components_

array([[10.45052273, 20.49672838,  0.50135735],
       [10.54947727,  0.50327162, 10.49864265]])

In [129]:
vec.vocabulary_

{'bread': 0, 'milk': 1, 'pet': 2}

In [130]:
# Vocabulary terms ordered by column index, so features[i] is the word for column i.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
features = (vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
            else vec.get_feature_names())

In [131]:
# Words that rank highly within the same topic are strongly associated with each other.

for topic_id, topic in enumerate(lda.components_):
    # argsort() sorts ascending; reversing it ranks the word IDs from the
    # highest weight to the lowest (drop the [::-1] for ascending order).
    ranked_ids = topic.argsort()[::-1]

    print('topic:',topic_id)
    print('word IDs:', ranked_ids)
    print('words:', [features[word_id] for word_id in ranked_ids])
    print('prob:', [topic[word_id] for word_id in ranked_ids])
    print('----------')
    

topic: 0
word IDs: [1 0 2]
words: ['milk', 'bread', 'pet']
prob: [20.49672838354807, 10.450522734138774, 0.5013573459416129]
----------
topic: 1
word IDs: [0 2 1]
words: ['bread', 'pet', 'milk']
prob: [10.549477265861215, 10.498642654058381, 0.5032716164519231]
----------


### 7.2.2 Practical w/ topic modeling on UCI repo

In [132]:
# Read the whole dataset file as text. The `with` block closes the file
# handle as soon as the contents are in memory.
with open('dataset2.csv', encoding='Latin-1', errors='ignore') as dataset_file:
    corpus = dataset_file.read()

In [133]:
docs = corpus.split('\n')

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [135]:
vec = TfidfVectorizer()

In [136]:
matrix_x = vec.fit_transform(docs)

In [137]:
from sklearn.decomposition import LatentDirichletAllocation

In [138]:
# n_components = 6 : soft-cluster the 2457 vocabulary features into 6 topics
# NOTE(review): matrix_x here holds tf-idf weights rather than raw counts — confirm that is intended for LDA
lda = LatentDirichletAllocation(n_components = 6)

In [139]:
lda.fit(matrix_x)

LatentDirichletAllocation(n_components=6)

In [140]:
lda.components_

array([[0.16667054, 0.16667115, 1.43506932, ..., 0.16666861, 0.16667798,
        0.16666862],
       [0.38845029, 0.16667161, 0.16667374, ..., 0.16666878, 0.90893044,
        0.16666883],
       [0.16666894, 0.73995469, 0.16998067, ..., 0.6662572 , 0.17834019,
        0.16666782],
       [0.16667293, 0.25304891, 0.16667721, ..., 0.16666984, 0.16668512,
        0.16666989],
       [0.16667193, 0.81513356, 0.16754553, ..., 0.16716248, 0.418654  ,
        0.16666943],
       [0.48677208, 0.16667243, 0.45149646, ..., 0.16666909, 0.16668098,
        0.59446138]])

In [141]:
# Vocabulary terms ordered by column index, so features[i] is the word for column i.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
features = (vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
            else vec.get_feature_names())

In [142]:
for topic_id, topic in enumerate(lda.components_):
    # [:-10:-1] walks backwards from the largest weight and stops before
    # position -10, so it selects the nine highest-weighted word IDs.
    top_ids = topic.argsort()[:-10:-1]

    print('topic:',topic_id)
    print('word IDs:', top_ids)
    print('word:', [features[word_id] for word_id in top_ids])
    print('prob:', [topic[word_id] for word_id in top_ids])
    print('-----------')

topic: 0
word IDs: [2450 2366 2212 2176  183  418 2402  108 2180]
word: ['you', 'we', 'to', 'thank', 'back', 'come', 'will', 'and', 'the']
prob: [14.054962674997867, 11.03380166915811, 9.840805811325547, 9.216784656611988, 9.015972235779595, 8.623706022654037, 7.900634288573023, 7.363489555331691, 6.353311429442956]
-----------
topic: 1
word IDs: [2180  108 2327 1471 2422 2414 2450 1176 2212]
word: ['the', 'and', 'very', 'nice', 'wonderful', 'with', 'you', 'is', 'to']
prob: [7.133862391863483, 6.30406582807773, 6.201338192644753, 4.831321568984839, 4.649677338075756, 4.560977246144773, 4.366533946019295, 4.238494131511153, 4.212166337229894]
-----------
topic: 2
word IDs: [2180  108 2366 2212  894 1176 1512 2450 2355]
word: ['the', 'and', 'we', 'to', 'for', 'is', 'of', 'you', 'was']
prob: [45.62949719712179, 26.541864876530042, 22.791322876233355, 20.56242191726232, 17.9615040901952, 16.431664798169834, 15.113422665286507, 14.914866652927355, 14.530533336895674]
-----------
topic: 3
wo

### 7.2.3 Implementing LDA w/ different hyper-parameters

In [158]:
# Toy corpus: three single-word documents (each word repeated 10 times) and a
# fourth document that mixes 10 x 'bread' with 10 x 'milk'.
corpus = [
    ' '.join(['bread'] * 10),
    ' '.join(['milk'] * 10),
    ' '.join(['pet'] * 10),
    ' '.join(['bread'] * 10 + ['milk'] * 10),
]

In [159]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_x = vec.fit_transform(corpus)

In [191]:
from sklearn.decomposition import LatentDirichletAllocation

# doc_topic_prior : alpha, topic_word_prior : beta
# if we decrease topic_word_prior, the difference b/w the highest prob_val and lowest prob_val in a topic increases
# if we increase topic_word_prior, the difference b/w the highest prob_val and lowest prob_val in a topic decreases

# if we increase doc_topic_prior, the difference b/w values in transform decreases, i.e. a document is represented by more topics with similar probs
# if we decrease doc_topic_prior, the difference b/w values in transform increases, i.e. a document is represented by only a few topics with higher probs
lda = LatentDirichletAllocation(n_components = 2, topic_word_prior = 1.0, doc_topic_prior=0.01)
lda.fit(matrix_x)

LatentDirichletAllocation(doc_topic_prior=0.01, n_components=2,
                          topic_word_prior=1.0)

In [177]:
lda.components_

array([[ 8.4532381 , 16.36454154,  3.56617907],
       [13.5467619 ,  5.63545846,  8.43382093]])

In [178]:
# Show each topic's word weights from largest to smallest.
for topic in lda.components_:
    descending_ids = topic.argsort()[::-1]
    print(list(topic[descending_ids]))

[16.36454154468869, 8.453238103397524, 3.5661790723599482]
[13.546761896602463, 8.433820927640042, 5.635458455311296]


In [192]:
# per-document topic distribution: one row per document, one column per topic
# the higher a value, the more influence that topic has on the document
lda.transform(matrix_x)

array([[9.99001996e-01, 9.98003992e-04],
       [9.98003992e-04, 9.99001996e-01],
       [9.99001996e-01, 9.98003992e-04],
       [4.99500500e-04, 9.99500500e-01]])

### 7.2.4 Online LDA with UCI Repo Dataset

In [280]:
# Read the whole dataset file as text. The `with` block closes the file
# handle as soon as the contents are in memory.
with open('dataset2.csv', encoding='Latin-1', errors='ignore') as dataset_file:
    corpus = dataset_file.read()

In [281]:
docs = corpus.split('\n')

In [282]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_x = vec.fit_transform(docs)

In [286]:
from sklearn.decomposition import LatentDirichletAllocation

# learning_method='online': the model learns incrementally from batches, which helps when the data is too big to process at once
# learning_offset: positive value that keeps in check how much the model is modified after receiving another batch
# (scikit-learn documents it as down-weighting early iterations and recommends a value greater than 1.0)
lda = LatentDirichletAllocation(n_components = 2, max_iter = 200, learning_method='online', learning_offset=4)

In [200]:
# partial_fit : updates the existing model with the given batch, retaining what it learned from earlier calls instead of retraining from scratch
lda.partial_fit(matrix_x)

LatentDirichletAllocation(learning_method='online', max_iter=200,
                          n_components=2)

In [201]:
lda.components_

array([[ 2.18206652,  6.32160862,  1.16951975, ...,  0.50322718,
         0.74336439,  1.33158447],
       [ 0.52428204,  0.56787265,  3.21961333, ...,  3.06230048,
        10.45770662,  0.52158981]])

In [284]:
# partial_fit is fed one batch at a time, so the data is divided into batches:
# the batch size is one tenth of the number of documents, rounded down.
step = matrix_x.shape[0] // 10
step

65

In [287]:
# Feed the documents to the online LDA model in 10 consecutive batches of
# `step` rows, printing the topic-word weights after every update.
n_batches = 10
index = 0
for i in range(n_batches):
    if i == n_batches - 1:
        # Final batch: take every remaining row so that rows beyond
        # step * n_batches are not dropped.
        batch = matrix_x[index:]
    else:
        batch = matrix_x[index:index+step]

    # The if/else guarantees exactly one update per batch; without the else
    # the final iteration would update the model twice with overlapping rows.
    lda.partial_fit(batch)
    index = index + step

    print('\niteration:',i)
    print(lda.components_)


iteration: 0
[[0.93031017 0.95506432 0.74711555 ... 0.69658036 0.82972445 0.78840153]
 [0.79832566 0.87657044 0.85943474 ... 0.8314769  0.83904556 0.77254225]]

iteration: 1
[[8.07544843e-01 8.25236758e-01 4.18241698e+03 ... 6.40496972e-01
  7.35655719e-01 7.06122021e-01]
 [7.13214845e-01 7.69136786e-01 2.08169996e+02 ... 7.36908203e-01
  7.42317566e-01 6.94787318e-01]]

iteration: 2
[[7.28778656e-01 7.41939444e-01 3.11137430e+03 ... 6.04513892e-01
  6.75301261e-01 6.53331523e-01]
 [6.58607783e-01 7.00207396e-01 1.54983041e+02 ... 6.76232967e-01
  6.80256923e-01 6.44899782e-01]]

iteration: 3
[[6.75414148e-01 2.13733565e+04 2.38573721e+03 ... 1.04548280e+03
  4.67939885e+03 6.17565680e-01]
 [6.21611209e-01 3.74810135e+03 1.18948597e+02 ... 2.54432086e+03
  2.49905058e+03 6.11100713e-01]]

iteration: 4
[[6.37735539e-01 1.67825096e+04 5.11642849e+03 ... 8.21022582e+02
  3.67438071e+03 5.92312807e-01]
 [5.95489364e-01 2.94312406e+03 1.55055287e+02 ... 1.99791321e+03
  1.96236690e+03 5.87

### 7.2.5 Perplexity

In [288]:
# Read the whole dataset file as text. The `with` block closes the file
# handle as soon as the contents are in memory.
with open('dataset2.csv', encoding='Latin-1', errors='ignore') as dataset_file:
    corpus = dataset_file.read()

In [289]:
docs = corpus.split('\n')

In [292]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_x = vec.fit_transform(docs)
matrix_x.shape

(653, 2457)

In [300]:
from sklearn.decomposition import LatentDirichletAllocation

# One candidate model per topic count: lda1..lda4 use 2, 3, 4 and 5 topics.
lda1, lda2, lda3, lda4 = (
    LatentDirichletAllocation(n_components = n_topics) for n_topics in range(2, 6)
)

In [304]:
# Train every candidate on the first 500 documents; the remaining rows are
# held out for scoring. lda4 is fitted last as a bare expression so the cell
# still displays its repr.
for model in (lda1, lda2, lda3):
    model.fit(matrix_x[:500])
lda4.fit(matrix_x[:500])

LatentDirichletAllocation(n_components=5)

In [305]:
# We don't know in advance which n_components is a good value, so models are
# trained with different n_components and scored on the held-out rows (500 onward).
# The model with the lowest perplexity is selected; in the recorded run
# n_components = 2 gave a lower value than n_components = 3, 4 and 5.
for model in (lda1, lda2, lda3, lda4):
    print(model.perplexity(matrix_x[500:]))

1215.75640830575
1648.015356114946
2587.477503265197
2969.2546860247135


### Assignment

In [342]:
# Read the whole dataset file as text; the `with` block closes the file
# handle as soon as the contents are in memory.
with open('dataset2.csv', encoding='Latin-1', errors='ignore') as dataset_file:
    corpus = dataset_file.read()

# One document per line of the file.
docs = corpus.split('\n')

In [343]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
matrix_x = tfidf.fit_transform(docs)

In [346]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 20)
lda.fit(matrix_x)

LatentDirichletAllocation(n_components=20)

In [347]:
# Vocabulary terms ordered by column index, so features[i] is the word for column i.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
features = (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
            else tfidf.get_feature_names())

for topic_id, topic in enumerate(lda.components_):
    # [:-6:-1] walks backwards from the largest weight and stops before
    # position -6, so it selects the five highest-weighted word IDs.
    # Computed once per topic instead of once per print call.
    top_ids = topic.argsort()[:-6:-1]

    print('Topic ID:',topic_id)
    print('Words:', [features[i] for i in top_ids])
    print('Probabilities:', [topic[i] for i in top_ids])
    print('--------')

Topic ID: 0
Words: ['love', 'as', 'forward', 'great', 'pass']
Probabilities: [3.7069996676180694, 1.2542498252912746, 0.8499686414956816, 0.6498166177385032, 0.6146883462193375]
--------
Topic ID: 1
Words: ['as', 'company', 'none', 'tasting', 'evening']
Probabilities: [0.8809846092557523, 0.6785104994300863, 0.5843270889203412, 0.5730756805287434, 0.5723567865163756]
--------
Topic ID: 2
Words: ['nearby', 'said', 'carry', 'beach', 'run']
Probabilities: [0.8165578198914392, 0.6921319465491572, 0.6139116082599693, 0.5685004475804897, 0.5676947056428434]
--------
Topic ID: 3
Words: ['recommend', 'second', 'heaven', 'still', 'cool']
Probabilities: [1.1994615665817536, 1.0242910913875023, 0.9065667159460022, 0.8720400544027536, 0.8652566497105554]
--------
Topic ID: 4
Words: ['having', '1000', 'sublime', 'leonardo', 'ola']
Probabilities: [1.7735851965956897, 1.0949201660151646, 0.8892188579701752, 0.8769228422457807, 0.8750732075214527]
--------
Topic ID: 5
Words: ['feedback', 'if', 'far', 