# LDA - Scratch implementation with token filtering

Here, I'm using LDA(Scratch implementation) model with token filtering. 

# Importing libraries

In [1]:
from pprint import pprint
import pandas as pd
from nltk import word_tokenize

import gensim
from gensim import corpora, models

# Global Variables

If you're in local machine, you should run this cell below:

In [2]:
BASE_PATH = "./"

If you're in Google Colab, you should run this cell below:

In [3]:
# BASE_PATH = "<ENTER YOUR DRIVE PATH>"

# Load Data
Preprocessed training and testing data from 
[20-news-dataset-pre-processing](https://github.com/nimmitahsin1727/20-news-dataset-pre-processing)

Reading TRAINING from CSV:

In [4]:
training_df = pd.read_csv(f'{BASE_PATH}training_df.csv') 

In [5]:
training_df

Unnamed: 0,data,target
0,jjd jamesjdutton bike big dog line apr cuaedu ...,rec.motorcycles
1,remmons robert emmons mail order articleid iat...,rec.motorcycles
2,nrmendel nathaniel mendell opinion want help a...,rec.motorcycles
3,ramarren applecom godfrey digiorgi der whassa ...,rec.motorcycles
4,jburnside llmitedu jamie burnside bike want ad...,rec.motorcycles
...,...,...
1723,veal utkvmutkedu david veal amendment dead goo...,talk.politics.guns
1724,betz andrew betz randy weaver trial update day...,talk.politics.guns
1725,jrutledg csulowelledu john lawrence rutledge g...,talk.politics.guns
1726,washington state kim john kim harvard science ...,talk.politics.guns


Reading TESTING from CSV:

In [6]:
testing_df = pd.read_csv(f'{BASE_PATH}testing_df.csv') 

In [7]:
testing_df

Unnamed: 0,data,target
0,chrispi microsoftcom chris pirih bike microsof...,rec.motorcycles
1,mbeaving bnrca beav opinion mean bmerh replyto...,rec.motorcycles
2,arturo informixcom arturo vega bonding caged c...,rec.motorcycles
3,mrr pocwruedu mark rabne insure cbrf case west...,rec.motorcycles
4,wayneorwig wayne orwig battery storage charge ...,rec.motorcycles
...,...,...
1146,alane microsoftcom alan ezekiel waco burn micr...,talk.politics.guns
1147,roby scott roby photographer remove compound d...,talk.politics.guns
1148,stevek cellarorg steve kraisler atf burn divid...,talk.politics.guns
1149,rscharfy ryan scharfy atf burn dividian ranch ...,talk.politics.guns


Create data_words with training data

In [8]:
data_words = training_df.data.map(lambda doc: word_tokenize(doc)).values.tolist()

In [9]:
data_words

[['jjd',
  'jamesjdutton',
  'bike',
  'big',
  'dog',
  'line',
  'apr',
  'cuaedu',
  'wendel',
  'cuaedu',
  'write',
  'heard',
  'rider',
  'big',
  'dog',
  'great',
  'dane',
  'ride',
  'bike',
  'dog',
  'love',
  'make',
  'work',
  'thanks',
  'wendel',
  'cuaedu',
  'large',
  'malmute',
  'count',
  'yes',
  'hear',
  'childish',
  'stunt',
  'dog',
  'need',
  'assistance',
  'straighten',
  'board',
  'owner',
  'lift',
  'leg',
  'dog',
  'throw',
  'driverpilots',
  'shoulder',
  'say',
  'dog',
  'shit',
  'eat',
  'grin',
  'face',
  'away',
  'dog',
  'firmly',
  'plant',
  'seat',
  'dog',
  'dog',
  'actively',
  'seek',
  'camp',
  'party',
  'hate',
  'personal',
  'steatopygia',
  'doh',
  'hottentot',
  'ama',
  'dod'],
 ['remmons',
  'robert',
  'emmons',
  'mail',
  'order',
  'articleid',
  'iatcjafvx',
  'holonet',
  'national',
  'internet',
  'access',
  'modem',
  'line',
  'good',
  'service',
  'shop',
  'regularly',
  'merchandise',
  'stock',
  'nee

# Corpus Creation

**Bag of Words on the Data set**

Create a dictionary from `data_words` containing the number of times a word appears in the training set.

In [10]:
dictionary = corpora.Dictionary(data_words)

In [11]:
print("Total words: ", len(dictionary.iteritems()))

Total words:  22094


Printing some samples from dictionary:

In [12]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 actively
1 ama
2 apr
3 assistance
4 away
5 big
6 bike
7 board
8 camp
9 childish
10 count


**Gensim filter_extremes**

***Filter out tokens that appear in***

less than 15 documents (absolute number) or
more than 0.5 documents (fraction of total corpus size, not absolute number).

In [13]:
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [14]:
print("Total words after filter: ", len(dictionary.iteritems()))

Total words after filter:  1832


**Gensim doc2bow**

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in data_words]

In [16]:
bow_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 2),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 9),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1)],
 [(20, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 2),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 2),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 2)],
 [(0, 1),
  (4, 2),
  (7, 1),
  (24, 2),
  (51, 2),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),

Preview Bag Of Words for our sample preprocessed document.



In [17]:
bow_doc_0 = bow_corpus[0]
bow_doc_0

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 2),
 (4, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 9),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1)]

In [18]:
for i in range(len(bow_doc_0)):
    print(f'Word {bow_doc_0[i][0]} (\"{dictionary[bow_doc_0[i][0]]}\") appears {bow_doc_0[i][1]} time.')

Word 0 ("ama") appears 1 time.
Word 1 ("apr") appears 1 time.
Word 2 ("away") appears 1 time.
Word 3 ("big") appears 2 time.
Word 4 ("bike") appears 2 time.
Word 5 ("board") appears 1 time.
Word 6 ("count") appears 1 time.
Word 7 ("dod") appears 1 time.
Word 8 ("dog") appears 9 time.
Word 9 ("eat") appears 1 time.
Word 10 ("face") appears 1 time.
Word 11 ("great") appears 1 time.
Word 12 ("hate") appears 1 time.
Word 13 ("hear") appears 1 time.
Word 14 ("heard") appears 1 time.
Word 15 ("large") appears 1 time.
Word 16 ("leg") appears 1 time.
Word 17 ("lift") appears 1 time.
Word 18 ("love") appears 1 time.
Word 19 ("make") appears 1 time.
Word 20 ("need") appears 1 time.
Word 21 ("owner") appears 1 time.
Word 22 ("party") appears 1 time.
Word 23 ("personal") appears 1 time.
Word 24 ("ride") appears 1 time.
Word 25 ("rider") appears 1 time.
Word 26 ("say") appears 1 time.
Word 27 ("seat") appears 1 time.
Word 28 ("seek") appears 1 time.
Word 29 ("shit") appears 1 time.
Word 30 ("should

# LDA

**Running LDA - Scratch**

In [19]:
import numpy as np
from lda_vb import vbLDA

In [20]:
n_topic = 10
max_iter=100

voca = [v for k, v in dictionary.iteritems()]

n_doc = len(data_words)
n_voca = len(dictionary.iteritems())

print(n_doc, n_voca)

1728 1832


In [21]:
doc_ids = [list(map(lambda bow: bow[0], bow_doc)) for bow_doc in bow_corpus]
doc_cnt = [list(map(lambda bow: bow[1], bow_doc)) for bow_doc in bow_corpus]

In [22]:
lda_vb_model = vbLDA(n_doc, n_voca, n_topic)

Model fitting

In [23]:
lda_vb_model.fit(doc_ids, doc_cnt, max_iter=max_iter)

2023-02-17 01:27:32 INFO:vbLDA:[ITER] 0,	elapsed time:2.67,	ELBO:-1060881.24
2023-02-17 01:27:35 INFO:vbLDA:[ITER] 1,	elapsed time:2.81,	ELBO:-1053123.71
2023-02-17 01:27:37 INFO:vbLDA:[ITER] 2,	elapsed time:2.35,	ELBO:-1028230.15
2023-02-17 01:27:39 INFO:vbLDA:[ITER] 3,	elapsed time:1.91,	ELBO:-987547.17
2023-02-17 01:27:41 INFO:vbLDA:[ITER] 4,	elapsed time:1.76,	ELBO:-960218.93
2023-02-17 01:27:43 INFO:vbLDA:[ITER] 5,	elapsed time:1.77,	ELBO:-946460.74
2023-02-17 01:27:44 INFO:vbLDA:[ITER] 6,	elapsed time:1.76,	ELBO:-939387.35
2023-02-17 01:27:46 INFO:vbLDA:[ITER] 7,	elapsed time:1.67,	ELBO:-935544.53
2023-02-17 01:27:48 INFO:vbLDA:[ITER] 8,	elapsed time:1.64,	ELBO:-933390.78
2023-02-17 01:27:49 INFO:vbLDA:[ITER] 9,	elapsed time:1.70,	ELBO:-932076.17
2023-02-17 01:27:51 INFO:vbLDA:[ITER] 10,	elapsed time:1.81,	ELBO:-931154.77
2023-02-17 01:27:53 INFO:vbLDA:[ITER] 11,	elapsed time:1.70,	ELBO:-930480.05
2023-02-17 01:27:54 INFO:vbLDA:[ITER] 12,	elapsed time:1.64,	ELBO:-929967.62
2023-0

Print the Keyword in the 10 topics

In [24]:
def get_top_words(topic_word_matrix, vocab, topic, n_words=20):
    if not isinstance(vocab, np.ndarray):
        vocab = np.array(vocab)
    top_words = vocab[topic_word_matrix[topic].argsort()[::-1][:n_words]]
    return top_words

In [25]:
for ti in range(n_topic):
    top_words = get_top_words(lda_vb_model._lambda, voca, ti, n_words=10)
    print('Topic', ti ,': ', ','.join(top_words))

Topic 0 :  image,file,use,jpeg,program,color,format,bit,display,gif
Topic 1 :  bike,dod,buy,make,new,bmw,ride,year,like,know
Topic 2 :  file,graphic,use,computer,point,email,need,information,polygon,look
Topic 3 :  gun,state,weapon,firearm,right,use,law,militia,file,control
Topic 4 :  bike,dod,helmet,apr,behanna,like,need,say,make,just
Topic 5 :  graphic,know,just,program,use,like,help,good,group,thanks
Topic 6 :  gun,crime,rate,criminal,homicide,kill,point,death,handgun,control
Topic 7 :  say,fbi,child,compound,make,batf,come,start,roby,day
Topic 8 :  make,cdt,right,good,time,use,like,public,country,say
Topic 9 :  bike,like,just,time,dog,good,right,make,turn,know


**Running LDA - GENSIM**

Train our lda model using gensim.models.LdaMulticore and save it to lda_model

In [26]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=n_topic, id2word=dictionary, random_state=42)

# lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=n_topic, id2word=dictionary, random_state=100,  passes=2, workers=2)

Print the Keyword in the 10 topics

In [27]:
pprint(lda_model.print_topics())

[(0,
  '0.017*"image" + 0.011*"file" + 0.010*"use" + 0.010*"bike" + 0.006*"know" + '
  '0.006*"good" + 0.005*"like" + 0.005*"email" + 0.005*"jpeg" + 0.005*"just"'),
 (1,
  '0.012*"gun" + 0.011*"file" + 0.011*"use" + 0.008*"make" + 0.008*"know" + '
  '0.008*"like" + 0.008*"say" + 0.007*"right" + 0.006*"dod" + 0.006*"just"'),
 (2,
  '0.011*"gun" + 0.009*"know" + 0.008*"look" + 0.008*"file" + 0.007*"point" + '
  '0.007*"use" + 0.006*"like" + 0.005*"need" + 0.005*"good" + 0.005*"bike"'),
 (3,
  '0.012*"gun" + 0.011*"image" + 0.009*"like" + 0.009*"use" + 0.008*"say" + '
  '0.006*"make" + 0.006*"time" + 0.005*"right" + 0.005*"file" + 0.005*"just"'),
 (4,
  '0.009*"need" + 0.008*"use" + 0.007*"gun" + 0.007*"state" + 0.007*"like" + '
  '0.006*"dod" + 0.006*"apr" + 0.006*"file" + 0.006*"say" + 0.005*"make"'),
 (5,
  '0.010*"know" + 0.009*"use" + 0.007*"make" + 0.007*"just" + 0.007*"program" '
  '+ 0.007*"like" + 0.007*"graphic" + 0.006*"good" + 0.005*"time" + '
  '0.005*"right"'),
 (6,
  '0.009