## Conclusion
If the tfidf value of the top most salient terms in an LDA model aren't found or are below a threshold in a new Document, retrain the LDA model with the new document.

Dataset Used: [Bishop Dataset](https://www.kaggle.com/datasets/daishinkan002/bishop-topic-modelling-dataset)

In [None]:
%pip install contractions pyLDAvis "pandas<2.0.0"

In [2]:
import pandas as pd
import numpy as np

In [3]:
bishop = pd.read_csv("Bishop.csv", index_col=0)
bishop["text"][0]

' problem searching patterns data fundamental one long successful history instance extensive astronomical observations tycho brahe century allowed johannes kepler discover empirical laws planetary motion turn provided springboard development classical mechanics similarly discovery regularities atomic spectra played key role development veriﬁcation quantum physics early twentieth century ﬁeld pattern recognition concerned automatic discovariance ery regularities data use computer algorithms use regularities take actions classifying data different categories consider example recognizing handwritten digits illustrated figure digit corresponds pixel image represented vector comprising real numbers goal build machine take vector input produce identity digit output nontrivial problem due wide variability handwriting could introduction figure examples hand written dig taken zip codes tackled using handcrafted rules heuristics distinguishing digits based shapes strokes practice approach leads 

In [None]:
%pip install nltk

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

import contractions

import re
import string

In [6]:
def preprocess(text):
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('+', '')), '' , text)

    text = text.lower()
    text = nltk.word_tokenize(text)

    expanded_cont = []
    for i in text:
        expanded_cont.append(contractions.fix(i))
    text = expanded_cont

    text = [word for word in text if word not in stop_words]
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

In [7]:
from pprint import pprint

pprint(preprocess(bishop["text"][0])[:10])

['problem',
 'searching',
 'pattern',
 'data',
 'fundamental',
 'one',
 'long',
 'successful',
 'history',
 'instance']


In [8]:
bishop['tokens'] = bishop['text'].apply(lambda x : preprocess(x))
bishop.head()

Unnamed: 0,topic,text,tokens
0,[introduction],problem searching patterns data fundamental o...,"[problem, searching, pattern, data, fundamenta..."
1,"[example, polynomial, curve, fitting]",begin introducing simple regression problem s...,"[begin, introducing, simple, regression, probl..."
2,"[probability, theory]",key concept ﬁeld pattern recognition uncertai...,"[key, concept, ﬁeld, pattern, recognition, unc..."
3,"[probability, theory, probability, densities]",well considering probabilities deﬁned discret...,"[well, considering, probability, deﬁned, discr..."
4,"[probability, theory, expectations, covariances]",one important operations involving probabilit...,"[one, important, operation, involving, probabi..."


In [9]:
data_words = list(bishop.tokens)

In [10]:
import gensim
import gensim.corpora as corpora

from gensim.models import TfidfModel
from gensim.models import Phrases, phrases

In [11]:
def make_phrases(tokens: list[str]):
    bigram_ph = Phrases(data_words, min_count=5, threshold=50)
    trigram_ph = Phrases(bigram_ph[data_words], threshold=50)

    bigram = phrases.Phraser(bigram_ph)
    trigram = phrases.Phraser(trigram_ph)

    data_bigrams = [bigram[doc] for doc in tokens]
    data_trigrams = [trigram[bigram[doc]] for doc in tokens]
    return data_trigrams

def train_tfidf(data: list[list[str]]):
    id2word = corpora.Dictionary(data)
    corpus = [id2word.doc2bow(word) for word in data]

    tfidfModel = TfidfModel(corpus, id2word = id2word)
    return tfidfModel

def tfidf_filter(data: list[list[str]], model: TfidfModel, id2word = None):
    if id2word == None:
        id2word = corpora.Dictionary(data)
    corpus = [id2word.doc2bow(word) for word in data]

    low_val = 0.03
    low_val_words = []
    for bow in corpus:
        low_val_words += [id for id, value in model[bow] if value < low_val]

    id2word.filter_tokens(bad_ids=low_val_words)

    return ([id2word.doc2bow(doc) for doc in data], id2word)

In [12]:
data_words = make_phrases(data_words)
tfidf_model = train_tfidf(data_words)
data_corpus, id2word = tfidf_filter(data_words, tfidf_model)
print(id2word)
print(data_corpus[0][:50])

Dictionary<2352 unique tokens: ['ability', 'affect', 'astronomical', 'atomic', 'board']...>
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 4), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 4), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)]


In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=data_corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto')

In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
pyLDAvis.enable_notebook()

vis = gensimvis.prepare(lda_model, data_corpus, id2word, mds='mmds', R=20)

In [16]:
vis

In [17]:
# Next step: Copy a msdn or msvc doc webpage and pass it through all the steps. And see the plot for it
with open("ms-text.md", "r") as f:
    ms_text = f.read()
    f.close()

ms_text = preprocess(ms_text)

print(ms_text)

['title', 'inside', 'native', 'application', 'msdate', '11012006', 'mstopic', 'conceptual', 'msprod', 'windowssysinternals', 'mstechnology', 'systemutilities', 'description', 'article', 'i am', 'going', 'describe', 'native', 'application', 'built', 'work', 'inside', 'native', 'application', 'mark', 'russinovich', 'published', 'november', '1', '2006', 'introduction', 'familiarity', 'nt', 'architecture', 'probably', 'aware', 'api', 'win32', 'application', 'use', 'is not', 'real', 'nt', 'api', 'nt', 'operating', 'environment', 'include', 'posix', 'os2', 'win32', 'talk', 'client', 'application', 'via', 'apis', 'talk', 'nt', 'using', 'nt', 'native', 'api', 'native', 'api', 'mostly', 'undocumented', '25', '250', 'function', 'described', 'window', 'nt', 'device', 'driver', 'kit', 'people', 'do not', 'know', 'however', 'native', 'application', 'exist', 'nt', 'client', 'operating', 'environment', 'program', 'speak', 'native', 'nt', 'api', 'cannot', 'use', 'operating', 'environment', 'apis', 'li

In [None]:
ms_text = make_phrases([ms_text])
print(ms_text)
id2word.add_documents(ms_text)
ms_corpus = [id2word.doc2bow(text) for text in ms_text]
print(ms_corpus)
print(id2word)

lda_model = gensim.models.ldamodel.LdaModel(corpus=data_corpus+ms_corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto')

ms_lda = lda_model[ms_corpus[0]]
print(ms_lda)

vis = gensimvis.prepare(lda_model, ms_corpus, id2word, mds='mmds', R=20)

In [19]:
vis