In [2]:

# Cell 2: Imports and setup
import gensim
from gensim import corpora, models
from gensim.parsing.preprocessing import preprocess_string
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Cell 3: Sample expanded documents
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV: Widths of trees and well quasi ordering",
    "Graph minors: A survey",
    "User feedback on software system usability and efficiency",
    "Advanced machine learning techniques in software engineering",
    "Artificial intelligence and its application in modern systems",
    "Deep learning models outperform traditional models in NLP",
    "Topic modeling and natural language processing for text data",
    "Unsupervised learning methods in large document collections",
    "Computer graphics and rendering pipelines",
    "Neural networks and computer vision",
    "Big data processing using Spark and Hadoop",
    "Applications of LDA in social media analysis",
    "Building scalable REST APIs with Django and Flask"
]

# Cell 4: Plot distribution of token counts per document
token_counts = [len(word_tokenize(doc)) for doc in documents]

plt.figure(figsize=(10, 5))
plt.hist(token_counts, bins=10, color='skyblue', edgecolor='black')
plt.title('Token Count Distribution in Documents')
plt.xlabel('Number of Tokens')
plt.ylabel('Number of Documents')
plt.grid(True)
plt.show()





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\enala_culjkpz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\enala_culjkpz/nltk_data'
    - 'd:\\GITHUB\\Latent dirichlet\\myenv\\nltk_data'
    - 'd:\\GITHUB\\Latent dirichlet\\myenv\\share\\nltk_data'
    - 'd:\\GITHUB\\Latent dirichlet\\myenv\\lib\\nltk_data'
    - 'C:\\Users\\enala_culjkpz\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# Cell 5: Preprocess documents for LDA
texts = [preprocess_string(doc) for doc in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Cell 6: Train LDA model
lda = models.LdaModel(
    corpus, 
    num_topics=5, 
    id2word=dictionary, 
    passes=20, 
    random_state=42
)

In [None]:
# Print the topics
for idx, topic in lda.print_topics(-1):
    print(f"Topic {idx}: {topic}")




In [None]:
# Cell 7: Visualize with pyLDAvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, corpus, dictionary)
vis


In [None]:

# Cell 8: Save model and dictionary for Django REST API use
lda.save("lda_model.gensim")
dictionary.save("dictionary.dict")