In [None]:
!pip install gensim nltk spacy pyLDAvis
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import nltk
nltk.download('all')  # This ensures nothing is missing

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# Load the dataset
data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
documents = data.data[:2000]  # Sample size

# Preprocessing
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 3]
    return tokens

processed_docs = [preprocess(doc) for doc in documents]

In [None]:
from gensim import corpora

# Create a dictionary and corpus
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
from gensim.models import LdaModel

# Train LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=5,  # You can experiment with this
                     random_state=42,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

In [None]:
# Display topics
topics = lda_model.print_topics(num_words=10)
for i, topic in topics:
    print(f"Topic #{i}: {topic}")

Topic #0: 0.006*"people" + 0.004*"would" + 0.004*"armenian" + 0.003*"turkish" + 0.003*"government" + 0.003*"dont" + 0.002*"armenians" + 0.002*"genocide" + 0.002*"health" + 0.002*"think"
Topic #1: 0.007*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.006*"would" + 0.004*"people" + 0.004*"know" + 0.004*"jesus" + 0.003*"said" + 0.003*"dont" + 0.003*"like" + 0.003*"also" + 0.003*"good"
Topic #2: 0.006*"file" + 0.004*"program" + 0.003*"output" + 0.003*"oname" + 0.002*"first" + 0.002*"period" + 0.002*"char" + 0.002*"windows" + 0.002*"like" + 0.002*"entry"
Topic #3: 0.005*"available" + 0.005*"version" + 0.004*"also" + 0.003*"software" + 0.002*"contact" + 0.002*"machines" + 0.002*"motif" + 0.002*"type" + 0.002*"widget" + 0.002*"dont"
Topic #4: 0.009*"would" + 0.005*"like" + 0.005*"dont" + 0.005*"know" + 0.004*"people" + 0.004*"think" + 0.004*"could" + 0.004*"much" + 0.004*"well" + 0.003*"time"


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Prepare visualization
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)