In [9]:
# 🧠 TOPIC MODELING (NEWS OR RESEARCH ARTICLES)
# Author: Shivaya
# MTech AI & ML Project Task

# ==============================
# 🔹 Step 1: Install Required Libraries
# ==============================
# IPython shell escape: runs `pip install` as a shell command, so this line
# is only valid inside a notebook (Jupyter/Colab), not in a plain .py script.
!pip install nltk gensim pyLDAvis

# ==============================
# 🔹 Step 2: Import Libraries
# ==============================
import pandas as pd
import nltk, re, warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
# NOTE(review): pyLDAvis 3.3+ appears to expose this module as
# `pyLDAvis.gensim_models` instead — confirm against the installed version.
import pyLDAvis.gensim

# Silence DeprecationWarning messages only; other warning categories still show.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fetch the NLTK data used below: the English stop-word list and the WordNet
# corpus that backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# ==============================
# 🔹 Step 3: Load or Create Dataset
# ==============================
# Small in-memory corpus of one-sentence articles; swap in a real dataset
# here when one is available.
data = [
    "AI is transforming the healthcare industry with improved diagnostics.",
    "The stock market saw major gains in the technology sector.",
    "Researchers are using deep learning to analyze cancer images.",
    "Political tensions are rising in the international community.",
    "Advances in renewable energy are crucial for a sustainable future.",
    "Machine learning models are improving accuracy in medical predictions.",
    "Economic inflation rates have affected global trade patterns."
]

# One row per article, held in a single 'Article' column.
df = pd.DataFrame({'Article': data})
print("📚 Sample Articles Loaded:\n")
print(df.head(), "\n")

# ==============================
# 🔹 Step 4: Preprocess Text
# ==============================
# Kept as a set: preprocess() tests every token for membership, and a set
# lookup is constant-time whereas scanning the raw list is linear.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn one raw article string into a list of lemmatized tokens.

    The text is lower-cased, every non-word character (anything that is not
    a letter, digit or underscore) is replaced by a space, the result is
    split on whitespace, stop words are dropped and each remaining token is
    lemmatized.

    Args:
        text: The article text as a string.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    normalized = re.sub(r'\W', ' ', text.lower())
    tokens = []
    for token in normalized.split():
        # Stop words are filtered on the surface form, before lemmatizing.
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

# Tokenize every article and keep the token lists next to the raw text.
df['Cleaned'] = df['Article'].map(preprocess)
print("✅ Text Preprocessing Complete\n")
print(df.head(), "\n")

# ==============================
# 🔹 Step 5: Create Dictionary and Corpus
# ==============================
# Map each distinct token to an integer id, then encode every document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(documents=df['Cleaned'])
corpus = list(map(dictionary.doc2bow, df['Cleaned']))
print("✅ Dictionary and Corpus Created\n")

# ==============================
# 🔹 Step 6: Train the LDA Model
# ==============================
lda_model = models.LdaModel(
    random_state=42,     # fixed seed for reproducibility
    passes=15,           # number of passes over the corpus during training
    num_topics=3,        # number of topics (can be changed to 4 or 5)
    id2word=dictionary,
    corpus=corpus,
)

print("✅ LDA Model Trained Successfully!\n")

# ==============================
# 🔹 Step 7: Display Discovered Topics
# ==============================
print("🧾 Discovered Topics:\n")
# Passing -1 requests every topic; each entry is (topic id, keyword string).
for topic_id, keywords in lda_model.print_topics(-1):
    print("Topic {}: {}\n".format(topic_id, keywords))

# ==============================
# 🔹 Step 8: Visualize Topics
# ==============================
print("📊 Generating Interactive Visualization (scroll below to view)...\n")

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# A bare `vis` expression is rendered by Jupyter only when it is the last
# statement of a cell; more code follows here, so render it explicitly.
# `display` is the helper IPython makes available in every notebook namespace.
display(vis)

# ==============================
# 🔹 Step 9: Summary Report
# ==============================
print("\n================= TOPIC MODELING SUMMARY =================\n")
print("Model Used: Latent Dirichlet Allocation (LDA)")
# Read the topic count from the trained model so the report stays correct
# when `num_topics` is changed in the training step.
print(f"Number of Topics: {lda_model.num_topics}\n")
# The labels below are hand-written interpretations of the three-topic run,
# not values computed from the model; revise them if the topics change.
print("Topic Interpretation:")
print("Topic 0 ➜ Technology & Healthcare")
print("Topic 1 ➜ Economy & Trade")
print("Topic 2 ➜ Politics & International Affairs\n")
print(" Task Completed Successfully ")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


📚 Sample Articles Loaded:

                                             Article
0  AI is transforming the healthcare industry wit...
1  The stock market saw major gains in the techno...
2  Researchers are using deep learning to analyze...
3  Political tensions are rising in the internati...
4  Advances in renewable energy are crucial for a... 

✅ Text Preprocessing Complete

                                             Article  \
0  AI is transforming the healthcare industry wit...   
1  The stock market saw major gains in the techno...   
2  Researchers are using deep learning to analyze...   
3  Political tensions are rising in the internati...   
4  Advances in renewable energy are crucial for a...   

                                             Cleaned  
0  [ai, transforming, healthcare, industry, impro...  
1  [stock, market, saw, major, gain, technology, ...  
2  [researcher, using, deep, learning, analyze, c...  
3  [political, tension, rising, international, co...  
4  [advanc