In [5]:
# Import necessary libraries
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Function to load research articles dataset
def load_data(path='data/research_articles.csv'):
    """Load the research-articles CSV and build one text document per row.

    Args:
        path: Location of the CSV file. It must contain 'TITLE' and
            'ABSTRACT' columns. Defaults to the dataset path used by this
            notebook.

    Returns:
        A pandas Series of strings, one per row, formed as
        "<TITLE> <ABSTRACT>".

    Raises:
        KeyError: If either expected column is absent from the file.
    """
    df = pd.read_csv(path)
    # Missing cells are read as NaN; concatenating NaN with a string yields
    # NaN, which the text vectorizer downstream rejects as a document.
    # Substitute empty strings so every row produces a valid string.
    titles = df['TITLE'].fillna('').astype(str)
    abstracts = df['ABSTRACT'].fillna('').astype(str)
    return titles + ' ' + abstracts
    
# Function to vectorize text data using TfidfVectorizer
def vectorize_text_tfidf(text):
    """Fit a TF-IDF vectorizer on `text` and transform it in one pass.

    The vectorizer is configured with max_df=0.95, min_df=2 and the built-in
    English stop-word list.

    Args:
        text: Iterable of raw text documents.

    Returns:
        A tuple (matrix, vectorizer): the result of fit_transform on `text`,
        and the fitted TfidfVectorizer (needed later to recover feature names).
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=2,
        max_df=0.95,
    )
    return tfidf.fit_transform(text), tfidf
    
# Function to train a Latent Dirichlet Allocation (LDA) model
def train_lda_model(X_vec, num_topics):
    """Fit a Latent Dirichlet Allocation model on a document-term matrix.

    Args:
        X_vec: Document-term matrix to fit on.
        num_topics: Number of topics (passed as n_components).

    Returns:
        The fitted LatentDirichletAllocation instance.
    """
    # random_state is pinned to 42 so repeated runs use the same seed.
    estimator = LatentDirichletAllocation(random_state=42, n_components=num_topics)
    estimator.fit(X_vec)
    return estimator
    
# Function to display the top words for each topic
def display_topics(model, feature_names, num_top_words):
    """Collect the highest-weighted words of every topic in `model`.

    Despite the name, nothing is printed here; the mapping is returned.

    Args:
        model: Object exposing `components_`, iterated row by row; each row
            must support `.argsort()`.
        feature_names: Indexable sequence mapping a column index to its word.
        num_top_words: How many words to keep per topic.

    Returns:
        A dict keyed "Topic 1", "Topic 2", ... (1-based, in row order) whose
        values are lists of words ordered from highest to lowest weight.
    """
    return {
        f"Topic {number}": [
            feature_names[idx]
            # Ascending argsort, reversed to descending, then truncated.
            for idx in weights.argsort()[::-1][:num_top_words]
        ]
        for number, weights in enumerate(model.components_, start=1)
    }
    
# Main function for Topic Modeling
def main_topic_modeling(text, num_topics=5, num_top_words=10):
    """Run the topic-modeling pipeline on `text` and print the top words.

    Steps: TF-IDF vectorization, LDA fitting, extraction of the top words per
    topic, then printing one line per topic.

    NOTE(review): LDA is usually fit on raw term counts rather than TF-IDF
    weights; consider whether a count-based vectorizer would give cleaner
    topics -- confirm before changing.

    Args:
        text: Iterable of raw text documents.
        num_topics: Number of topics to fit.
        num_top_words: Number of words to print for each topic.

    Returns:
        None. Results are written to standard output.
    """
    # Vectorize, then fit the topic model on the resulting matrix.
    doc_term_matrix, fitted_vectorizer = vectorize_text_tfidf(text)
    model = train_lda_model(doc_term_matrix, num_topics)

    # Map matrix columns back to words and pick the top words per topic.
    vocabulary = fitted_vectorizer.get_feature_names_out()
    top_words_by_topic = display_topics(model, vocabulary, num_top_words)

    # Report: header line followed by one line per topic.
    print(f"\nTop {num_top_words} words for each topic:")
    for label in top_words_by_topic:
        print(f"{label}: {', '.join(top_words_by_topic[label])}")

# Entry point: load the dataset and run the pipeline with 5 topics and
# 10 top words per topic.
if __name__ == "__main__":
    # One "<TITLE> <ABSTRACT>" string per article.
    text_data = load_data()
    main_topic_modeling(text_data, num_topics=5, num_top_words=10)


Top 10 words for each topic:
Topic 1: quantum, energy, spin, model, magnetic, phase, field, time, temperature, wave
Topic 2: learning, data, model, network, networks, based, algorithm, models, neural, problem
Topic 3: mn, doping, floquet, fese, t_c, soc, kitaev, semimetals, mos2, verma
Topic 4: qa, nmf, hedging, opioid, password, gerrymandering, hashtags, triad, fuzzing, sequent
Topic 5: mathbb, prove, group, mathcal, finite, groups, theorem, spaces, algebra, space
