# In [None]:
import re

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def preprocess_text(text):
    """Clean a raw post so it is ready for tokenization.

    The text is lowercased, URLs beginning with ``http`` are removed, every
    character that is not an ASCII letter or whitespace is dropped, and
    finally all whitespace runs are collapsed to a single space with leading
    and trailing whitespace stripped.

    Args:
        text: The raw post as a string.

    Returns:
        The cleaned string; may be empty if nothing but noise was present.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and whitespace
    # Normalize whitespace last: the removals above leave gaps behind, so
    # collapsing earlier would still produce double spaces and ragged edges.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line with the ``no_top_words`` terms that carry
    the largest weights in that row, ordered from heaviest to lightest and
    separated by single spaces.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            must support ``argsort()``.
        feature_names: Indexable collection mapping a column index to its term.
        no_top_words: Number of terms to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1]
        chosen = heaviest_first[:no_top_words]
        top_terms = [feature_names[col] for col in chosen]
        print(" ".join(top_terms))

def main(data_path='/content/BlackLivesMatter.csv', n_topics=5, no_top_words=10):
    """Fit an LDA topic model on the ``posts`` column of a CSV and print topics.

    The CSV is loaded with pandas, rows with a missing ``posts`` value are
    discarded, each post is cleaned with ``preprocess_text``, and the cleaned
    posts are turned into a document-term count matrix. An LDA model is then
    fitted on that matrix and its top terms are printed via ``display_topics``.

    Args:
        data_path: Path to the CSV file; paste the dataset path here.
        n_topics: Number of LDA topics to fit.
        no_top_words: Number of top terms to print for each topic.
    """
    data = pd.read_csv(data_path)

    # Drop missing posts, and coerce the rest to str so a stray numeric cell
    # does not break the string methods used by preprocess_text.
    posts = data['posts'].dropna().astype(str).apply(preprocess_text)

    # LDA is a generative model over term counts, so feed it raw counts
    # rather than TF-IDF weights.
    count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    term_counts = count_vectorizer.fit_transform(posts)

    # Fixed random_state keeps the fitted topics reproducible between runs.
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(term_counts)

    display_topics(lda, count_vectorizer.get_feature_names_out(), no_top_words)

# Run the topic-modeling pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output captured in the notebook export (from an earlier run):
#
# Topic 0:
# codicymru black death racism solidarity aberystwyth history opportunity harris lives
# Topic 1:
# black blm blazetv fearless blacklivesmatter basketball lives matter jasonwhitlock jordan
# Topic 2:
# blacklivesmatter blackwellness black bana bantu stacey keys skeys house beauty
# Topic 3:
# black lives matter blacklivesmatter people blm make blackhistory answer music
# Topic 4:
# chicago children lives viral suicide freedom blacklivesmatter thing police black
