In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Load the CSV file into a pandas DataFrame
csv_file_path = 'preprocessing/preprocessed_sample.csv'
data = pd.read_csv(csv_file_path)

# Drop rows with NaN values in the 'corrected_review' column
data = data.dropna(subset=['corrected_review'])

# Convert the processed reviews to a list of strings
documents = list(data['corrected_review'])

# Create a CountVectorizer to convert text into a bag-of-words representation
vectorizer = CountVectorizer(max_features=1000, max_df=0.8, stop_words='english')
X = vectorizer.fit_transform(documents)

# Perform Latent Dirichlet Allocation (LDA)
num_topics = 5  # You can adjust this value
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Display the topics and their top words
feature_names = vectorizer.get_feature_names_out()

# Keep track of used words
used_words = set()

for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx + 1}:")
    top_words_idx = topic.argsort()[:-10 - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx if feature_names[i] not in used_words]
    
    # Add top words to used words set
    used_words.update(top_words)
    
    print(", ".join(top_words))
    print("=" * 40)


Topic 1:
awesome, phone, apply, happy, siphon, love, product, colour, flippant, got
Topic 2:
nice, 14, camera, perform, quality, amaze, super, money
Topic 3:
good, great, battery, overall, heat, display
Topic 4:
excel, thank, deliver, superb, fast
Topic 5:
best, android, better, use
