**Import necessary libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import spacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import pyLDAvis


**Load dataset**

In [2]:
# Path to the reviews CSV that the rest of the notebook analyses.
DATA_PATH = "/content/Reviews.csv"

# Parse with the Python engine; rows it cannot parse are skipped rather
# than aborting the load.
df = pd.read_csv(
    DATA_PATH,
    engine='python',
    on_bad_lines='skip',
)

In [3]:
# Preview the first five rows to check which columns were parsed.
df.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


**Preprocess text**

In [4]:
# Keep only the review body and discard rows where it is missing.
df = df.loc[:, ['Text']].dropna(axis=0, how='any')

In [5]:
# Load the NLTK English stop-word list. On a kernel where the corpus has not
# been downloaded yet the lookup raises LookupError, so fetch it once and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')

# NLTK returns a list; a set makes the per-token membership test in
# `preprocess` constant-time instead of a linear scan.
stop_words = set(stop_words)

# spaCy pipeline used for lemmatization, loaded with the parser and NER
# components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [6]:
def preprocess(text):
    """Tokenize one review and keep only the informative tokens.

    The text is split with gensim's ``simple_preprocess``. A token is kept
    when it is longer than three characters and is not in ``stop_words``.
    This runs before lemmatization, so the length test applies to the
    surface form of each word rather than to its lemma.

    Args:
        text: Raw review text.

    Returns:
        list: The surviving tokens, in their original order.
    """
    return [
        word
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

# Tokenize every review; each cell of the new column holds a list of tokens.
df['tokens'] = df['Text'].map(preprocess)

def lemmatization(texts):
    """Lemmatize tokenized documents with the spaCy pipeline ``nlp``.

    Each token list is joined with single spaces into one string, passed
    through ``nlp.pipe``, and replaced by the lemmas of the tokens spaCy
    produces for it.

    Args:
        texts: Iterable of token lists, one list per document.

    Returns:
        list: One list of lemma strings per input document.
    """
    # Hand nlp.pipe a generator so the joined strings are produced on demand
    # rather than held in a second corpus-sized list.
    joined_docs = (' '.join(tokens) for tokens in texts)
    return [[token.lemma_ for token in doc] for doc in nlp.pipe(joined_docs)]

# Lemmatize the token lists and store the result alongside them.
lemmatized_docs = lemmatization(df['tokens'])
df['lemmatized'] = lemmatized_docs

In [7]:
# Build the vocabulary: every distinct lemma gets an integer id.
id2word = Dictionary(documents=df['lemmatized'])

# Prune the vocabulary: drop lemmas that appear in fewer than 5 documents
# or in more than half of all documents.
id2word.filter_extremes(no_above=0.5, no_below=5)

# Encode each document as a bag of words using the pruned vocabulary.
corpus = list(map(id2word.doc2bow, df['lemmatized']))


In [8]:
# Settings for the topic model: 5 topics, 10 passes over the corpus, and a
# fixed random_state.
lda_settings = {
    'corpus': corpus,
    'id2word': id2word,
    'num_topics': 5,
    'random_state': 42,
    'passes': 10,
    'per_word_topics': True,
}
lda_model = LdaModel(**lda_settings)

# Print each topic as an (id, weighted-terms string) tuple, one per line.
topics = lda_model.print_topics()
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))


(0, '0.039*"coffee" + 0.030*"taste" + 0.027*"flavor" + 0.023*"like" + 0.018*"good" + 0.014*"chocolate" + 0.012*"drink" + 0.011*"make" + 0.011*"well" + 0.010*"try"')
(1, '0.022*"taste" + 0.022*"like" + 0.018*"good" + 0.017*"flavor" + 0.014*"make" + 0.012*"great" + 0.011*"well" + 0.010*"love" + 0.009*"salt" + 0.008*"chip"')
(2, '0.028*"amazon" + 0.026*"product" + 0.023*"order" + 0.021*"price" + 0.019*"find" + 0.017*"store" + 0.014*"good" + 0.012*"great" + 0.012*"buy" + 0.011*"purchase"')
(3, '0.027*"food" + 0.021*"treat" + 0.014*"love" + 0.012*"like" + 0.011*"give" + 0.010*"dog" + 0.008*"would" + 0.008*"make" + 0.007*"cat" + 0.007*"time"')
(4, '0.016*"product" + 0.014*"sugar" + 0.012*"taste" + 0.011*"water" + 0.010*"like" + 0.010*"drink" + 0.009*"make" + 0.008*"milk" + 0.008*"ingredient" + 0.008*"syrup"')


**LDA topic visualization**

In [10]:
# Turn on notebook rendering, prepare the visualization data from the
# trained model, and display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.display(vis)