In [1]:
import pandas as pd
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim import corpora

In [46]:
# Load the Amazon Alexa Reviews dataset
df = pd.read_csv('amazon_alexa.csv', encoding='ISO-8859-1')

In [48]:
# Preprocess the text data by tokenizing and removing stop words, punctuation, and numbers
texts = df['verified_reviews'].apply(lambda x: x.lower().split())
stop_words = set(['a', 'an', 'and', 'the', 'in', 'is', 'it', 'of', 'on', 'with', 'for', 'this', 'by', 'from', 'at', 'to', 'be', 'as', 'but', 'not', 'that', 'or', 'if', 'you', 'are', 'your', 'set', 'here', 'i', 'we', 'my', 'have', 'so', 'she', 'can', 'was', 'echo', 'up', 'will', 'alexa', 'very', 'really', 'use', 'out', 'just', 'would', 'amazon', 'dot', 'when', 'no', 'he', 'me', 'get', 'still'])  # Define your set of stop words
texts = [[word for word in doc if word not in stop_words and word.isalpha()] for doc in texts]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Convert the documents to a bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
num_topics = 10  # Number of topics to extract
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

# Get the topic distribution for each document
doc_topics = lda_model.get_document_topics(corpus)

# Print the topics and their corresponding keywords
for i in range(num_topics):
    print(f"Topic {i+1}:")
    print(lda_model.show_topic(i))
    print("\n")

# Get the most dominant topic for each document
dominant_topic = [max(topic, key=lambda x: x[1])[0] if len(topic) > 0 else None for topic in doc_topics]
df['dominant_topic'] = dominant_topic

# Extract the dominant topic and its probability for each document
df['dominant_topic_prob'] = [topic[dominant_topic[i]][1] if dominant_topic[i] is not None and len(topic) > 0 and dominant_topic[i] < len(topic) else None for i, topic in enumerate(doc_topics)]


Topic 1:
[('product', 0.028055802), ('great', 0.018810965), ('best', 0.015707834), ('some', 0.014537852), ('like', 0.013966958), ('amazing', 0.013446967), ('nice', 0.012185711), ('has', 0.011331021), ('speaker', 0.009851323), ('devices', 0.009728698)]


Topic 2:
[('great', 0.022333335), ('works', 0.0154933985), ('all', 0.015219325), ('time', 0.014432871), ('like', 0.012481828), ('new', 0.012173928), ('alarm', 0.009828739), ('one', 0.0097851055), ('had', 0.009566674), ('buy', 0.008904813)]


Topic 3:
[('music', 0.02688509), ('play', 0.018117102), ('prime', 0.01702956), ('works', 0.010980732), ('because', 0.009566748), ('like', 0.009206126), ('any', 0.008898834), ('need', 0.00875491), ('ask', 0.008687715), ('had', 0.008499524)]


Topic 4:
[('love', 0.0327928), ('has', 0.013720461), ('all', 0.012814442), ('like', 0.012603235), ('her', 0.01237666), ('our', 0.011884727), ('one', 0.011603826), ('do', 0.011167683), ('music', 0.009463147), ('what', 0.009440353)]


Topic 5:
[('like', 0.04668486

In [49]:
# Get the topic names for each topic ID
topic_names = []
for topic_id in dominant_topic:
    if topic_id is not None and len(lda_model.show_topic(topic_id)) > 0:
        topic_names.append(lda_model.show_topic(topic_id)[0][0])
    else:
        topic_names.append(None)

# Add the topic names to the DataFrame
df['dominant_topic_name'] = topic_names

# Display the results
print(df[['verified_reviews', 'dominant_topic', 'dominant_topic_name']])

                                       verified_reviews  dominant_topic  \
0                                         Love my Echo!               6   
1                                             Loved it!               7   
2     Sometimes while playing a game, you can answer...               2   
3     I have had a lot of fun with this thing. My 4 ...               2   
4                                                 Music               6   
...                                                 ...             ...   
3145  Perfect for kids, adults and everyone in betwe...               5   
3146  Listening to music, searching locations, check...               4   
3147  I do love these things, i have them running my...               5   
3148  Only complaint I have is that the sound qualit...               2   
3149                                               Good               9   

     dominant_topic_name  
0                   love  
1                    one  
2                 

In [50]:
df

Unnamed: 0,rating,date,variation,verified_reviews,dominant_topic,dominant_topic_prob,dominant_topic_name
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,6,0.549988,love
1,5,31-Jul-18,Charcoal Fabric,Loved it!,7,0.549968,one
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",2,,music
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,2,,music
4,5,31-Jul-18,Charcoal Fabric,Music,6,0.549952,love
...,...,...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",5,0.774990,great
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",4,0.626219,like
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",5,,great
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,2,0.444325,music
