In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [18]:
# Load the Amazon Alexa Reviews dataset
df = pd.read_csv('amazon_alexa.csv', encoding='ISO-8859-1')

In [19]:
# Preprocess the text data by tokenizing and removing stop words, punctuation, and numbers
texts = df['verified_reviews'].apply(lambda x: x.lower().split())
stop_words = set(['a', 'an', 'and', 'the', 'in', 'is', 'it', 'of', 'on', 'with', 'for', 'this', 'by', 'from', 'at', 'to', 'be', 'as', 'but', 'not', 'that', 'or', 'if', 'you', 'are', 'your', 'set', 'here', 'i', 'we', 'my', 'have', 'so', 'she', 'can', 'was', 'echo', 'up', 'will', 'alexa', 'very', 'really', 'use', 'out', 'just', 'would', 'amazon', 'dot', 'when', 'no', 'he', 'me', 'get', 'still'])  # Define your set of stop words
texts = [' '.join([word for word in doc if word not in stop_words and word.isalpha()]) for doc in texts]

# Convert the preprocessed text data to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train the NMF model
num_topics = 10  # Number of topics to extract
nmf_model = NMF(n_components=num_topics, init='nndsvd', max_iter=200)
W = nmf_model.fit_transform(X)
H = nmf_model.components_

# Get the topic distribution for each document
doc_topics = W.argmax(axis=1)

# Print the topics and their corresponding keywords
feature_names = vectorizer.get_feature_names_out()
for i, topic in enumerate(H):
    print(f"Topic {i+1}:")
    print([feature_names[idx] for idx in topic.argsort()[:-11:-1]])
    print("\n")

# Get the most dominant topic for each document
df['dominant_topic'] = doc_topics

# Extract the dominant topic and its probability for each document
df['dominant_topic_prob'] = W.max(axis=1)

# Get the topic names for each topic ID
topic_names = []
for topic_id in doc_topics:
    topic_keywords = [feature_names[index] for index in nmf_model.components_[topic_id].argsort()[:-11:-1]]
    topic_names.append(topic_keywords[0])


# Add the topic names to the DataFrame
df['dominant_topic_name'] = topic_names

# Display the results
print(df[['verified_reviews', 'dominant_topic', 'dominant_topic_name']])

Topic 1:
['love', 'absolutely', 'kids', 'fire', 'wonderful', 'new', 'spot', 'our', 'having', 'much']


Topic 2:
['great', 'addition', 'device', 'purchase', 'buy', 'speaker', 'sound', 'has', 'looks', 'small']


Topic 3:
['easy', 'setup', 'install', 'smart', 'connect', 'how', 'super', 'its', 'what', 'enjoying']


Topic 4:
['works', 'well', 'perfect', 'better', 'excellent', 'perfectly', 'other', 'sounds', 'home', 'than']


Topic 5:
['music', 'all', 'one', 'play', 'more', 'our', 'do', 'what', 'bought', 'am']


Topic 6:
['like', 'new', 'other', 'do', 'about', 'far', 'almost', 'how', 'little', 'any']


Topic 7:
['sound', 'quality', 'better', 'than', 'fun', 'much', 'has', 'nice', 'impressed', 'speaker']


Topic 8:
['good', 'speaker', 'sounds', 'pretty', 'far', 'new', 'device', 'quality', 'small', 'well']


Topic 9:
['product', 'nice', 'amazing', 'excellent', 'fast', 'awesome', 'music', 'satisfied', 'enjoyment', 'understands']


Topic 10:
['everything', 'does', 'about', 'expected', 'what', 'pe

In [20]:
df

Unnamed: 0,rating,date,variation,verified_reviews,dominant_topic,dominant_topic_prob,dominant_topic_name
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,0,0.271391,love
1,5,31-Jul-18,Charcoal Fabric,Loved it!,9,0.021410,everything
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",5,0.051923,like
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,6,0.071684,sound
4,5,31-Jul-18,Charcoal Fabric,Music,4,0.099495,music
...,...,...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",9,0.011512,everything
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",4,0.035193,music
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",4,0.070623,music
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,6,0.073690,sound
