In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [19]:
# Load the Amazon Alexa Reviews dataset
df = pd.read_csv('amazon_alexa.csv', encoding='ISO-8859-1')

In [20]:
# Preprocess the text data by tokenizing and removing stop words, punctuation, and numbers
stop_words = set(['a', 'an', 'and', 'the', 'in', 'is', 'it', 'of', 'on', 'with', 'for', 'this', 'by', 'from', 'at', 'to', 'be', 'as', 'but', 'not', 'that', 'or', 'if', 'you', 'are', 'your', 'set', 'here', 'i', 'we', 'my', 'have', 'so', 'she', 'can', 'was', 'echo', 'up', 'will', 'alexa', 'very', 'really', 'use', 'out', 'just', 'would', 'amazon', 'dot', 'when', 'no', 'he', 'me', 'get', 'still'])  # Define your set of stop words

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert to lowercase and split into words
    words = text.lower().split()
    # Remove stop words and non-alphabetic words
    words = [word for word in words if word not in stop_words and word.isalpha()]
    # Join words back into a string
    preprocessed_text = ' '.join(words)
    return preprocessed_text

# Apply text preprocessing to the 'verified_reviews' column in the DataFrame
df['preprocessed_reviews'] = df['verified_reviews'].apply(preprocess_text)

# Create a TF-IDF vectorizer to convert text data into a numerical representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['preprocessed_reviews'])

# Train the NMF model
num_topics = 10  # Number of topics to extract
nmf_model = NMF(n_components=num_topics, random_state=1)
nmf_model.fit(tfidf_matrix)

# Get the topic distribution for each document
doc_topics = nmf_model.transform(tfidf_matrix)

# Print the topics and their corresponding keywords
feature_names = vectorizer.get_feature_names()
for i, topic_weights in enumerate(nmf_model.components_):
    print(f"Topic {i+1}:")
    topic_keywords = [feature_names[index] for index in topic_weights.argsort()[:-11:-1]]
    print(topic_keywords)
    print("\n")

# Get the most dominant topic for each document
dominant_topic = doc_topics.argmax(axis=1)
df['dominant_topic'] = dominant_topic

# Extract the dominant topic and its probability for each document
df['dominant_topic_prob'] = doc_topics.max(axis=1)

# Get the topic names for each topic ID
topic_names = []
for topic_id in dominant_topic:
    topic_keywords = [feature_names[index] for index in nmf_model.components_[topic_id].argsort()[:-11:-1]]
    topic_names.append(topic_keywords[0])

# Add the topic names to the DataFrame
df['dominant_topic_name'] = topic_names

# Display the results
print(df[['verified_reviews', 'dominant_topic', 'dominant_topic_name']])




Topic 1:
['love', 'absolutely', 'kids', 'fire', 'wonderful', 'new', 'spot', 'our', 'having', 'much']


Topic 2:
['great', 'addition', 'device', 'purchase', 'buy', 'speaker', 'sound', 'has', 'looks', 'small']


Topic 3:
['easy', 'setup', 'install', 'smart', 'connect', 'how', 'super', 'its', 'what', 'enjoying']


Topic 4:
['works', 'well', 'perfect', 'better', 'excellent', 'perfectly', 'other', 'sounds', 'home', 'than']


Topic 5:
['music', 'all', 'one', 'play', 'more', 'our', 'do', 'what', 'bought', 'am']


Topic 6:
['like', 'new', 'other', 'do', 'about', 'far', 'almost', 'how', 'little', 'any']


Topic 7:
['sound', 'quality', 'better', 'than', 'fun', 'much', 'has', 'nice', 'impressed', 'speaker']


Topic 8:
['good', 'speaker', 'sounds', 'pretty', 'far', 'new', 'device', 'quality', 'small', 'well']


Topic 9:
['product', 'nice', 'amazing', 'excellent', 'fast', 'awesome', 'music', 'satisfied', 'enjoyment', 'understands']


Topic 10:
['everything', 'does', 'about', 'expected', 'what', 'pe

In [21]:
df

Unnamed: 0,rating,date,variation,verified_reviews,preprocessed_reviews,dominant_topic,dominant_topic_prob,dominant_topic_name
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,love,0,0.271407,love
1,5,31-Jul-18,Charcoal Fabric,Loved it!,loved,9,0.021389,everything
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",sometimes while playing answer question correc...,5,0.051940,like
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,had lot fun yr old learns about control lights...,6,0.071706,sound
4,5,31-Jul-18,Charcoal Fabric,Music,music,4,0.099493,music
...,...,...,...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",perfect adults everyone,9,0.011501,everything
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",listening searching checking looking there man...,4,0.035189,music
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",do love these them running entire all front do...,4,0.070621,music
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,only complaint sound quality mostly commands a...,6,0.073713,sound
