In [None]:
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# NLTK resources used further down: the Punkt tokenizer models behind
# nltk.word_tokenize and the English stop-word corpus.
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are requested to keep the notebook working on old and new versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [2]:
# Load the tab-separated file; rows that cannot be parsed are skipped, not fatal.
df = pd.read_table('watch_reviews.tsv', on_bad_lines = 'skip')

In [None]:
# Preview the first rows of the table as loaded (before any cleaning).
df.head()

In [4]:
# Keep only the rows that actually have review text.
df = df.dropna(subset = ['review_body'])

In [5]:
# Renumber the rows 0..n-1 and discard the old index instead of keeping it as a column.
df = df.reset_index(drop = True)

In [None]:
# Column dtypes and non-null counts of the table at this point.
df.info()

In [7]:
# Review texts for index labels 0 through 999 (label slices include the end label).
data = df['review_body'].loc[:999].to_list()

In [None]:
# Show only a handful of reviews; echoing the whole list floods the notebook output.
data[:5]

In [None]:
# NLTK's English stop-word list, extended with tokens specific to this corpus.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([
    "'s",
    "'m",
    "br",     # line break in HTML
    "watch",  # all reviews are about watches
])

print(f"We use {len(stopwords)} stop-words from nltk library.")
print(stopwords[:10])

In [10]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english') # Porter2 stemming algorithm


def tokenization_and_stemming(text):
    """Turn one review into a list of stemmed word tokens.

    The text is split with ``nltk.word_tokenize`` and every token is
    lower-cased. Tokens found in the module-level ``stopwords`` list and
    tokens that are not purely alphabetic are dropped; the rest are reduced
    to their stem with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Stems of the surviving tokens, in their original order.
    """
    lowered_tokens = (raw.lower() for raw in nltk.word_tokenize(text))
    return [
        stemmer.stem(word)
        for word in lowered_tokens
        if word not in stopwords and word.isalpha()
    ]

In [None]:
# First selected review, shown raw for comparison with its tokenised form.
data[0]

In [None]:
# Sanity check: run the tokenizer/stemmer on that same first review.
tokenization_and_stemming(data[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF representation of the selected reviews.
# - max_df / min_df drop terms present in more than 99% or fewer than 1% of
#   the reviews; max_features caps the vocabulary at 1000 terms.
# - token_pattern = None: a custom tokenizer is supplied, so the default regex
#   is never used; passing None stops scikit-learn from warning about it.
# NOTE(review): stop_words = 'english' is applied by scikit-learn to the tokens
# returned by the tokenizer (i.e. to stems), on top of the NLTK stop-word
# filtering already done inside tokenization_and_stemming. Confirm that this
# double filtering is intended.
tfidf_model = TfidfVectorizer(max_df = 0.99, max_features = 1000,
                              min_df = 0.01, stop_words = 'english',
                              use_idf = True, tokenizer = tokenization_and_stemming,
                              token_pattern = None,
                              ngram_range = (1, 1))

tfidf_matrix = tfidf_model.fit_transform(data)

n_reviews, n_terms = tfidf_matrix.shape
print(f"In total, there are {n_reviews} reviews and {n_terms} terms.")

In [None]:
# Summary repr of the fitted TF-IDF matrix.
tfidf_matrix

In [None]:
# Convert the TF-IDF matrix to a dense array for inspection.
tfidf_matrix.toarray()

In [None]:
# TF-IDF weights of the first review as a flat dense vector
# (the row is selected first, then only that row is densified).
tfidf_matrix[0].toarray().ravel()

In [18]:
# Terms kept by the fitted vectoriser; position i names column i of tfidf_matrix.
tf_selected_words = tfidf_model.get_feature_names_out()

In [None]:
# Display the selected vocabulary.
tf_selected_words

Part 4: K-means clustering

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

# K-means starts from randomly chosen centroids, so without a fixed seed the
# cluster ids and the keyword summaries built from them change on every run.
# n_init is pinned explicitly because its default differs between
# scikit-learn releases.
km = KMeans(n_clusters = num_clusters, n_init = 10, random_state = 42)
km.fit(tfidf_matrix)

# One cluster id per review, in the same order as the rows of tfidf_matrix.
clusters = km.labels_.tolist()

In [21]:
# Pair the first 1000 review texts (by position) with their cluster labels.
product = {
    'review': df['review_body'][:1000],
    'cluster': clusters,
}

frame = pd.DataFrame(product, columns = list(product))

In [None]:
# First ten reviews together with their assigned cluster.
frame.head(10)

In [None]:
# Cluster sizes: number of rows of `frame` carrying each cluster label.
print('Number of reviews included in each cluster:')
frame['cluster'].value_counts().to_frame()

In [None]:
# Centroid coordinates of the fitted K-means model.
km.cluster_centers_

In [None]:
# Shape of the centroid array.
km.cluster_centers_.shape

In [None]:
print('<Document clustering result by K-means>')

# For every centroid, the term indices sorted from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Maps cluster id -> its six highest-weighted terms.
Cluster_keywords_summary = {}
for i in range(num_clusters):
    top_terms = [tf_selected_words[term_idx] for term_idx in order_centroids[i, :6]]
    Cluster_keywords_summary[i] = top_terms
    print(f'Cluster {i} words:' + ''.join(term + ',' for term in top_terms))

    # All reviews assigned to this cluster, printed as one comma-joined line.
    cluster_reviews = frame.loc[frame['cluster'] == i, 'review'].tolist()
    print(f'Cluster {i} reviews ({len(cluster_reviews)} reviews): ')
    print(', '.join(cluster_reviews))
    print()

In [27]:
from sklearn.decomposition import LatentDirichletAllocation
# Five-topic LDA model. Seeded because the fit is randomly initialised, so
# topic numbering and weights would otherwise vary between runs.
lda = LatentDirichletAllocation(n_components = 5, random_state = 42)

In [None]:
# Fit the LDA model on the TF-IDF matrix and keep its per-document output,
# then show the output's shape followed by its values.
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape, lda_output, sep = '\n')

In [None]:
# Component matrix of the fitted LDA model: shape first, then the values.
topic_word = lda.components_
print(topic_word.shape, topic_word, sep = '\n')

In [None]:
# Row and column labels for the document-topic table.
topic_names = ['Topic' + str(i) for i in range(lda.n_components)]

doc_names = ['Doc' + str(i) for i in range(len(data))]

# Weights are rounded to two decimals for display only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topic_names, index = doc_names)

# Pick the dominant topic from the unrounded weights: after rounding, two
# topics can tie (e.g. 0.334 and 0.331 both become 0.33) and argmax would then
# return the lower-numbered topic regardless of which one is really larger.
topic = np.argmax(lda_output, axis = 1)

df_document_topic['topic'] = topic

df_document_topic.head(10)

In [None]:
# Number of documents assigned to each value of the 'topic' column.
df_document_topic['topic'].value_counts().to_frame()

In [None]:
print(lda.components_)

# Same matrix as a labelled table: one row per topic, one column per vocabulary term.
df_topic_words = pd.DataFrame(
    lda.components_,
    index = topic_names,
    columns = tfidf_model.get_feature_names_out(),
)

df_topic_words.head()

In [None]:
def print_topic_words(tfidf_model, lda_model, n_words):
    """Collect the highest-weighted vocabulary terms of every topic.

    Despite its name, this function prints nothing; it only returns the terms.

    Parameters
    ----------
    tfidf_model : object
        Fitted vectoriser exposing ``get_feature_names_out()``.
    lda_model : object
        Fitted model exposing ``components_``, iterated row by row with one
        weight per vocabulary term in each row.
    n_words : int
        Maximum number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per row of ``components_``, holding that row's terms ordered
        from highest to lowest weight.
    """
    vocabulary = np.array(tfidf_model.get_feature_names_out())
    return [
        vocabulary[np.argsort(weights)[::-1][:n_words]]
        for weights in lda_model.components_
    ]

# Up to 15 top terms per LDA topic, laid out one topic per row.
topic_keywords = print_topic_words(tfidf_model, lda, 15)

df_topic_words = pd.DataFrame(topic_keywords)
n_topics, n_top_words = df_topic_words.shape
df_topic_words.columns = [f'Word {rank}' for rank in range(n_top_words)]
df_topic_words.index = [f'Topic {topic_id}' for topic_id in range(n_topics)]
df_topic_words