In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.preprocessing import label_binarize
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import accuracy_score, homogeneity_score, silhouette_score, completeness_score
from nltk.stem import WordNetLemmatizer
import nltk
import spacy 
import warnings 
warnings.filterwarnings("ignore")

Clustering similar reviews 

In [None]:
# Load the review data; the relative path is resolved against the current working directory.
train = pd.read_csv("train.csv")

In [None]:
# Clustering similar reviews using KMeans: preview the text column that gets vectorized below.
# NOTE(review): the text looks already lower-cased / lemmatized — confirm the upstream preprocessing.
train['reviews.text']

0       purchase black fridaypros great price even sal...
1       purchase two amazon echo plus two dot plus fou...
2       average alexa option show things screen still ...
3                    good product exactly want good price
4       rd one purchase buy one nieces case compare on...
                              ...                        
3995    äôs fun family play may get bore newness wear ...
3996    love kindle great product reduce eye strain en...
3997    look blutooth speaker use phone want worry thi...
3998    second amazon fire tablet purchase time color ...
3999                        satisfy tablet fast efficient
Name: reviews.text, Length: 4000, dtype: object

In [None]:
# TF-IDF features for clustering: cap the vocabulary at 5000 terms and drop terms
# that appear in more than 75% of the reviews or in fewer than 50 reviews.
vectorizer = TfidfVectorizer(
    max_features=5000,
    max_df=0.75,
    min_df=50,
)
tf_vectors = vectorizer.fit_transform(train['reviews.text'])

In [None]:
# Cluster the TF-IDF vectors into 5 groups.
# random_state pins the centroid initialisation and the mini-batch sampling so the
# cluster assignments (and every score derived from them) are reproducible across runs.
Kmeans_cls = MiniBatchKMeans(n_clusters=5, random_state=42)
Kmeans_cls.fit(tf_vectors)

MiniBatchKMeans(n_clusters=5)

In [None]:
# Cluster index assigned to each review while fitting the model.
# NOTE(review): despite the name, these are KMeans assignments, not ground-truth labels.
labels_true = Kmeans_cls.labels_
labels_true

array([2, 1, 1, ..., 0, 2, 2], dtype=int32)

In [None]:
# Assign every review to its closest learned centroid (same matrix the model was fitted on).
labels_pred = Kmeans_cls.predict(tf_vectors)
labels_pred

array([2, 1, 1, ..., 0, 2, 2], dtype=int32)

In [None]:
# NOTE(review): the first argument should be ground-truth class labels, but the raw review
# text is passed here, so (nearly) every review is its own class and the result is not a
# meaningful quality measure — swap in a real label column if the dataset has one.
homogeneity_score (train["reviews.text"], labels_pred)

0.1860566611290765

In [None]:
# NOTE(review): the raw review text is used as the "true" labels, so almost every class holds
# a single review and completeness is trivially ~1.0 regardless of cluster quality —
# swap in a real label column if the dataset has one.
completeness_score (train['reviews.text'], labels_pred)

0.9999999999999992

In [None]:
# Mean silhouette coefficient of the TF-IDF vectors under the predicted clusters;
# values close to 0 point to overlapping clusters.
silhouette_score(tf_vectors, labels_pred)

0.016792044251747105

Notes:

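> The silhouette score (≈0.02) is close to zero, which indicates that the clusters overlap heavily. The model has low homogeneity. The almost perfect completeness score would indicate that samples belonging to the same class were assigned to the same cluster, i.e. perfect labelling. Caveat: homogeneity and completeness were computed with the raw review text as the "true" labels, so nearly every review forms its own class; completeness is then trivially ≈1 and homogeneity is correspondingly low, so neither number reflects real label quality.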




In [None]:
# Topic modelling 

In [None]:
# Reduce every review to its lemmatized nouns before topic modelling.
spy = spacy.load('en_core_web_sm')


def only_nouns(texts):
    """Return one string per input text holding only that text's noun lemmas.

    Every text is run through the spaCy pipeline ``spy``. Tokens whose
    part-of-speech tag is ``'NOUN'`` are kept, replaced by their lemma and
    joined with single spaces. A text without any noun yields ``""``.
    """
    return [
        " ".join(tok.lemma_ for tok in parsed if tok.pos_ == 'NOUN')
        for parsed in spy.pipe(texts)
    ]


# The column is overwritten in place: cells run after this one see noun-only text.
train['reviews.text'] = only_nouns(train['reviews.text'])

In [None]:
# Inspect the noun-only text; a review that contains no noun is now an empty string.
train["reviews.text"]

0       fridaypro price sale core processor soundwell ...
1       dot fire stick hub family purchase program new...
2                                     option thing screen
3                                           product price
4       purchase niece case hold protect tablet time drop
                              ...                        
3995                             family play newness äôll
3996                 love product eye strain enjoy kindle
3997    speaker phone thing account dread news month o...
3998          tablet purchase time color meet expectation
3999                                                     
Name: reviews.text, Length: 4000, dtype: object

In [None]:
# Re-fit TF-IDF on the current contents of the text column. This rebinds `vectorizer`,
# so its vocabulary now differs from the one used to build `tf_vectors`.
# Terms must occur in at least 10 reviews and in at most 75% of them; at most 5000 are kept.
vectorizer = TfidfVectorizer(
    max_features=5000,
    max_df=0.75,
    min_df=10,
)
features = vectorizer.fit_transform(train['reviews.text'])

In [None]:
# Fit a 10-topic LDA model.
# The document-term matrix must be `features`, i.e. the output of the current `vectorizer`:
# the older `tf_vectors` was built with a different vocabulary, so its column indices do
# not line up with the feature names later used to label each topic.
# random_state makes the online variational updates reproducible.
# NOTE(review): LDA is normally fitted on raw term counts rather than TF-IDF weights —
# consider a CountVectorizer with the same settings.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=3,
    learning_method='online',
    n_jobs=-1,
    random_state=42,
)
W1 = lda.fit_transform(features)  # document-topic weights
H1 = lda.components_  # topic-term weights, one row per topic

In [None]:
# Top `num_words` vocabulary terms for every LDA topic.
num_words = 10

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it is
# available and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = np.array(vectorizer.get_feature_names_out())
else:
    vocabulary = np.array(vectorizer.get_feature_names())


def top_words(t):
    """Return the `num_words` highest-weighted vocabulary terms of topic vector `t`, best first."""
    # argsort is ascending, so walk the tail backwards to get the largest weights first.
    return [vocabulary[i] for i in np.argsort(t)[:-num_words-1:-1]]


topic_words = [top_words(t) for t in H1]
topics = [" ".join(t) for t in topic_words]

In [None]:
# Display the LDA topics, each as a space-joined string of its top words.
topics

['entertainment enjoy lock complaint charger mine graphic gift camera app',
 'hd operate mode display drawback hub mini inch oasis news',
 'button help parent mini hour kind get drawback bedroom paperwhite',
 'mini answer graphic pair family display news compare drawback dot',
 'check adult line drawback connect get grandson deal love enjoy',
 'cover fan mini family get drawback content ebook holiday customer',
 'package internet control news alexa kid drop movie feature lyric',
 'chat news drawback download lightweight mini info game pair get',
 'difference get lot brand ease device bedroom child fit interest',
 'bed item job friend fan buy command disappoint guess couple']

In [None]:
# Perform topic modelling using Non-negative Matrix Factorization (NMF)

In [None]:
# Factorize the TF-IDF matrix into 10 topics: W holds document-topic weights and
# H holds topic-term weights. random_state keeps the factorization reproducible.
nmf_model = NMF(n_components=10, random_state=42)
W = nmf_model.fit_transform(features)
H = nmf_model.components_

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it is
# available and fall back to the old method on older releases. list() keeps the
# original list type regardless of which method is called.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

In [None]:
# Print one line per NMF topic: the topic index followed by its `num_words`
# highest-weighted terms, best first.
num_words = 10
for topic_idx, weights in enumerate(H):
    # argsort is ascending; reading the tail backwards yields the largest weights first.
    best_ids = weights.argsort()[-1:-num_words-1:-1]
    # end=" \n" reproduces the trailing space that precedes each line break.
    print(topic_idx, *(feature_names[fid] for fid in best_ids), end=" \n")

0 tablet child fire app year time purchase need daughter store 
1 love year daughter wife granddaughter age grandson reader camera fact 
2 use set daughter fire fun day lot mom ease item 
3 product set child family service friend purchase fun quality parent 
4 echo music device screen video show dot home speaker feature 
5 gift wife purchase year mother son birthday granddaughter fire item 
6 kid app time year lot control case adult thing child 
7 book game play read year purchase size fire download movie 
8 price quality size value feature sale buy beat screen ipad 
9 work app set download issue item camera need enjoy problem 


In [None]:
# Contents of each NMF topic: the `num_words` highest-weighted terms per row of H.
# This rebinds `vocabulary`, `top_words`, `topic_words` and `topics`.
num_words = 10

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it is
# available and fall back to the old method on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = np.array(vectorizer.get_feature_names_out())
else:
    vocabulary = np.array(vectorizer.get_feature_names())


def top_words(t):
    """Return the `num_words` highest-weighted vocabulary terms of topic vector `t`, best first."""
    # argsort is ascending, so walk the tail backwards to get the largest weights first.
    return [vocabulary[i] for i in np.argsort(t)[:-num_words-1:-1]]


topic_words = [top_words(t) for t in H]
topics = [" ".join(t) for t in topic_words]

In [None]:
# Display the NMF topics, each as a space-joined string of its top words.
topics 

['echo alexa show music amazon home sound like plus light',
 'tablet need price amazon fire apps nice perfect game daughter',
 'love daughter gift absolutely get christmas son wife granddaughter grandson',
 'use easy set product light purchase fun every really day',
 'kindle read book fire like size screen light much new',
 'great work price product sound well recommend battery item quality',
 'buy one best gift christmas get wife would replace another',
 'good product recommend price would quality pretty excellent sound really',
 'old year purchase game grandson play perfect granddaughter son years',
 'kid apps game lot play like little friendly really enjoy']