### Part I: Set up Google Drive Environment



In [46]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [47]:
file = drive.CreateFile({'id':'####'}) # put your id 
file.GetContentFile('review_data.tsv')  

### Part II: Load the data and libraries

In [48]:
import numpy as np
import pandas as pd
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
# download punctuation package
nltk.download('punkt') 
# download stopwords package
nltk.download('stopwords') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
# Load the data into dataframe
df = pd.read_csv('review_data.tsv', sep = '\t', error_bad_lines = False)

b'Skipping line 8704: expected 15 fields, saw 22\nSkipping line 16933: expected 15 fields, saw 22\nSkipping line 23726: expected 15 fields, saw 22\n'
b'Skipping line 85637: expected 15 fields, saw 22\n'
b'Skipping line 132136: expected 15 fields, saw 22\nSkipping line 158070: expected 15 fields, saw 22\nSkipping line 166007: expected 15 fields, saw 22\nSkipping line 171877: expected 15 fields, saw 22\nSkipping line 177756: expected 15 fields, saw 22\nSkipping line 181773: expected 15 fields, saw 22\nSkipping line 191085: expected 15 fields, saw 22\nSkipping line 196273: expected 15 fields, saw 22\nSkipping line 196331: expected 15 fields, saw 22\n'
b'Skipping line 197000: expected 15 fields, saw 22\nSkipping line 197011: expected 15 fields, saw 22\nSkipping line 197432: expected 15 fields, saw 22\nSkipping line 208016: expected 15 fields, saw 22\nSkipping line 214110: expected 15 fields, saw 22\nSkipping line 244328: expected 15 fields, saw 22\nSkipping line 248519: expected 15 fields,

In [50]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,3653882,R3O9SGZBVQBV76,B00FALQ1ZC,937001370,"Invicta Women's 15150 ""Angel"" 18k Yellow Gold ...",Watches,5,0,0,N,Y,Five Stars,Absolutely love this watch! Get compliments al...,2015-08-31
1,US,14661224,RKH8BNC3L5DLF,B00D3RGO20,484010722,Kenneth Cole New York Women's KC4944 Automatic...,Watches,5,0,0,N,Y,I love thiswatch it keeps time wonderfully,I love this watch it keeps time wonderfully.,2015-08-31
2,US,27324930,R2HLE8WKZSU3NL,B00DKYC7TK,361166390,Ritche 22mm Black Stainless Steel Bracelet Wat...,Watches,2,1,1,N,Y,Two Stars,Scratches,2015-08-31
3,US,7211452,R31U3UH5AZ42LL,B000EQS1JW,958035625,Citizen Men's BM8180-03E Eco-Drive Stainless S...,Watches,5,0,0,N,Y,Five Stars,"It works well on me. However, I found cheaper ...",2015-08-31
4,US,12733322,R2SV659OUJ945Y,B00A6GFD7S,765328221,Orient ER27009B Men's Symphony Automatic Stain...,Watches,4,0,0,N,Y,"Beautiful face, but cheap sounding links",Beautiful watch face. The band looks nice all...,2015-08-31


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960204 entries, 0 to 960203
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960204 non-null  object
 1   customer_id        960204 non-null  int64 
 2   review_id          960204 non-null  object
 3   product_id         960204 non-null  object
 4   product_parent     960204 non-null  int64 
 5   product_title      960202 non-null  object
 6   product_category   960204 non-null  object
 7   star_rating        960204 non-null  int64 
 8   helpful_votes      960204 non-null  int64 
 9   total_votes        960204 non-null  int64 
 10  vine               960204 non-null  object
 11  verified_purchase  960204 non-null  object
 12  review_headline    960197 non-null  object
 13  review_body        960056 non-null  object
 14  review_date        960200 non-null  object
dtypes: int64(5), object(10)
memory usage: 109.9+ MB


In [52]:
# Remove missing value, since we can't impute missing value for review
df.dropna(subset=['review_body'], inplace=True)

In [53]:
df.reset_index(inplace=True, drop = True) # reset the index

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960056 entries, 0 to 960055
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960056 non-null  object
 1   customer_id        960056 non-null  int64 
 2   review_id          960056 non-null  object
 3   product_id         960056 non-null  object
 4   product_parent     960056 non-null  int64 
 5   product_title      960054 non-null  object
 6   product_category   960056 non-null  object
 7   star_rating        960056 non-null  int64 
 8   helpful_votes      960056 non-null  int64 
 9   total_votes        960056 non-null  int64 
 10  vine               960056 non-null  object
 11  verified_purchase  960056 non-null  object
 12  review_headline    960049 non-null  object
 13  review_body        960056 non-null  object
 14  review_date        960052 non-null  object
dtypes: int64(5), object(10)
memory usage: 109.9+ MB


In [55]:
# Subset first 5000 rows for my analysis
data = df.loc[:4999, 'review_body'].tolist()

In [56]:
data

['Absolutely love this watch! Get compliments almost every time I wear it. Dainty.',
 'I love this watch it keeps time wonderfully.',
 'Scratches',
 'It works well on me. However, I found cheaper prices in other places after making the purchase',
 "Beautiful watch face.  The band looks nice all around.  The links do make that squeaky cheapo noise when you swing it back and forth on your wrist which can be embarrassing in front of watch enthusiasts.  However, to the naked eye from afar, you can't tell the links are cheap or folded because it is well polished and brushed and the folds are pretty tight for the most part.<br /><br />I love the new member of my collection and it looks great.  I've had it for about a week and so far it has kept good time despite day 1 which is typical of a new mechanical watch",
 'i love this watch for my purpose, about the people complaining should of done their research better before buying. dumb people.',
 'for my wife and she loved it, looks great and a 

### Part III: Tokenizing and Stemming

In [57]:
# Use nltk's English stopwords 
stopwords = nltk.corpus.stopwords.words('english')


In [58]:
# Append self-created stopwords
stopwords.append("'s")
stopwords.append("'m")
stopwords.append("br")
stopwords.append("watch")


In [59]:
print ("We use " + str(len(stopwords)) + " stop-words from nltk library")
print (stopwords[:10])

We use 183 stop-words from nltk library
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


Define a function to tokenize and stem our reviews

In [60]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

# Define a function to tokenize and stem
def tokenization_and_stemming(text):
  tokens = []
  # exclude stop words and tokenize the document, generate a list of string
  for word in nltk.word_tokenize(text):
    if word.lower() not in stopwords:
      tokens.append(word.lower())
    
  filtered_tokens = []
  # filter out any tokens that are not words, i.e numbers, emoji etc
  for token in tokens:
    if token.isalpha(): # just keep the tokens that are words
      filtered_tokens.append(token)
  # stemming
  stems = [stemmer.stem(t) for t in filtered_tokens]
  return stems

In [61]:
tokenization_and_stemming(data[0])

['absolut',
 'love',
 'get',
 'compliment',
 'almost',
 'everi',
 'time',
 'wear',
 'dainti']

### Part IV: TF-IDF

In [62]:
# Build tf-idf vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_model = TfidfVectorizer(max_df = 0.99, max_features = 1000,
                              min_df = 0.01, stop_words = 'english', 
                              use_idf = True, tokenizer = tokenization_and_stemming, 
                              ngram_range = (1,1))



In [63]:
tfidf_matrix = tfidf_model.fit_transform(data) 

  'stop_words.' % sorted(inconsistent))


In [64]:
print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
       " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")

In total, there are 5000 reviews and 234 terms.


In [65]:
tfidf_matrix.toarray() 

array([[0.        , 0.52862754, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [66]:
# Save terms identified by tf-idf
tf_selected_words = tfidf_model.get_feature_names()

In [67]:
tf_selected_words

['abl',
 'absolut',
 'accur',
 'actual',
 'adjust',
 'alarm',
 'alreadi',
 'alway',
 'amaz',
 'amazon',
 'anoth',
 'anyth',
 'appear',
 'arriv',
 'attract',
 'automat',
 'awesom',
 'bad',
 'band',
 'batteri',
 'beauti',
 'best',
 'better',
 'big',
 'bit',
 'black',
 'blue',
 'bought',
 'box',
 'bracelet',
 'brand',
 'break',
 'broke',
 'button',
 'buy',
 'ca',
 'came',
 'case',
 'casio',
 'chang',
 'cheap',
 'clasp',
 'classi',
 'clear',
 'clock',
 'color',
 'come',
 'comfort',
 'compliment',
 'cool',
 'cost',
 'coupl',
 'crystal',
 'cute',
 'dark',
 'date',
 'daughter',
 'day',
 'deal',
 'definit',
 'deliveri',
 'design',
 'dial',
 'differ',
 'difficult',
 'digit',
 'disappoint',
 'display',
 'durabl',
 'easi',
 'easili',
 'eleg',
 'end',
 'everi',
 'everyday',
 'everyth',
 'exact',
 'excel',
 'expect',
 'expens',
 'face',
 'far',
 'fast',
 'favorit',
 'featur',
 'feel',
 'fell',
 'fine',
 'fit',
 'function',
 'gave',
 'gift',
 'glass',
 'goe',
 'gold',
 'good',
 'got',
 'great',
 'ha

### Part V: Topic Modelling - Latent Dirichlet Allocation

In [68]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 5) # Divide to 5 topics

In [69]:
# document topic matrix for tfidf_matrix _lda
lda_output = lda.fit_transform(tfidf_matrix)
print(lda_output.shape)
print(lda_output)

(5000, 5)
[[0.06165284 0.05942584 0.05995579 0.50961369 0.30935185]
 [0.090843   0.08453656 0.08853776 0.65022976 0.08585292]
 [0.10224939 0.10000324 0.59774175 0.10000292 0.10000271]
 ...
 [0.1028781  0.10007366 0.10155001 0.10000084 0.59549739]
 [0.06078251 0.22833713 0.22431944 0.42366207 0.06289885]
 [0.08425744 0.0841967  0.08421    0.6629514  0.08438446]]


In [70]:
# topic and words matrix
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(5, 234)
[[17.88321039  7.49263141 18.90987954 ... 14.1376626  58.22048214
  53.7196922 ]
 [ 0.20035393  0.20077259  0.2037901  ...  0.20132927  0.20179274
   0.29214242]
 [ 2.73518371  0.2007668   6.68571367 ...  0.20280516 27.65478098
   0.20407801]
 [ 0.20044395  0.20465019  0.20190757 ...  0.20043281  1.36864484
   0.2124158 ]
 [ 1.33559966 21.83239455  0.56706267 ... 23.86837039  6.95970859
   8.60340028]]


In [71]:
# Create column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]
# Create index names
doc_names = ["Doc" + str(i) for i in range(len(data))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topic_names, index = doc_names)


In [72]:
df_document_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4
Doc0,0.06,0.06,0.06,0.51,0.31
Doc1,0.09,0.08,0.09,0.65,0.09
Doc2,0.1,0.1,0.6,0.1,0.1
Doc3,0.76,0.06,0.06,0.06,0.06
Doc4,0.41,0.04,0.46,0.04,0.04


In [73]:
# Get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis = 1)
df_document_topic['topic'] = topic
df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,topic
Doc0,0.06,0.06,0.06,0.51,0.31,3
Doc1,0.09,0.08,0.09,0.65,0.09,3
Doc2,0.1,0.1,0.6,0.1,0.1,2
Doc3,0.76,0.06,0.06,0.06,0.06,0
Doc4,0.41,0.04,0.46,0.04,0.04,2
Doc5,0.69,0.07,0.07,0.09,0.08,0
Doc6,0.06,0.06,0.06,0.07,0.74,4
Doc7,0.75,0.06,0.06,0.06,0.06,0
Doc8,0.81,0.05,0.05,0.05,0.05,0
Doc9,0.77,0.06,0.06,0.06,0.06,0


In [74]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
0,2009
2,954
1,739
4,686
3,612


In [75]:
# topic word matrix
print(lda.components_)
# topic-word matrix
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names()
df_topic_words.index = topic_names
df_topic_words.head()



[[17.88321039  7.49263141 18.90987954 ... 14.1376626  58.22048214
  53.7196922 ]
 [ 0.20035393  0.20077259  0.2037901  ...  0.20132927  0.20179274
   0.29214242]
 [ 2.73518371  0.2007668   6.68571367 ...  0.20280516 27.65478098
   0.20407801]
 [ 0.20044395  0.20465019  0.20190757 ...  0.20043281  1.36864484
   0.2124158 ]
 [ 1.33559966 21.83239455  0.56706267 ... 23.86837039  6.95970859
   8.60340028]]


Unnamed: 0,abl,absolut,accur,actual,adjust,alarm,alreadi,alway,amaz,amazon,anoth,anyth,appear,arriv,attract,automat,awesom,bad,band,batteri,beauti,best,better,big,bit,black,blue,bought,box,bracelet,brand,break,broke,button,buy,ca,came,case,casio,chang,...,small,smaller,solid,someth,son,star,start,stop,strap,style,stylish,super,sure,tell,thank,thing,think,thought,time,timex,took,tri,turn,use,valu,want,watch,water,way,wear,week,weight,white,wife,wish,work,worn,worth,wrist,year
Topic0,17.88321,7.492631,18.90988,20.058076,14.856417,19.69409,18.332596,23.868203,39.958676,26.014957,20.775363,4.875508,11.090075,26.96728,24.391743,9.624245,0.204745,15.138523,82.691116,36.306286,20.751341,10.729672,26.334727,39.231456,24.837363,26.462477,15.294973,41.144436,10.335807,5.255357,15.780065,10.79022,0.203886,26.690182,38.349055,18.757443,16.455577,24.095131,24.465897,22.580145,...,52.372021,10.435616,1.153501,19.476568,0.24674,22.811851,18.463621,15.344221,19.106392,14.289775,2.679459,2.132796,22.145302,17.661333,0.205163,23.723674,25.720304,10.614051,106.335428,25.134346,9.396802,17.896378,10.116059,62.92039,4.707646,31.301971,63.960708,40.779114,20.741006,73.465476,23.606354,6.29826,20.429216,0.646211,21.012978,65.360705,11.104741,14.137663,58.220482,53.719692
Topic1,0.200354,0.200773,0.20379,2.030607,0.200866,0.200088,0.203251,0.202712,0.202696,5.463159,0.851561,0.204363,0.200658,11.62194,0.200348,0.20064,0.200695,20.409535,8.692447,0.200912,0.20172,0.212525,24.78468,0.202059,0.201292,0.709189,0.200402,0.205023,5.214039,0.200563,6.72625,1.352895,0.200347,0.20068,2.263594,0.201573,4.399261,0.201196,0.201021,0.200313,...,0.200831,0.201771,0.201277,2.840414,0.200733,0.835911,0.20012,0.226517,0.201231,4.972541,2.527947,1.512055,0.201245,0.200811,2.459175,0.65274,0.203751,0.258605,9.127459,0.200821,0.201255,0.204321,0.201025,0.925816,29.059666,35.928124,11.592367,0.201987,0.201923,0.315121,0.201896,0.201288,0.20052,0.200801,0.200482,57.00737,0.200783,0.201329,0.201793,0.292142
Topic2,2.735184,0.200767,6.685714,2.068968,17.379949,3.195158,1.216409,0.20121,0.200838,1.004161,2.063772,5.61025,2.917388,3.931058,0.595101,6.813343,0.200865,0.20363,47.058329,0.201793,4.095269,26.484577,2.998579,30.358175,16.615346,6.511097,14.855807,1.312075,8.297542,16.221641,0.201347,3.926512,0.20169,0.201272,6.960465,11.654532,6.925041,0.915454,1.573677,5.982535,...,22.119791,14.421016,21.320127,1.596074,0.201263,9.225916,0.200723,3.037708,12.494138,0.844303,0.202015,7.46181,0.366937,11.6732,0.209236,10.658463,4.540032,11.608799,34.862991,0.20248,2.16698,8.973663,9.324127,14.512291,0.257446,1.04307,6.99127,0.948732,10.992246,7.624472,6.716954,10.622745,0.200828,0.201757,4.217132,20.225002,6.635063,0.202805,27.654781,0.204078
Topic3,0.200444,0.20465,0.201908,0.203336,0.200651,0.203064,0.200813,3.407266,0.201888,2.875099,8.372419,0.202356,0.204503,0.204599,0.201837,0.201287,0.201186,0.201124,0.319262,0.565753,116.009601,1.356249,0.42989,4.589411,0.200821,0.200727,0.201166,25.286315,0.205052,0.200923,0.201563,0.201039,0.200278,0.200391,16.944577,3.119839,0.268779,0.201513,3.182103,0.200277,...,0.201014,0.20053,0.200773,0.201027,0.20147,0.201374,0.212382,0.201077,0.200835,2.83212,25.703902,7.299566,0.201267,0.20131,51.550541,0.20086,4.709962,0.202668,7.767536,0.200887,0.201747,0.200264,0.200465,0.201305,0.262351,0.200901,13.152185,0.200792,3.176742,8.796877,0.201691,9.808721,0.200666,0.202231,0.201894,4.055733,0.202725,0.200433,1.368645,0.212416
Topic4,1.3356,21.832395,0.567063,1.400956,0.201408,0.200276,0.20505,0.201164,0.202336,0.214855,5.509008,5.75337,5.889203,0.202762,0.200293,0.201276,58.116389,0.200722,27.895935,29.589608,0.205376,0.201401,0.273443,0.555474,1.468847,0.927915,0.201372,13.084615,9.683449,2.183395,0.203638,2.84486,49.481257,0.200423,12.854697,3.223553,16.650208,0.200827,0.202253,0.200928,...,0.200662,0.218713,0.200675,0.744117,30.762833,0.201389,0.20234,19.95357,35.797338,11.298295,0.220852,3.88659,3.271964,0.203499,0.248265,2.956673,0.320509,0.265385,13.946615,0.201958,7.025957,0.20309,0.201482,23.798661,0.201043,0.216187,0.917424,0.20098,0.201702,17.846643,21.961032,0.201769,0.201586,36.034213,2.769951,40.305171,0.201615,23.86837,6.959709,8.6034


In [76]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words


In [77]:
topic_keywords = print_topic_words(tfidf_model = tfidf_model, lda_model = lda, n_words=15)

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word' + str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic ' + str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word0,Word1,Word2,Word3,Word4,Word5,Word6,Word7,Word8,Word9,Word10,Word11,Word12,Word13,Word14
Topic 0,time,look,like,band,recommend,wear,easi,work,watch,face,read,use,day,great,wrist
Topic 1,good,product,expect,qualiti,like,work,price,exact,look,want,great,nice,pictur,happi,valu
Topic 2,nice,excel,look,cheap,band,littl,like,time,realli,cute,link,big,eleg,wrist,price
Topic 3,love,beauti,perfect,gift,thank,cool,compliment,great,receiv,daughter,stylish,bought,item,pleas,lot
Topic 4,great,look,awesom,love,broke,work,husband,wife,strap,price,fast,son,ship,batteri,got


### Part VI K-means clustering

In [78]:
# k- means clustering
from sklearn.cluster import KMeans
num_clusters = 5
# number of clusters
km = KMeans(n_clusters = num_clusters)
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [79]:
clusters = km.labels_.tolist()


#### Analyze K-means Result

In [81]:
# Create DataFrame films from all of the input files
product = {'review': df[:5000].review_body, 'cluster': clusters}
frame = pd.DataFrame(product, columns = ['review', 'cluster'])

In [82]:
frame.head(10)

Unnamed: 0,review,cluster
0,Absolutely love this watch! Get compliments al...,4
1,I love this watch it keeps time wonderfully.,4
2,Scratches,1
3,"It works well on me. However, I found cheaper ...",1
4,Beautiful watch face. The band looks nice all...,1
5,"i love this watch for my purpose, about the pe...",1
6,"for my wife and she loved it, looks great and ...",2
7,I was about to buy this thinking it was a Swis...,1
8,Watch is perfect. Rugged with the metal &#34;B...,2
9,Great quality and build.<br />The motors are r...,1


In [83]:
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of reviews included in each cluster:


Unnamed: 0,cluster
1,3458
2,502
4,438
0,315
3,287


In [84]:
# Check the centers of each cluster
km.cluster_centers_

array([[0.        , 0.        , 0.00104262, ..., 0.00224199, 0.00511506,
        0.00101252],
       [0.00591508, 0.00462636, 0.00658705, ..., 0.00911419, 0.02429341,
        0.01570639],
       [0.00106122, 0.        , 0.00234634, ..., 0.00575306, 0.00708565,
        0.01143231],
       [0.        , 0.        , 0.00229825, ..., 0.00391981, 0.00460418,
        0.        ],
       [0.00083955, 0.02952797, 0.00142566, ..., 0.00313806, 0.00664191,
        0.00379238]])

In [85]:
km.cluster_centers_.shape

(5, 234)

In [86]:
#Denotes the importances of each items in centroid
# Sort it in decreasing-order and get the top k items
order_centroids = km.cluster_centers_.argsort()[:,::-1]


In [87]:
order_centroids.shape

(5, 234)

In [88]:
Cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    Cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :6]: #replace 5 with n words per cluster
        Cluster_keywords_summary[i].append(tf_selected_words[ind])
        print (tf_selected_words[ind] + ",", end='')
    print ()
    
    cluster_reviews = frame[frame.cluster==i].review.tolist()
    print ("Cluster " + str(i) + " reviews (" + str(len(cluster_reviews)) + " reviews): ")
    print (", ".join(cluster_reviews))
    print ()

Cluster 0 words:good,product,recommend,look,price,qualiti,
Cluster 0 reviews (315 reviews): 
very good, It's a good value, and a good functional watch strap.  It's super wide though, and takes more space on the wrist than I'd like., very good, Excellent product and seller very good service, Good value product, Looks good. Not bulky. Keeps good time., good, works good so far., good, very good., Good, Good quality with a reasonable price, good quality. on time, Good looking and smart! Works like a charm., Good for work, Very stylish; looks good on my wrist. Lots of compliments. A good buy., Good Product & good seller, It's not keeping good time, good, it 's good watch . I wear when I work. it light and look style.thanks, its good, Good!, good, Good, good, A little big, Good!, so far so good,  it is a good looking watch.  With all the necessary features one needs.  only drawback is the date window, the date is awfully small., Very good, very good, Good, Good product, If looking for a FUN 