# Topic Modeling
## K-means Clustering
This notebook uses k-means clustering to create topic models. We will first clean the data, then segment the reviews based on good and bad ratings.

## Stopwords Cleansing

In [2]:
import json
reviews = json.load(open('sperryreviews.json', 'r'))

In [3]:
from nltk.corpus import stopwords # ignore unimportant words like a, of, etc.
from sklearn.feature_extraction.text import TfidfVectorizer # meaningful words that occur frequently within data
from sklearn.cluster import KMeans # k means clustering

In [4]:
# load stopwords
stop_words = stopwords.words('english')
stop_words.append('sperry')

In [5]:
# create empty set
texts = set()

In [6]:
# function to format review data
def load_texts(topicdata):
    for review in topicdata:
        if 'reviewText' in topicdata[review]:
            reviewtext = topicdata[review]['reviewText']
            summary = topicdata[review]['summary']
            asin = topicdata[review]['asin']
    
            review = '%s %s %s' % (asin, summary, reviewtext)
                
            texts.add(review)

In [8]:
# run function           
load_texts(reviews)

In [9]:
# coerce to list
documents = list(texts)

In [10]:
# bring stopwords into model
vectorizer = TfidfVectorizer(stop_words=stop_words)
x = vectorizer.fit_transform(documents)

## K-Means Clustering

In [17]:
# establish number of topics
true_k = 4

In [18]:
# fit model
model = KMeans(n_clusters=true_k, max_iter=100000)
model.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100000,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
# find top terms per cluster
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names()

for i in range(true_k):
    topic_terms = [terms[ind] for ind in order_centroids[i,:4]]
    print('%d: %s' % (i, ' '.join(topic_terms)))

Top terms per cluster
0: love shoes comfortable fit
1: size big shoes shoe
2: shoes shoe comfortable pair
3: great shoe shoes fit


In [20]:
# classify documents
import os
outputfiles = {}

# initialize directory only once
try:
    os.mkdir('output')

except OSError:
    print('directory already exists')
    
else:
    print('successfully created the directory')

successfully created the directory


In [21]:
# extract terms for each topic in model
# :4 means take last four terms from variable
for topic in range(true_k):
    topic_terms = [terms[ind] for ind in order_centroids[topic, :4]]
    # create empty output files for each topic
    outputfiles[topic] = open(os.path.join('output', '_'.join(topic_terms) + '.txt'), 'w')

In [22]:
# combine metadata with reviewdata and write to file
for review in reviews:
    if 'reviewText' in reviews[review]:
        thereview = reviews[review]
        review_str = '%s %s %s' % (thereview['asin'], thereview['summary'], thereview['reviewText'])
        Y = vectorizer.transform([review_str])
        
        # reviews can have multiple classifications
        for prediction in model.predict(Y):
            outputfiles[prediction].write('%s\n' % review_str)

In [23]:
# close all output files            
for n, f in outputfiles.items():
    f.close()

## Bad Reviews
Product reviews of 2 or fewer stars.

In [24]:
badreviews = set()

def bad_review(topicdata):
    for review in topicdata:
        if 'reviewText' in topicdata[review]:
            if 'overall' in topicdata[review]:
            
                if int(topicdata[review]['overall']) <= 2:
                
                    reviewtext = topicdata[review]['reviewText']
                    summary = topicdata[review]['summary']
                    asin = topicdata[review]['asin']
        
                    review = '%s %s %s' % (asin, summary, reviewtext)
                    
                    badreviews.add(review)

In [26]:
bad_review(reviews)
documents=list(badreviews)

In [27]:
# stopwords
vectorizer=TfidfVectorizer(stop_words=stop_words)
X=vectorizer.fit_transform(documents)

In [28]:
true_k=4

In [29]:
# fit model
model=KMeans(n_clusters=true_k,max_iter=100000)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100000,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [30]:
print("Top terms per Cluster")

order_centroids=model.cluster_centers_.argsort()[:,::-1]
terms=vectorizer.get_feature_names()

for i in range(true_k):
    topic_terms=[terms[ind]for ind in order_centroids[i,:10]]
    print('%d: %s' %(i,' '.join(topic_terms)))

Top terms per Cluster
0: size small big ordered shoe fit shoes way large 10
1: shoes shoe color return like pair would buy comfortable 34
2: quality shoes shoe socks apart wear pair good one poor
3: narrow wide fit shoe width shoes foot small tight back


In [31]:
#output of bad reviews
import os
outfiles={}

try:
  os.mkdir('badoutput')

except OSError:
    print ('directory already exists')

else:
    print ('Successfully created the directory')

Successfully created the directory


In [32]:
# combine data
for atopic in range(true_k):
    topicterms = [terms[ind] for ind in order_centroids[atopic, :4]]
    outfiles[atopic] = open(os.path.join('badoutput', '_'.join(topicterms) + '.txt'), 'w')

for areview in reviews:
    if 'reviewText' in reviews[areview]:
        thereview = reviews[areview]
        reviewwithmetadata = '%s %s %s' % (thereview['asin'], thereview['summary'], thereview['reviewText'])
        Y = vectorizer.transform([reviewwithmetadata])
        for prediction in model.predict(Y):
            outfiles[prediction].write('%s\n' % reviewwithmetadata)
            
for n, f in outfiles.items():
    f.close()

## Good Reviews

In [33]:
goodreviews = set()

def good_review(topicdata):
    for review in topicdata:
        if 'reviewText' in topicdata[review]:
            if 'overall' in topicdata[review]:
            
            # star reviews
                if int(topicdata[review]['overall']) >= 4:
                
                    reviewtext = topicdata[review]['reviewText']
                    summary = topicdata[review]['summary']
                    asin = topicdata[review]['asin']
        
                    review = '%s %s %s' % (asin, summary, reviewtext)
                    
                    goodreviews.add(review)

In [34]:
good_review(reviews)
documents=list(goodreviews)

In [35]:
# stopwords
vectorizer=TfidfVectorizer(stop_words=stop_words)
X=vectorizer.fit_transform(documents)

In [36]:
true_k=2

In [37]:
# fit model
model=KMeans(n_clusters=true_k,max_iter=100000)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100000,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [38]:
print("Top terms per Cluster")

order_centroids=model.cluster_centers_.argsort()[:,::-1]
terms=vectorizer.get_feature_names()

for i in range(true_k):
    topic_terms=[terms[ind]for ind in order_centroids[i,:5]]
    print('%d: %s' %(i,' '.join(topic_terms)))

Top terms per Cluster
0: shoes great shoe love comfortable
1: size shoes shoe fit big


In [39]:
#output of good reviews
import os
outfiles={}

try:
  os.mkdir('goodoutput')

except OSError:
    print ('directory already exists')

else:
    print ('Successfully created the directory')

for atopic in range(true_k):
    topicterms = [terms[ind] for ind in order_centroids[atopic, :4]]
    outfiles[atopic] = open(os.path.join('goodoutput', '_'.join(topicterms) + '.txt'), 'w')

for areview in reviews:
    if 'reviewText' in reviews[areview]:
        thereview = reviews[areview]
        reviewwithmetadata = '%s %s %s' % (thereview['asin'], thereview['summary'], thereview['reviewText'])
        Y = vectorizer.transform([reviewwithmetadata])
        for prediction in model.predict(Y):
            outfiles[prediction].write('%s\n' % reviewwithmetadata)
            
for n, f in outfiles.items():
    f.close()

Successfully created the directory
