In [5]:
import logging
import os

import numpy as np
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sqlalchemy import create_engine

# gensim reports its training progress through the standard logging module;
# configure the root logger at INFO so those messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [6]:
# Read the connection URL from the environment rather than hardcoding the
# database user, password and host in the notebook, e.g.
#   export SEPHORA_DB_URL='postgresql://<user>:<password>@<host>:5432/sephora'
db_url = os.environ.get('SEPHORA_DB_URL')
if not db_url:
    raise RuntimeError(
        'Set the SEPHORA_DB_URL environment variable to the PostgreSQL '
        'connection URL of the sephora database before running this cell.'
    )

# Pull the full review table into a DataFrame; `df` is used by the cells below.
cnx = create_engine(db_url)
query = 'SELECT * FROM sephora_product_review'
df = pd.read_sql_query(query, cnx)
df.head()

Unnamed: 0,product_id,review_title,review_text,rating,age_range,skin_type,skin_tone,eye_color,reviewer_username,tags
0,P102503,love it!,Thinks one of my favorite products. I love the...,5,,,,,angelesmichelle,
1,P102503,Great!,I started getting cystic acne when I turned 25...,5,,,,,Lady46,
2,P102503,Not for me,Didn't do a thing for me. I like my face to fe...,1,,combination,,,corol,[lightweight]
3,P102503,Wonderful moisturizer,I could see a wonderful difference in my very ...,5,,dry,,,Bamawoman,"[absorbs quickly, intensive, pleasant faint fr..."
4,P102503,Favorite Moisturizer ever,I have tried 100s Of different moisturizer and...,5,,,,,leighalbs,


In [7]:
# Series.as_matrix() is deprecated and was removed in pandas 1.0; .values
# returns the same NumPy array of review strings on old and new pandas alike.
review_docs = df['review_text'].values
# Preview the first three reviews as a sanity check.
review_docs[:3]

array([ 'Thinks one of my favorite products. I love the way it makes my skin feel and how moisturizing it is. Good product and results all the way!',
       "I started getting cystic acne when I turned 25 and tried all kinds of products including prescriptions. This is the only moisturizer that works for me without causing breakouts or oily skin. It's a little pricy but a little goes a long way so it will last awhile.",
       "Didn't do a thing for me. I like my face to feel clean after I wash and that didn't happen with this product."], dtype=object)

In [8]:
# Bag-of-words over 1- to 3-grams with scikit-learn's English stop words removed;
# the token pattern keeps only runs of two or more letters a-z.
count_vectorizer = CountVectorizer(ngram_range=(1, 3),
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")

# fit_transform learns the vocabulary and builds the document-term matrix in one
# pass, instead of tokenizing every review twice (once in fit, once in transform).
# Transpose it so the terms are the rows: Sparse2Corpus reads each column as a
# document by default.
counts = count_vectorizer.fit_transform(review_docs).transpose()
corpus = matutils.Sparse2Corpus(counts)

# vocabulary_ maps term -> column index; invert it to the id -> term mapping
# that is passed to the LDA model as id2word.
id2word = {term_id: term for term, term_id in count_vectorizer.vocabulary_.items()}

In [9]:
# Train a 6-topic LDA model with 10 passes over the corpus.
# random_state pins the random initialisation so the learned topics are
# reproducible across kernel restarts; without it every run gives different topics.
# NOTE(review): this cell is slow - the recorded log shows the first pass alone
# taking over 20 minutes. Presumably the large 1-3 gram vocabulary is the cause;
# consider pruning rare terms (e.g. CountVectorizer min_df) - TODO confirm.
lda = models.LdaModel(corpus=corpus, num_topics=6, id2word=id2word, passes=10,
                      random_state=42)

2017-08-14 14:40:54,291 : INFO : using symmetric alpha at 0.16666666666666666
2017-08-14 14:40:54,292 : INFO : using symmetric eta at 3.50356820904e-07
2017-08-14 14:40:54,808 : INFO : using serial LDA version on this node
2017-08-14 14:43:10,165 : INFO : running online (multi-pass) LDA training, 6 topics, 10 passes over the supplied corpus of 111200 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-08-14 14:43:10,203 : INFO : PROGRESS: pass 0, at document #2000/111200
2017-08-14 14:43:16,893 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 14:43:21,709 : INFO : topic #3 (0.167): 0.010*"eye" + 0.010*"skin" + 0.007*"cream" + 0.005*"product" + 0.004*"eye cream" + 0.004*"use" + 0.003*"using" + 0.003*"ve" + 0.003*"love" + 0.003*"eyes"
2017-08-14 14:43:21,763 : INFO : topic #1 (0.167): 0.009*"skin" + 0.006*"product" + 0.006*"cream" + 0.005*"eyes

2017-08-14 14:44:38,296 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 14:44:49,991 : INFO : topic #0 (0.167): 0.015*"skin" + 0.005*"like" + 0.004*"face" + 0.004*"dry" + 0.003*"love" + 0.003*"use" + 0.003*"product" + 0.003*"really" + 0.002*"just" + 0.002*"feels"
2017-08-14 14:44:50,052 : INFO : topic #4 (0.167): 0.025*"skin" + 0.008*"acne" + 0.008*"product" + 0.006*"love" + 0.005*"oily" + 0.005*"moisturizer" + 0.004*"using" + 0.004*"dry" + 0.004*"cleanser" + 0.004*"sensitive"
2017-08-14 14:44:50,113 : INFO : topic #5 (0.167): 0.006*"face" + 0.005*"skin" + 0.004*"use" + 0.004*"cleanser" + 0.003*"just" + 0.003*"using" + 0.003*"used" + 0.002*"products" + 0.002*"like" + 0.002*"ve"
2017-08-14 14:44:50,174 : INFO : topic #3 (0.167): 0.009*"cream" + 0.009*"skin" + 0.005*"night" + 0.004*"product" + 0.004*"use" + 0.004*"using" + 0.004*"ve" + 0.003*"love" + 0.003*"eye" + 0.003*"moisturizer"
2017-08-14 14:44:50,241 : INFO : topic #2 (0.167): 0.005*"use" +

2017-08-14 14:47:04,765 : INFO : topic #2 (0.167): 0.005*"eye" + 0.005*"use" + 0.005*"little" + 0.005*"eyes" + 0.004*"makeup" + 0.004*"long" + 0.004*"way" + 0.004*"goes" + 0.003*"product" + 0.002*"dark"
2017-08-14 14:47:04,814 : INFO : topic #3 (0.167): 0.009*"cream" + 0.008*"skin" + 0.005*"eye" + 0.005*"use" + 0.004*"night" + 0.004*"using" + 0.004*"product" + 0.004*"ve" + 0.003*"serum" + 0.003*"love"
2017-08-14 14:47:04,866 : INFO : topic #0 (0.167): 0.013*"skin" + 0.006*"like" + 0.005*"face" + 0.005*"dry" + 0.003*"love" + 0.003*"sample" + 0.003*"feels" + 0.003*"mask" + 0.003*"just" + 0.003*"use"
2017-08-14 14:47:04,966 : INFO : topic diff=0.198748, rho=0.277350
2017-08-14 14:47:05,035 : INFO : PROGRESS: pass 0, at document #28000/111200
2017-08-14 14:47:09,878 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 14:47:24,450 : INFO : topic #2 (0.167): 0.007*"eye" + 0.006*"eyes" + 0.005*"little" + 0.005*"use" + 0.005*"long" + 0.004*"makeup" + 0.004*

2017-08-14 14:49:04,127 : INFO : topic diff=0.194870, rho=0.229416
2017-08-14 14:49:27,799 : INFO : -14.704 per-word bound, 26689.3 perplexity estimate based on a held-out corpus of 2000 documents with 166860 words
2017-08-14 14:49:27,800 : INFO : PROGRESS: pass 0, at document #40000/111200
2017-08-14 14:49:32,285 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 14:49:48,016 : INFO : topic #0 (0.167): 0.011*"skin" + 0.007*"like" + 0.006*"face" + 0.004*"sample" + 0.004*"dry" + 0.004*"love" + 0.003*"mask" + 0.003*"feels" + 0.003*"smell" + 0.003*"feeling"
2017-08-14 14:49:48,056 : INFO : topic #4 (0.167): 0.031*"skin" + 0.007*"oily" + 0.007*"product" + 0.006*"dry" + 0.006*"acne" + 0.006*"love" + 0.005*"moisturizer" + 0.005*"use" + 0.004*"using" + 0.004*"sensitive"
2017-08-14 14:49:48,108 : INFO : topic #5 (0.167): 0.006*"face" + 0.004*"just" + 0.003*"use" + 0.003*"used" + 0.002*"skin" + 0.002*"products" + 0.002*"bottle" + 0.002*"product" + 0.002*"us

2017-08-14 14:52:00,047 : INFO : topic #5 (0.167): 0.005*"face" + 0.004*"just" + 0.003*"used" + 0.003*"use" + 0.002*"bottle" + 0.002*"products" + 0.002*"try" + 0.002*"product" + 0.002*"skin" + 0.002*"using"
2017-08-14 14:52:00,100 : INFO : topic #1 (0.167): 0.014*"product" + 0.005*"like" + 0.005*"really" + 0.005*"did" + 0.004*"good" + 0.004*"didn" + 0.004*"skin" + 0.003*"does" + 0.003*"don" + 0.003*"feel"
2017-08-14 14:52:00,143 : INFO : topic #4 (0.167): 0.032*"skin" + 0.007*"oily" + 0.007*"dry" + 0.006*"product" + 0.006*"moisturizer" + 0.006*"love" + 0.006*"oil" + 0.006*"acne" + 0.005*"use" + 0.004*"sensitive"
2017-08-14 14:52:00,195 : INFO : topic #2 (0.167): 0.006*"eyes" + 0.006*"eye" + 0.006*"little" + 0.005*"long" + 0.004*"way" + 0.004*"use" + 0.004*"makeup" + 0.004*"goes" + 0.004*"dark" + 0.003*"circles"
2017-08-14 14:52:00,299 : INFO : topic diff=0.190349, rho=0.196116
2017-08-14 14:52:00,372 : INFO : PROGRESS: pass 0, at document #54000/111200
2017-08-14 14:52:04,623 : INFO : 

2017-08-14 14:54:43,177 : INFO : topic #0 (0.167): 0.011*"skin" + 0.008*"mask" + 0.008*"like" + 0.007*"face" + 0.006*"sample" + 0.004*"love" + 0.004*"feels" + 0.003*"got" + 0.003*"dry" + 0.003*"size"
2017-08-14 14:54:43,277 : INFO : topic diff=0.184758, rho=0.176777
2017-08-14 14:54:43,426 : INFO : PROGRESS: pass 0, at document #66000/111200
2017-08-14 14:54:47,430 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 14:55:06,538 : INFO : topic #5 (0.167): 0.005*"face" + 0.004*"just" + 0.003*"used" + 0.003*"use" + 0.002*"try" + 0.002*"products" + 0.002*"cleanser" + 0.002*"bottle" + 0.002*"time" + 0.002*"using"
2017-08-14 14:55:06,597 : INFO : topic #1 (0.167): 0.013*"product" + 0.005*"did" + 0.005*"like" + 0.005*"really" + 0.005*"didn" + 0.004*"good" + 0.003*"don" + 0.003*"skin" + 0.003*"does" + 0.003*"feel"
2017-08-14 14:55:06,644 : INFO : topic #0 (0.167): 0.010*"skin" + 0.008*"like" + 0.007*"face" + 0.007*"mask" + 0.006*"sample" + 0.004*"love" + 0

2017-08-14 14:57:35,789 : INFO : topic #3 (0.167): 0.008*"cream" + 0.007*"skin" + 0.006*"using" + 0.006*"ve" + 0.005*"night" + 0.005*"use" + 0.004*"serum" + 0.004*"product" + 0.003*"eye" + 0.003*"love"
2017-08-14 14:57:35,853 : INFO : topic #5 (0.167): 0.005*"face" + 0.004*"just" + 0.003*"used" + 0.002*"use" + 0.002*"try" + 0.002*"products" + 0.002*"bottle" + 0.002*"sephora" + 0.002*"time" + 0.002*"ve"
2017-08-14 14:57:35,900 : INFO : topic #4 (0.167): 0.034*"skin" + 0.008*"dry" + 0.007*"oily" + 0.006*"love" + 0.006*"use" + 0.006*"oil" + 0.005*"product" + 0.005*"moisturizer" + 0.005*"acne" + 0.004*"sensitive"
2017-08-14 14:57:35,956 : INFO : topic #0 (0.167): 0.010*"skin" + 0.009*"mask" + 0.008*"face" + 0.008*"like" + 0.006*"sample" + 0.005*"love" + 0.004*"feels" + 0.004*"got" + 0.003*"size" + 0.003*"feeling"
2017-08-14 14:57:36,059 : INFO : topic diff=0.204790, rho=0.160128
2017-08-14 14:58:00,733 : INFO : -14.408 per-word bound, 21745.5 perplexity estimate based on a held-out corpus 

2017-08-14 15:00:24,430 : INFO : topic #5 (0.167): 0.005*"face" + 0.003*"just" + 0.003*"used" + 0.003*"try" + 0.002*"products" + 0.002*"use" + 0.002*"bottle" + 0.002*"sephora" + 0.002*"time" + 0.002*"ve"
2017-08-14 15:00:24,494 : INFO : topic #0 (0.167): 0.010*"skin" + 0.008*"face" + 0.008*"like" + 0.007*"mask" + 0.006*"sample" + 0.005*"love" + 0.004*"feels" + 0.004*"got" + 0.004*"size" + 0.003*"feeling"
2017-08-14 15:00:24,593 : INFO : topic diff=0.188499, rho=0.149071
2017-08-14 15:00:24,661 : INFO : PROGRESS: pass 0, at document #92000/111200
2017-08-14 15:00:28,364 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 15:00:48,388 : INFO : topic #4 (0.167): 0.035*"skin" + 0.008*"dry" + 0.008*"oily" + 0.006*"love" + 0.006*"use" + 0.006*"moisturizer" + 0.006*"product" + 0.005*"acne" + 0.005*"oil" + 0.004*"face"
2017-08-14 15:00:48,462 : INFO : topic #5 (0.167): 0.005*"face" + 0.003*"just" + 0.003*"used" + 0.003*"try" + 0.002*"products" + 0.002*"use"

2017-08-14 15:03:26,626 : INFO : PROGRESS: pass 0, at document #104000/111200
2017-08-14 15:03:30,769 : INFO : merging changes from 2000 documents into a model of 111200 documents
2017-08-14 15:03:53,046 : INFO : topic #1 (0.167): 0.014*"product" + 0.006*"like" + 0.006*"really" + 0.005*"did" + 0.005*"didn" + 0.004*"don" + 0.004*"good" + 0.003*"does" + 0.003*"just" + 0.003*"think"
2017-08-14 15:03:53,110 : INFO : topic #5 (0.167): 0.004*"face" + 0.003*"just" + 0.003*"used" + 0.003*"try" + 0.002*"products" + 0.002*"sephora" + 0.002*"use" + 0.002*"bottle" + 0.002*"time" + 0.002*"tried"
2017-08-14 15:03:53,159 : INFO : topic #2 (0.167): 0.008*"makeup" + 0.006*"little" + 0.006*"eye" + 0.005*"long" + 0.005*"eyes" + 0.005*"way" + 0.004*"goes" + 0.004*"use" + 0.003*"long way" + 0.003*"goes long"
2017-08-14 15:03:53,214 : INFO : topic #4 (0.167): 0.035*"skin" + 0.008*"dry" + 0.008*"oily" + 0.006*"love" + 0.006*"product" + 0.006*"use" + 0.006*"moisturizer" + 0.005*"oil" + 0.004*"acne" + 0.004*"f

2017-08-14 15:06:41,367 : INFO : topic #4 (0.167): 0.034*"skin" + 0.008*"dry" + 0.007*"oily" + 0.006*"product" + 0.006*"love" + 0.006*"moisturizer" + 0.006*"use" + 0.005*"sensitive" + 0.005*"face" + 0.004*"acne"
2017-08-14 15:06:41,428 : INFO : topic #3 (0.167): 0.010*"cream" + 0.006*"using" + 0.006*"skin" + 0.006*"ve" + 0.005*"night" + 0.005*"use" + 0.004*"product" + 0.004*"eye" + 0.003*"lines" + 0.003*"eye cream"
2017-08-14 15:06:41,481 : INFO : topic #2 (0.167): 0.008*"makeup" + 0.007*"eye" + 0.007*"eyes" + 0.006*"little" + 0.005*"long" + 0.004*"way" + 0.004*"goes" + 0.004*"use" + 0.003*"dark" + 0.003*"circles"
2017-08-14 15:06:41,554 : INFO : topic #0 (0.167): 0.009*"skin" + 0.009*"face" + 0.008*"like" + 0.006*"mask" + 0.005*"love" + 0.005*"sample" + 0.004*"feels" + 0.004*"feeling" + 0.004*"got" + 0.003*"smells"
2017-08-14 15:06:41,662 : INFO : topic diff=0.150624, rho=0.131762
2017-08-14 15:06:41,723 : INFO : PROGRESS: pass 1, at document #6000/111200
2017-08-14 15:06:45,509 : INF

KeyboardInterrupt: 

In [None]:
# Show the learned topics with their 20 most heavily weighted terms each.
# Needs `lda` from the training cell, so that cell must have run to completion.
n_terms_per_topic = 20
lda.print_topics(num_words=n_terms_per_topic)