# Amazon reviews LDA topic modeling

## The notebook:
- Set Spark session
- Working with Nulls
- Aggregates and Pivot

### Create Spark Session

In [1]:
from pyspark.sql import functions as F
try:
    spark.stop()
except:
    pass
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Amazon_review_LDA").enableHiveSupport().getOrCreate()

In [2]:
spark

## Import ML libraries

In [16]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import CountVectorizer, IDF,RegexTokenizer, Tokenizer
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import struct
import re
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer

### Load Retail data set

In [3]:
df = spark.read.format("parquet")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("/home/jupyter/data/amazon/amazon_review_sample/*") 
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- product_category: string (nullable = true)



In [4]:
df.show(5)

+-----------+-----------+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+----+--------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|year|    product_category|
+-----------+-----------+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+----+--------------------+
|         US|   38691673|R3I7ZT0Q6PGMGI|B00F64A1YK|     725955684|The Last Hour of ...|          5|            1|          1|   N|                Y|   Her best book yet|I have read all o...|      16134|2014|Digital_Ebook_Pur...|
|         US|   51037852|R3H8UHZ6515551|B00GIZPLFK|     151617005|Kindle Fire HDX M.

## Create new DF with only narrative and unique ID

In [7]:
#from pyspark.sql.functions import monotonically_increasing_id, concat

df1 = df.withColumn('review_text', 
                    F.concat(F.col('review_headline'),F.lit(' '), F.col('review_body')))
corpus =df1.select('review_text')

# This will return a new DF with all the columns + id
corpus_df = corpus.withColumn("id", F.monotonically_increasing_id())

In [12]:
corpus_df.show(5)

+--------------------+---+
|         review_text| id|
+--------------------+---+
|Her best book yet...|  0|
|This book ... Doe...|  1|
|An interesting li...|  2|
|I loved it! I lov...|  3|
|Truly epic I had ...|  4|
+--------------------+---+
only showing top 5 rows



In [13]:
corpus_df.printSchema()

root
 |-- review_text: string (nullable = true)
 |-- id: long (nullable = false)



In [53]:
# Remove records with no review text
corpus_df = corpus_df.dropna()

## Tokenize Narrative text 

In [54]:
tokenizer = Tokenizer(inputCol="review_text", outputCol="words")
countTokens = udf(lambda words: len(words), IntegerType())
'''
tokenized_df = tokenizer.transform(corpus_df)
tokenized_df.select("review_text", "words").withColumn("tokens", countTokens(col("words"))).show() 
'''
regexTokenizer = RegexTokenizer(inputCol="review_text", 
                                outputCol="words",pattern="\\w+", gaps=False)
# alternatively, pattern="\\w+", gaps(False) pattern="\\W"

tokenized_df = regexTokenizer.transform(corpus_df)
tokenized_df.select("review_text", "words") \
    .withColumn("tokens", countTokens(F.col("words"))).show()

+--------------------+--------------------+------+
|         review_text|               words|tokens|
+--------------------+--------------------+------+
|Her best book yet...|[her, best, book,...|   111|
|This book ... Doe...|[this, book, does...|    28|
|An interesting li...|[an, interesting,...|    60|
|I loved it! I lov...|[i, loved, it, i,...|    82|
|Truly epic I had ...|[truly, epic, i, ...|   156|
|Wonderful! I love...|[wonderful, i, lo...|    57|
|Where's Harry? As...|[where, s, harry,...|   153|
|More than getting...|[more, than, gett...|   292|
|Love it The best ...|[love, it, the, b...|    20|
|This story will k...|[this, story, wil...|   241|
|Attention grabbin...|[attention, grabb...|   108|
|Memoir Revolution...|[memoir, revoluti...|   203|
|america's need th...|[america, s, need...|    48|
|5 paws! Such a gr...|[5, paws, such, a...|    58|
|Read about this b...|[read, about, thi...|    62|
|Hot unbelievable ...|[hot, unbelievabl...|    69|
|sooo gooooood Won...|[sooo, go

## Get stop words list

In [55]:
#Downloade stop words and store in the file /tmp/stopwords
#!wget http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words -O /tmp/stopwords
import requests
url = 'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
myfile = requests.get(url)    
stop_words = myfile.content.decode("utf-8").split("\r\n")
print('Downloaded first 10 stop words:',stop_words[0:10])

Downloaded first 10 stop words: ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost']


In [56]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
tokenized_df1 = remover.transform(tokenized_df)
tokenized_df1.show(5)

stopwordList = stop_words

remover=StopWordsRemover(inputCol="filtered", outputCol="filtered_more" ,stopWords=stopwordList)
tokenized_df2 = remover.transform(tokenized_df1)
tokenized_df2.show(5)

+--------------------+---+--------------------+--------------------+
|         review_text| id|               words|            filtered|
+--------------------+---+--------------------+--------------------+
|Her best book yet...|  0|[her, best, book,...|[best, book, yet,...|
|This book ... Doe...|  1|[this, book, does...|[book, tell, use,...|
|An interesting li...|  2|[an, interesting,...|[interesting, lit...|
|I loved it! I lov...|  3|[i, loved, it, i,...|[loved, loved, bo...|
|Truly epic I had ...|  4|[truly, epic, i, ...|[truly, epic, abs...|
+--------------------+---+--------------------+--------------------+
only showing top 5 rows

+--------------------+---+--------------------+--------------------+--------------------+
|         review_text| id|               words|            filtered|       filtered_more|
+--------------------+---+--------------------+--------------------+--------------------+
|Her best book yet...|  0|[her, best, book,...|[best, book, yet,...|[best, book, rea

## Vectorize (convert to numeric)

In [57]:
# Term Frequency Vectorization  - Option 2 (CountVectorizer)    : 
cv = CountVectorizer(inputCol="filtered_more", outputCol="features", vocabSize = 10000)
cvmodel = cv.fit(tokenized_df2)
featurized_df = cvmodel.transform(tokenized_df2)
vocab = cvmodel.vocabulary
featurized_df.select('filtered_more','features','id').show(5)

+--------------------+--------------------+---+
|       filtered_more|            features| id|
+--------------------+--------------------+---+
|[best, book, read...|(10000,[0,2,3,4,7...|  0|
|[book, tell, use,...|(10000,[0,67,69,8...|  1|
|[interesting, lit...|(10000,[4,14,21,2...|  2|
|[loved, loved, bo...|(10000,[0,3,4,17,...|  3|
|[truly, epic, abs...|(10000,[0,2,4,5,7...|  4|
+--------------------+--------------------+---+
only showing top 5 rows



## This is our DF to train LDA model on

In [58]:
countVectors = featurized_df.select('features','id')

## Train LDA model

In [59]:
#k=10 means 10 words per topic
lda = LDA(k=10, maxIter=10)
model = lda.fit(countVectors)

ll = model.logLikelihood(countVectors)
lp = model.logPerplexity(countVectors)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(countVectors)
transformed.show(truncate=False)

The lower bound on the log likelihood of the entire corpus: -9415097.165094886
The upper bound on perplexity: 7.79356241346436
The topics described by their top-weighted terms:
+-----+-----------+--------------------------------------------------------------------+
|topic|termIndices|termWeights                                                         |
+-----+-----------+--------------------------------------------------------------------+
|0    |[0, 1, 2]  |[0.008921981599321753, 0.004440095622265553, 0.004438178251706451]  |
|1    |[0, 1, 3]  |[0.018581561572791398, 0.010367201954230836, 0.005343683547872928]  |
|2    |[0, 139, 1]|[0.008450774123103138, 0.004781175204810333, 0.0036205423242618344] |
|3    |[0, 3, 9]  |[0.061831452792659974, 0.035647328442054066, 0.03273994431974831]   |
|4    |[2, 9, 0]  |[0.005858560244026486, 0.004829357566601088, 0.0034631763897836165] |
|5    |[9, 0, 3]  |[0.0036454282707969795, 0.0032155782587173234, 0.002606297779518974]|
|6    |[1, 0, 2]  |[0.

## Display words for top 10 topics

In [60]:
topics = model.describeTopics()   
topics_rdd = topics.rdd

topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print ("topic: ", idx)
    print ("----------")
    for word in topic:
       print (word)
    print ("----------")

topic:  0
----------
book
br
read
p
good
reading
story
history
great
war
----------
topic:  1
----------
book
br
great
read
like
la
author
que
34
y
----------
topic:  2
----------
book
recipes
br
stars
diet
weight
read
34
low
really
----------
topic:  3
----------
book
great
stars
read
love
good
books
loved
series
recommend
----------
topic:  4
----------
read
stars
book
great
love
emmet
books
really
jeremey
story
----------
topic:  5
----------
stars
book
great
love
story
read
stolen
34
small
like
----------
topic:  6
----------
br
book
read
story
34
good
love
like
great
really
----------
topic:  7
----------
book
br
read
great
story
like
interesting
love
time
really
----------
topic:  8
----------
book
read
good
test
br
great
questions
like
time
stars
----------
topic:  9
----------
read
book
34
story
br
good
shakespeare
characters
f
lizzy
----------
