In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.sql.types import  *
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, QuantileDiscretizer 
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.clustering import LDA, LDAModel

In [3]:
UDF_docCleaner= udf(lambda doc: " ".join(re.sub('[^0-9a-zA-Z]+', ' ', doc).
                                             lower().
                                             split()
                                        ), StringType() 
                   )

In [4]:
corpus = spark.read.\
    option("sep", ",").\
    option("header", "true").\
    option("inferSchema", True).\
    csv("gs://manualrg-formacion/hot_rev/data/Hotels_Reviews.csv").\
    withColumn('doc', UDF_docCleaner('review')).persist()

In [5]:
#Feature Engineering
regexTokenizer = RegexTokenizer().setInputCol("doc").setOutputCol("tokens").\
    setPattern("\\s+").\
    setMinTokenLength(2)
remover = StopWordsRemover().setInputCol("tokens").setOutputCol("tokens_rm").\
    setCaseSensitive(False)
TF = CountVectorizer().setInputCol("tokens_rm").setOutputCol("rawFeatures").\
    setMinTF(10).\
    setVocabSize(1000)
idf = IDF().setInputCol("rawFeatures").setOutputCol("features")

stages = [regexTokenizer, remover, TF, idf]
feat_eng_pl = Pipeline().setStages(stages).fit(corpus)
docTerm_df = feat_eng_pl.transform(corpus)
corpus.unpersist()
docTerm_df.persist()

DataFrame[Hotel_Address: string, Additional_Number_of_Scoring: int, Review_Date: string, Average_Score: double, Hotel_Name: string, Reviewer_Nationality: string, Negative_Review: string, Review_Total_Negative_Word_Count: int, Total_Number_of_Reviews: int, Positive_Review: string, Review_Total_Positive_Word_Count: int, Total_Number_of_Reviews_Reviewer: int, Reviewer_Score: int, Tags: string, days_since_review_old: string, lat: string, lng: string, id: int, idhotel: int, hotel_country: string, review: string, label: int, flg_same_country: int, days_since_review: int, doc: string, tokens: array<string>, tokens_rm: array<string>, rawFeatures: vector, features: vector]

In [58]:
TFModel = feat_eng_pl.stages[2]
vocabulary = map(lambda term: (term.encode("utf-8")), TFModel.vocabulary)

print("Vocabulary size: ", len(vocabulary))
print("Vocabulary: ", vocabulary[:10])

('Vocabulary size: ', 1000)
('Vocabulary: ', ['room', 'staff', 'location', 'hotel', 'breakfast', 'good', 'negative', 'great', 'friendly', 'helpful'])


In [65]:

voc_rdd = sc.parallelize(vocabulary)
voc_rdd.take(5)

['room', 'staff', 'location', 'hotel', 'breakfast']

In [12]:
lda = LDA().setK(3).setMaxIter(30).setFeaturesCol("features").setTopicDistributionCol("topicDistribution")
ldaModel = lda.fit(docTerm_df) 
topicsTop_df = ldaModel.describeTopics(3)

topicsTop_df.show()

+-----+-------------+--------------------+
|topic|  termIndices|         termWeights|
+-----+-------------+--------------------+
|    0| [4, 434, 23]|[0.14009192479538...|
|    1|[101, 5, 196]|[0.10073106357052...|
|    2|    [0, 3, 5]|[0.66271101588303...|
+-----+-------------+--------------------+



**Label topics with the most relevant words**

In [83]:
topicTokens = topicsTop_df.select(col("topic"), posexplode(col("termIndices")), col("termWeights")).\
  withColumnRenamed("col","tokenIdx").\
  rdd.map(lambda row: (row['topic'], row['pos'], row['tokenIdx'], vocabulary[row['tokenIdx']], row['termWeights'][row['pos']]) )
  #withColumn("vocabulary",lit(vocabulary))#.select(col("topic"),col("pos"),expr("vocabulary[tokenIdx] as token"))

  
  
topicTokens.take(5)

[(0, 0, 4, 'breakfast', 0.14009192479538307),
 (0, 1, 434, 'call', 0.12440970784770464),
 (0, 2, 23, 'us', 0.0889051318310744),
 (1, 0, 101, 'booking', 0.10073106357052276),
 (1, 1, 5, 'good', 0.09415103514559006)]

In [88]:
topicSchema= StructType().\
  add(StructField("topicLabel", IntegerType(), True)).\
  add(StructField("tokenPos", IntegerType(), True)).\
  add(StructField("tokenIdx", IntegerType(), True)).\
  add(StructField("token", StringType(), True)).\
  add(StructField("tokenWeight", DoubleType(), True))

In [89]:
topicReport =  topicTokens.toDF(topicSchema)
print("Topic labelling: ")
topicReport.show()

Topic labelling: 
+----------+--------+--------+---------+--------------------+
|topicLabel|tokenPos|tokenIdx|    token|         tokenWeight|
+----------+--------+--------+---------+--------------------+
|         0|       0|       4|breakfast| 0.14009192479538307|
|         0|       1|     434|     call| 0.12440970784770464|
|         0|       2|      23|       us|  0.0889051318310744|
|         1|       0|     101|  booking| 0.10073106357052276|
|         1|       1|       5|     good| 0.09415103514559006|
|         1|       2|     196|  bedroom| 0.08415154954214422|
|         2|       0|       0|     room|  0.6627110158830316|
|         2|       1|       3|    hotel| 0.27066285448844307|
|         2|       2|       5|     good|0.002038450048880536|
+----------+--------+--------+---------+--------------------+

