In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col, udf, regexp_replace, explode
from nltk.stem import WordNetLemmatizer

import nltk

In [16]:
# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SPARK TRAB - QUESTION 2 A")
    .getOrCreate()
)

In [17]:
# Load the Eiffel Tower reviews JSON file into a Spark DataFrame.
# The path is relative ("./data/..."), so it resolves against the current
# working directory.
df_orig = spark.read.json("./data/eiffel-tower-reviews.json")

In [18]:
# Preview the raw reviews; the `text` column shown here is the only one the
# rest of the pipeline works on.
df_orig.show()

+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|                 _id|              author|bubbleCount|         collectedAt|   createdAt|       query|                text|               title|
+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|{5921cdae4b679c46...|{0, 0, Since this...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|This is the most ...|Must do even it w...|
|{5921cdae4b679c46...|{10, 4, Since Aug...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|My significant ot...|           A Classic|
|{5921cdae4b679c46...|{9, 4, Since Nov ...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|We had a tour to ...|         Wet weather|
|{5921cdae4b679c46...|{16, 5, Since Apr...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|Visited with my w...|    Not

In [19]:
# Keep only the review body and drop rows where it is null.
text = df_orig.select("text").dropna()

# Preview the selected column.
text.show()

+--------------------+
|                text|
+--------------------+
|This is the most ...|
|My significant ot...|
|We had a tour to ...|
|Visited with my w...|
|We went in the ni...|
|Dont hesitate and...|
|I enjoyed the tow...|
|Read through the ...|
|This by far was o...|
|Something you hav...|
|The views are bea...|
|Worth spending a ...|
|Took the tour to ...|
|A fantastic fusio...|
|Whatever you do i...|
|Not to miss..beau...|
|We visited in the...|
|Go for sunset and...|
|We booked weeks a...|
|Eiffel Tower is j...|
+--------------------+


In [20]:
# Strip punctuation: delete every character that is neither a word character
# nor whitespace. The pattern is written as a raw string so the backslashes
# reach the regex engine untouched instead of being parsed by Python as
# (invalid) string escape sequences, which emits a warning on modern Python.
# NOTE(review): punctuation is deleted, not replaced by a space, so words that
# are separated only by punctuation (e.g. "miss..beautiful") end up fused into
# a single token — confirm this is intended.
text = text.withColumn("text_clean", regexp_replace(col("text"), r"[^\w\s]", ""))

In [21]:
# Split the punctuation-free text into a `words` array column. The output
# below shows the tokens lower-cased (e.g. "This" -> "this").
tokenizer = Tokenizer(inputCol="text_clean", outputCol="words")
text_tokens = tokenizer.transform(text)

# Preview the original text, the cleaned text and the resulting tokens.
text_tokens.show()

+--------------------+--------------------+--------------------+
|                text|          text_clean|               words|
+--------------------+--------------------+--------------------+
|This is the most ...|This is the most ...|[this, is, the, m...|
|My significant ot...|My significant ot...|[my, significant,...|
|We had a tour to ...|We had a tour to ...|[we, had, a, tour...|
|Visited with my w...|Visited with my w...|[visited, with, m...|
|We went in the ni...|We went in the ni...|[we, went, in, th...|
|Dont hesitate and...|Dont hesitate and...|[dont, hesitate, ...|
|I enjoyed the tow...|I enjoyed the tow...|[i, enjoyed, the,...|
|Read through the ...|Read through the ...|[read, through, t...|
|This by far was o...|This by far was o...|[this, by, far, w...|
|Something you hav...|Something you hav...|[something, you, ...|
|The views are bea...|The views are bea...|[the, views, are,...|
|Worth spending a ...|Worth spending a ...|[worth, spending,...|
|Took the tour to ...|Too

In [22]:
# Filter stop words out of `words` into a new `words_no_stop` column.
# No stop-word list is passed, so the transformer falls back to its default.
remover = StopWordsRemover(inputCol="words", outputCol="words_no_stop")
text_tokens_clean = remover.transform(text_tokens)

# truncate=False prints full cell contents, which makes the table very wide.
text_tokens_clean.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
def remove_empty_tokens(tokens):
    """Return a new list holding only the tokens whose length is at least one.

    The relative order of the surviving tokens is preserved and the input
    sequence is not modified.
    """
    kept = []
    for token in tokens:
        if len(token) >= 1:
            kept.append(token)
    return kept

In [24]:
# Wrap the Python helper as a Spark UDF that returns an array of strings.
remove_empty_tokens_udf = udf(
    remove_empty_tokens,
    returnType=ArrayType(StringType()),
)

# Add `words_clean`: the stop-word-free tokens with empty strings filtered out.
text_tokens_clean = text_tokens_clean.withColumn(
    "words_clean",
    remove_empty_tokens_udf(col("words_no_stop")),
)

text_tokens_clean.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|          text_clean|               words|       words_no_stop|         words_clean|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is the most ...|This is the most ...|[this, is, the, m...|[busiest, atttact...|[busiest, atttact...|
|My significant ot...|My significant ot...|[my, significant,...|[significant, dru...|[significant, dru...|
|We had a tour to ...|We had a tour to ...|[we, had, a, tour...|[tour, eiffel, to...|[tour, eiffel, to...|
|Visited with my w...|Visited with my w...|[visited, with, m...|[visited, wife, son]|[visited, wife, son]|
|We went in the ni...|We went in the ni...|[we, went, in, th...|[went, night, 10p...|[went, night, 10p...|
|Dont hesitate and...|Dont hesitate and...|[dont, hesitate, ...|[dont, hesitate, ...|[dont, hesitate, ...|
|I enjoyed the tow...|I enjoyed the t

In [25]:
# Single lemmatizer instance shared by the `lemmatize` helper defined below.
# NOTE(review): `nltk` is imported at the top of the notebook but no
# `nltk.download(...)` call is visible; presumably the WordNet corpus is
# already installed in this environment — confirm before running on a fresh
# machine.
lemmatizer = WordNetLemmatizer()

def lemmatize(words):
    """Lemmatize each word with the module-level `lemmatizer`.

    Returns a new list with one `str` per input word, in input order.
    """
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        lemmas.append(str(lemma))
    return lemmas

# Expose `lemmatize` to Spark as a UDF producing an array of strings
# (second ArrayType argument True: elements may be null).
lemmatize_udf = udf(
    lemmatize,
    returnType=ArrayType(StringType(), True),
)

# Add `words_lemmatized`, computed from the cleaned tokens.
tokens_lemma = text_tokens_clean.withColumn(
    "words_lemmatized",
    lemmatize_udf(col("words_clean")),
)
tokens_lemma.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
from functools import reduce

def generate_ngrams(wordsData, ngram_range, input_col="words_lemmatized", output_col="expressions"):
    """Build n-grams for every size in ``ngram_range`` and stack the results.

    For each ``n`` from ``ngram_range[0]`` to ``ngram_range[1]`` (inclusive)
    an ``NGram`` transformer is applied to ``wordsData``; the per-``n``
    DataFrames are then combined with ``union``, so the result contains one
    copy of every input row per n-gram size.

    Args:
        wordsData: DataFrame holding the token-array column ``input_col``.
        ngram_range: Sequence whose first two items are the smallest and
            largest n-gram size, both inclusive.
        input_col: Name of the token-array column to read.
        output_col: Name of the n-gram array column to create.

    Returns:
        The union of the transformed DataFrames, one per n-gram size.

    Raises:
        ValueError: If the smallest size is below 1 or the range is empty.
    """
    low, high = ngram_range[0], ngram_range[1]
    # Fail early with a clear message: an empty range would otherwise reach
    # reduce() with no frames and raise an opaque TypeError.
    if low < 1 or high < low:
        raise ValueError(
            "ngram_range must satisfy 1 <= min <= max, got ({}, {})".format(low, high)
        )

    frames = [
        NGram(n=n, inputCol=input_col, outputCol=output_col).transform(wordsData)
        for n in range(low, high + 1)
    ]
    return reduce(lambda df1, df2: df1.union(df2), frames)

In [27]:
# Generate bigrams and trigrams (n-gram sizes 2 through 3, inclusive).
ngram_range = (2, 3)
# NOTE(review): this rebinds `tokens_lemma` to the n-gram result, so running
# the cell a second time feeds the already-expanded frame back into
# `generate_ngrams`. Re-run the lemmatization cell first if this cell has to
# be executed again.
tokens_lemma = generate_ngrams(tokens_lemma, ngram_range)

tokens_lemma.show() # the n-grams are in the new `expressions` column

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|          text_clean|               words|       words_no_stop|         words_clean|    words_lemmatized|         expressions|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is the most ...|This is the most ...|[this, is, the, m...|[busiest, atttact...|[busiest, atttact...|[busiest, atttact...|[busiest atttacti...|
|My significant ot...|My significant ot...|[my, significant,...|[significant, dru...|[significant, dru...|[significant, dru...|[significant drun...|
|We had a tour to ...|We had a tour to ...|[we, had, a, tour...|[tour, eiffel, to...|[tour, eiffel, to...|[tour, eiffel, to...|[tour eiffel, eif...|
|Visited with my w...|Visited with my w...|[visited, with, m...|[visited, wife, son]|[visited, wife, son]|

In [28]:
# Flatten the per-row n-gram arrays into one `expression` value per row.
result = (
    tokens_lemma
    .select("expressions")
    .withColumn("expression", explode("expressions"))
    .select("expression")
)
result.show()

+--------------------+
|          expression|
+--------------------+
|  busiest atttaction|
|    atttaction paris|
|          paris nice|
|     nice restaurant|
|     restaurant view|
|    view spectacular|
|     spectacular top|
|            top must|
|          must paris|
|significant drunk...|
|  drunkenly stumbled|
|       stumbled week|
|           week back|
|         back cliche|
|         cliche type|
|     type experience|
|experience certainly|
|certainly absolutely|
|absolutely incred...|
|   incredible popped|
+--------------------+


In [30]:
# Count how often each n-gram occurs and print the most frequent ones first.
(
    result
    .groupBy("expression")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+-------------+-----+
|   expression|count|
+-------------+-----+
| eiffel tower| 2930|
|     must see|  436|
|       go top|  392|
|   view paris|  361|
| second floor|  329|
|     view top|  305|
|   buy ticket|  296|
| second level|  283|
|   great view|  267|
|  visit paris|  265|
|   well worth|  259|
|    skip line|  245|
|    day night|  223|
|    long line|  217|
|    make sure|  214|
| visit eiffel|  214|
|   light show|  210|
|ticket online|  208|
|    view city|  207|
|   see eiffel|  200|
+-------------+-----+
