In [42]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col, udf, regexp_replace, explode
from nltk.stem import WordNetLemmatizer

import seaborn as sns
import matplotlib.pyplot as plt

In [43]:
spark = SparkSession.builder \
    .appName("SPARK TRAB - QUESTION 2 A") \
    .getOrCreate()

In [44]:
df_orig = spark.read.json("./data/eiffel-tower-reviews.json")

In [45]:
df_orig.show(5)

+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|                 _id|              author|bubbleCount|         collectedAt|   createdAt|       query|                text|               title|
+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|{5921cdae4b679c46...|{0, 0, Since this...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|This is the most ...|Must do even it w...|
|{5921cdae4b679c46...|{10, 4, Since Aug...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|My significant ot...|           A Classic|
|{5921cdae4b679c46...|{9, 4, Since Nov ...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|We had a tour to ...|         Wet weather|
|{5921cdae4b679c46...|{16, 5, Since Apr...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|Visited with my w...|    Not

In [46]:
df_orig.count()

7179

In [47]:
df_orig.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- helpfulVotes: string (nullable = true)
 |    |-- level: string (nullable = true)
 |    |-- memberSince: string (nullable = true)
 |    |-- postForum: string (nullable = true)
 |    |-- ratings: string (nullable = true)
 |    |-- reviews: string (nullable = true)
 |-- bubbleCount: long (nullable = true)
 |-- collectedAt: struct (nullable = true)
 |    |-- $date: string (nullable = true)
 |-- createdAt: string (nullable = true)
 |-- query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)


In [48]:
text = df_orig.select("text").dropna()

text.show()

+--------------------+
|                text|
+--------------------+
|This is the most ...|
|My significant ot...|
|We had a tour to ...|
|Visited with my w...|
|We went in the ni...|
|Dont hesitate and...|
|I enjoyed the tow...|
|Read through the ...|
|This by far was o...|
|Something you hav...|
|The views are bea...|
|Worth spending a ...|
|Took the tour to ...|
|A fantastic fusio...|
|Whatever you do i...|
|Not to miss..beau...|
|We visited in the...|
|Go for sunset and...|
|We booked weeks a...|
|Eiffel Tower is j...|
+--------------------+


In [49]:
text = text.withColumn("text_clean", regexp_replace(col("text"), "[^\w\s]", ""))

In [50]:
tokenizer = Tokenizer(inputCol="text_clean", outputCol="words")
text_tokens = tokenizer.transform(text)

text_tokens.show()

+--------------------+--------------------+--------------------+
|                text|          text_clean|               words|
+--------------------+--------------------+--------------------+
|This is the most ...|This is the most ...|[this, is, the, m...|
|My significant ot...|My significant ot...|[my, significant,...|
|We had a tour to ...|We had a tour to ...|[we, had, a, tour...|
|Visited with my w...|Visited with my w...|[visited, with, m...|
|We went in the ni...|We went in the ni...|[we, went, in, th...|
|Dont hesitate and...|Dont hesitate and...|[dont, hesitate, ...|
|I enjoyed the tow...|I enjoyed the tow...|[i, enjoyed, the,...|
|Read through the ...|Read through the ...|[read, through, t...|
|This by far was o...|This by far was o...|[this, by, far, w...|
|Something you hav...|Something you hav...|[something, you, ...|
|The views are bea...|The views are bea...|[the, views, are,...|
|Worth spending a ...|Worth spending a ...|[worth, spending,...|
|Took the tour to ...|Too

In [51]:
remover = StopWordsRemover(inputCol="words", outputCol="words_no_stop")
text_tokens_clean = remover.transform(text_tokens)

text_tokens_clean.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [52]:
def remove_empty_tokens(tokens):
    return [token for token in tokens if len(token) >= 1]

In [53]:
remove_empty_tokens_udf = udf(remove_empty_tokens, ArrayType(StringType()))

text_tokens_clean = text_tokens_clean.withColumn("words_clean", remove_empty_tokens_udf(col("words_no_stop")))

text_tokens_clean.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|          text_clean|               words|       words_no_stop|         words_clean|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is the most ...|This is the most ...|[this, is, the, m...|[busiest, atttact...|[busiest, atttact...|
|My significant ot...|My significant ot...|[my, significant,...|[significant, dru...|[significant, dru...|
|We had a tour to ...|We had a tour to ...|[we, had, a, tour...|[tour, eiffel, to...|[tour, eiffel, to...|
|Visited with my w...|Visited with my w...|[visited, with, m...|[visited, wife, son]|[visited, wife, son]|
|We went in the ni...|We went in the ni...|[we, went, in, th...|[went, night, 10p...|[went, night, 10p...|
|Dont hesitate and...|Dont hesitate and...|[dont, hesitate, ...|[dont, hesitate, ...|[dont, hesitate, ...|
|I enjoyed the tow...|I enjoyed the t

In [54]:
lemmatizer = WordNetLemmatizer()

def lemmatize(words):
    return [str(lemmatizer.lemmatize(word)) for word in words]

lemmatize_udf = udf(lemmatize, ArrayType(StringType(), True))

tokens_lemma = text_tokens_clean.withColumn("words_lemmatized", lemmatize_udf(col("words_clean")))
tokens_lemma.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [55]:
tokens_lemma.schema["words_lemmatized"].dataType

ArrayType(StringType(), True)

In [56]:
result = tokens_lemma.select("words_lemmatized").withColumn("word", explode("words_lemmatized")).select("word")                                                  
result.show()

+-----------+
|       word|
+-----------+
|    busiest|
| atttaction|
|      paris|
|       nice|
| restaurant|
|       view|
|spectacular|
|        top|
|       must|
|      paris|
|significant|
|  drunkenly|
|   stumbled|
|       week|
|       back|
|     cliche|
|       type|
| experience|
|  certainly|
| absolutely|
+-----------+


In [57]:
result_count = result.groupBy("word").count().orderBy(col("count").desc()).limit(25)

In [58]:
result_pd = result_count.toPandas()

result_pd

Unnamed: 0,word,count
0,tower,5929
1,paris,3421
2,eiffel,3386
3,view,3374
4,top,2830
5,go,2794
6,time,2427
7,see,2204
8,ticket,2090
9,get,1929


In [59]:
sns.set(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "axes.edgecolor": "#21212a", "grid.color": "#21212a"}) 

plt.figure(figsize=(10, 6), facecolor='#202124') 
barplot = sns.barplot(x='word', y='count', data=result_pd, hue='word', legend=False, palette='viridis', edgecolor = "#21212a", color = "white")

for p in barplot.patches:
    value = str(int(p.get_height()))
    barplot.annotate(value,
                     (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='center',
                     xytext=(0, 10), textcoords='offset points', fontsize=8, rotation=30, color='#ffffff')

plt.xlabel('Palavra', color='#ffffff')  
plt.ylabel('Quantidade Observada', color='#ffffff')  
plt.title('Top 25 Palavras mais Utilizadas nos Reviews Torre Eiffel', color='#ffffff') 
plt.xticks(rotation=45,ha = "right", color='#ffffff')
plt.yticks(color='#ffffff') 

plt.savefig('./outcome/Q2/most_used_words.pdf', facecolor='#202124', bbox_inches='tight')
plt.close()

In [60]:
spark.stop()