In [29]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import StringType, DoubleType
from textblob import TextBlob

In [30]:
# Obtain the Spark session for this notebook (getOrCreate returns the
# already-running session if there is one, otherwise starts a new one).
spark = (
    SparkSession.builder
    .appName("SPARK TRAB - QUESTION 2 A")
    .getOrCreate()
)

In [31]:
# Load the raw Eiffel Tower reviews from the JSON file into a Spark DataFrame;
# the schema is inferred by Spark's JSON reader.
df_orig = spark.read.json("./data/eiffel-tower-reviews.json")

In [32]:
# Quick sanity check of the raw data: prints the first rows with long cell
# values truncated.
df_orig.show()

+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|                 _id|              author|bubbleCount|         collectedAt|   createdAt|       query|                text|               title|
+--------------------+--------------------+-----------+--------------------+------------+------------+--------------------+--------------------+
|{5921cdae4b679c46...|{0, 0, Since this...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|This is the most ...|Must do even it w...|
|{5921cdae4b679c46...|{10, 4, Since Aug...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|My significant ot...|           A Classic|
|{5921cdae4b679c46...|{9, 4, Since Nov ...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|We had a tour to ...|         Wet weather|
|{5921cdae4b679c46...|{16, 5, Since Apr...|         50|{2017-05-21T17:26...|May 20, 2017|Eiffel_Tower|Visited with my w...|    Not

In [33]:
# Keep only the review body and discard every row whose "text" is null
# (how="any" is the default, spelled out here for clarity).
text = df_orig.select(col("text")).dropna(how="any")

text.show()

+--------------------+
|                text|
+--------------------+
|This is the most ...|
|My significant ot...|
|We had a tour to ...|
|Visited with my w...|
|We went in the ni...|
|Dont hesitate and...|
|I enjoyed the tow...|
|Read through the ...|
|This by far was o...|
|Something you hav...|
|The views are bea...|
|Worth spending a ...|
|Took the tour to ...|
|A fantastic fusio...|
|Whatever you do i...|
|Not to miss..beau...|
|We visited in the...|
|Go for sunset and...|
|We booked weeks a...|
|Eiffel Tower is j...|
+--------------------+


In [34]:
# Normalise the review text before scoring:
#   1. lower-case it;
#   2. delete apostrophes so contractions stay a single token
#      ("don't" -> "dont"), exactly as the previous cleaning did;
#   3. turn every other run of non-letter, non-whitespace characters into ONE space.
# Step 3 used to delete those characters outright, which glued together words
# separated only by punctuation ("miss..beautiful" -> "missbeautiful"), so the
# scorer no longer saw them as separate words.
reviews_df = (
    text
    .withColumn("clean_text", lower(col("text")))
    .withColumn("clean_text", regexp_replace(col("clean_text"), "['\u2019]", ""))
    .withColumn("clean_text", regexp_replace(col("clean_text"), "[^a-zA-Z\\s]+", " "))
)

In [35]:
def analyze_sentiment(text):
    """Score the sentiment polarity of a piece of text with TextBlob.

    Args:
        text: The text to score, or ``None``. When this function is used as a
            Spark UDF, SQL NULL values arrive here as ``None``.

    Returns:
        ``TextBlob(text).sentiment.polarity`` (documented by TextBlob as a
        float in ``[-1.0, 1.0]``), or ``None`` when ``text`` is ``None`` so the
        resulting Spark column holds NULL instead of the job failing.
    """
    # TextBlob rejects non-string input with a TypeError, which would abort the
    # whole Spark job for a single NULL row; propagate the null instead.
    if text is None:
        return None

    blob = TextBlob(text)
    return blob.sentiment.polarity

In [36]:
# Expose the Python scorer to Spark as a UDF producing a double column.
sentiment_udf = udf(f=analyze_sentiment, returnType=DoubleType())

# Append a "sentiment" column holding the score of each cleaned review.
sentiment_distribution = reviews_df.withColumn(
    "sentiment",
    sentiment_udf(reviews_df["clean_text"]),
)

sentiment_distribution.show()

+--------------------+--------------------+-------------------+
|                text|          clean_text|          sentiment|
+--------------------+--------------------+-------------------+
|This is the most ...|this is the most ...|               0.55|
|My significant ot...|my significant ot...| 0.2649350649350649|
|We had a tour to ...|we had a tour to ...|-0.2810185185185185|
|Visited with my w...|visited with my w...|                0.0|
|We went in the ni...|we went in the ni...|0.44142857142857145|
|Dont hesitate and...|dont hesitate and...|0.24285714285714288|
|I enjoyed the tow...|i enjoyed the tow...| 0.5700000000000001|
|Read through the ...|read through the ...| 0.4666666666666666|
|This by far was o...|this by far was o...|0.47500000000000003|
|Something you hav...|something you hav...|               0.85|
|The views are bea...|the views are bea...|              0.375|
|Worth spending a ...|worth spending a ...|0.19999999999999998|
|Took the tour to ...|took the tour to .

In [37]:
# Collect the scored reviews into a local pandas DataFrame for plotting.
# NOTE(review): this pulls every column (text, clean_text, sentiment) and every
# row to the driver, although only "sentiment" is plotted in the next cell.
data = sentiment_distribution.toPandas()

In [38]:
# Dark theme applied globally to seaborn/matplotlib (set_theme is the current
# name of the sns.set alias used previously).
sns.set_theme(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "axes.edgecolor": "#21212a", "grid.color": "#21212a"})

# Histogram of the sentiment polarity with a KDE overlay.
plt.figure(figsize=(10, 6), facecolor='#202124')
histplt = sns.histplot(data=data, x='sentiment', bins=25, kde=True, color='#eb0229', edgecolor='white', linewidth=1.5)

# White labels and ticks so they stay readable on the dark background.
plt.xlabel('Polaridade de Sentimento', color='#ffffff')
plt.ylabel('Quantidade', color='#ffffff')
plt.title('Distribuição de Sentimento das Reviews', color='#ffffff')
plt.xticks(rotation=45, ha="right", color='#ffffff')
plt.yticks(color='#ffffff')

# savefig does not create missing directories and would raise
# FileNotFoundError on a fresh checkout, so make sure the target folder exists.
Path('./outcome/Q2').mkdir(parents=True, exist_ok=True)
plt.savefig('./outcome/Q2/sentiment_distribution.pdf', facecolor='#202124', bbox_inches='tight')
# Close the figure after saving to release its memory (it is not shown inline).
plt.close()