In [1]:
import os
import re

import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, substring, hour, when, udf, size, col, explode
from pyspark.sql.types import ArrayType, StringType

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("SPARK TRAB - QUESTION 1 A")
    .getOrCreate()
)

In [3]:
# Load the raw tab-separated tweet dump; the file has no header row, so Spark
# names the columns _c0, _c1, ...
df_orig = (
    spark.read
    .options(header="false", delimiter="\t")
    .csv("./data/debate-tweets.tsv")
)

## Objetivo
O objetivo deste notebook é, a partir dos dados de tweets brasileiros de 2014 durante o período de eleições, responder à seguinte pergunta:

"Quais foram as hashtags mais usadas pela manhã, tarde e noite?"

In [4]:
# Only two columns are strictly required to answer the question: the time of
# the tweet and its text (_c7 and _c1, respectively).
# _c0 is kept as well because it is an ID column.
df = df_orig.select(
    col("_c0").alias("id"),
    col("_c1").alias("content"),
    col("_c7").alias("day_and_hour_string"),
)

In [5]:
# The "day_and_hour_string" column has the following format:
#   Mon May 09 00:12:02 +0000 2011
# I could not convert it directly to a timestamp, so the time-of-day part
# (8 characters starting at position 12, i.e. "00:12:02") is cut out with
# substring and converted afterwards.
# NOTE(review): the offset field ("+0000" in the sample) is not used, so the
# hour is taken exactly as written in the string, with no time-zone
# conversion -- confirm this is the intended clock for the analysis.
time_of_day = substring(col("day_and_hour_string"), 12, 8)

df = (
    df
    .withColumn("hour_of_day_string", time_of_day)
    .withColumn("timestamp_col", to_timestamp(col("hour_of_day_string"), "HH:mm:ss"))
)

In [6]:
# Keep only the hour (0-23) of each tweet and drop the helper columns.
df = (
    df
    .withColumn("hour_of_day", hour(col("timestamp_col")))
    .select("id", "content", "hour_of_day")
)

# Classify each tweet into one of three periods of the day:
#   morning   -> 05:00 to 11:59
#   afternoon -> 12:00 to 17:59
#   night     -> everything else
# NOTE(review): a null hour_of_day matches neither range, so such rows would
# also be labelled "night" -- confirm this is acceptable.
tweet_hour = col("hour_of_day")
period_label = (
    when((tweet_hour >= 5) & (tweet_hour < 12), "morning")
    .when((tweet_hour >= 12) & (tweet_hour < 18), "afternoon")
    .otherwise("night")
)
df = df.withColumn("period", period_label)

In [7]:
# Simple helper that extracts hashtags using a regular expression.

def extract_hashtags(text):
    """Return the hashtags found in ``text``, without the leading ``#``.

    Args:
        text: Tweet text, or None. A null column value reaches a Python UDF
            as None, and ``re.findall`` raises ``TypeError`` on None, so that
            case is handled explicitly.

    Returns:
        list[str]: Every run of word characters that directly follows a
        ``#``, in order of appearance. Empty when ``text`` is None or
        contains no hashtag.
    """
    if text is None:
        return []
    return re.findall(r'#(\w+)', text)

In [8]:
# Wrap the plain Python function as a Spark UDF returning an array of strings,
# so it can be applied to a DataFrame column.
extract_hashtags_udf = udf(extract_hashtags, ArrayType(StringType()))

# Attach the list of hashtags found in each tweet's text.
df_hashtags = df.withColumn("hashtags", extract_hashtags_udf(col("content")))

# Usage count per (period, hashtag), most used first.
# The result is cached because several actions reuse it (the show() below and
# the per-period extractions later in the notebook); without the cache every
# action would re-run the whole pipeline, including the Python UDF.
hashtags_and_periods = (
    df_hashtags
    .select("hashtags", "period")
    .filter(size(col("hashtags")) > 0)           # keep only tweets that contain hashtags
    .withColumn("hashtag", explode("hashtags"))  # one row per hashtag occurrence
    .select("hashtag", "period")
    .groupBy("period", "hashtag")
    .count()
    .orderBy(col("count").desc())
    .cache()
)

hashtags_and_periods.show()

+---------+--------------------+------+
|   period|             hashtag| count|
+---------+--------------------+------+
|    night|    EMABiggestFans1D|137103|
|    night|EMABiggestFansJus...|135841|
|afternoon|    EMABiggestFans1D| 58868|
|afternoon|EMABiggestFansJus...| 51589|
|  morning|EMABiggestFansJus...| 20670|
|  morning|    EMABiggestFans1D| 19474|
|    night|        camilasayshi| 10485|
|    night|         StealMyGirl|  5161|
|afternoon|          QueroNoTVZ|  4788|
|    night|    bigpaynodanceoff|  4342|
|    night|         DebateNoSBT|  3417|
|    night|     CartersNewVideo|  3207|
|    night|           Vote5HEMA|  2936|
|    night|      TheVoiceBrasil|  2823|
|    night|LuanSantanaNaHora...|  2724|
|    night|         AssistamODR|  2545|
|    night|      DebateNaRecord|  2497|
|    night|              trndnl|  2355|
|    night|   AustinMahoneChile|  2177|
|    night|     QueroDilmaTreze|  2137|
+---------+--------------------+------+


In [9]:
# Split the ranking into one dataset per period of the day.
# NOTE: the variable names say "top_20", but 25 rows are kept (the plot titles
# also say "Top 25"). The names are left unchanged because later cells use them.
TOP_N = 25


def _top_hashtags(period_name, n=TOP_N):
    """Return the ``n`` most used hashtags of ``period_name``.

    The descending sort on "count" is applied here explicitly, so the result
    does not rely on an ordering applied to ``hashtags_and_periods`` in an
    earlier cell.
    """
    return (
        hashtags_and_periods
        .filter(col("period") == period_name)
        .orderBy(col("count").desc())
        .limit(n)
    )


top_20_hashtags_morning = _top_hashtags("morning")

top_20_hashtags_afternoon = _top_hashtags("afternoon")

top_20_hashtags_night = _top_hashtags("night")

Por enquanto, vou manter os resultados sem remover nenhuma hashtag. Idealmente, seria bom encontrar um jeito de remover as hashtags que não têm nada a ver com a eleição, para melhorar a visualização; no entanto, isso não parece tão simples, então fica assim.

In [10]:
# Bring each (small) per-period ranking to the driver as a pandas DataFrame
# so it can be plotted with seaborn.
hashtags_morning_pandas, hashtags_afternoon_pandas, hashtags_night_pandas = (
    ranking.toPandas()
    for ranking in (
        top_20_hashtags_morning,
        top_20_hashtags_afternoon,
        top_20_hashtags_night,
    )
)

In [11]:
# Dark theme shared by the figure background, the axes and the grid.
sns.set(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "axes.edgecolor": "#21212a", "grid.color": "#21212a"})

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10, 6), facecolor='#202124')
sns.barplot(x='hashtag', y='count', data=hashtags_morning_pandas, hue='hashtag', legend=False,
            palette='viridis', edgecolor="#21212a", color="white", ax=ax)

# Write each bar's value above it, expressed in thousands (e.g. "20.7K").
for bar in ax.patches:
    ax.annotate(f"{bar.get_height() / 1000:.1f}K",
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 10), textcoords='offset points', fontsize=8, rotation=30, color='#ffffff')

ax.set_xlabel('Hashtag', color='#ffffff')
ax.set_ylabel('Quantidade', color='#ffffff')
ax.set_title('Top 25 Hashtags - Período da Manhã', color='#ffffff')
plt.xticks(rotation=90, color='#ffffff')
plt.yticks(color='#ffffff')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./outcome/Q1', exist_ok=True)
fig.savefig('./outcome/Q1/top_hashtags_manha.pdf', facecolor='#202124', bbox_inches='tight')
plt.close(fig)

  plt.savefig('./outcome/Q1/top_hashtags_manha.pdf', facecolor='#202124', bbox_inches='tight')
  plt.savefig('./outcome/Q1/top_hashtags_manha.pdf', facecolor='#202124', bbox_inches='tight')


In [12]:
# Dark theme shared by the figure background, the axes and the grid.
sns.set(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "axes.edgecolor": "#21212a", "grid.color": "#21212a"})

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10, 6), facecolor='#202124')
sns.barplot(x='hashtag', y='count', data=hashtags_afternoon_pandas, hue='hashtag', legend=False,
            palette='viridis', edgecolor="#21212a", color="white", ax=ax)

# Write each bar's value above it, expressed in thousands (e.g. "58.9K").
for bar in ax.patches:
    ax.annotate(f"{bar.get_height() / 1000:.1f}K",
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 10), textcoords='offset points', fontsize=8, rotation=30, color='#ffffff')

ax.set_xlabel('Hashtag', color='#ffffff')
ax.set_ylabel('Quantidade', color='#ffffff')
ax.set_title('Top 25 Hashtags - Período da Tarde', color='#ffffff')
plt.xticks(rotation=90, color='#ffffff')
plt.yticks(color='#ffffff')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./outcome/Q1', exist_ok=True)
fig.savefig('./outcome/Q1/top_hashtags_tarde.pdf', facecolor='#202124', bbox_inches='tight')
plt.close(fig)

In [13]:
# Dark theme shared by the figure background, the axes and the grid.
sns.set(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "axes.edgecolor": "#21212a", "grid.color": "#21212a"})

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10, 6), facecolor='#202124')
sns.barplot(x='hashtag', y='count', data=hashtags_night_pandas, hue='hashtag', legend=False,
            palette='viridis', edgecolor="#21212a", color="white", ax=ax)

# Write each bar's value above it, expressed in thousands (e.g. "137.1K").
for bar in ax.patches:
    ax.annotate(f"{bar.get_height() / 1000:.1f}K",
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center', va='center',
                xytext=(0, 10), textcoords='offset points', fontsize=8, rotation=30, color='#ffffff')

ax.set_xlabel('Hashtag', color='#ffffff')
ax.set_ylabel('Quantidade', color='#ffffff')
ax.set_title('Top 25 Hashtags - Período da Noite', color='#ffffff')
plt.xticks(rotation=90, color='#ffffff')
plt.yticks(color='#ffffff')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./outcome/Q1', exist_ok=True)
fig.savefig('./outcome/Q1/top_hashtags_noite.pdf', facecolor='#202124', bbox_inches='tight')
plt.close(fig)

In [14]:
# Stop the Spark session created at the top of the notebook now that the
# analysis is finished.
spark.stop()