In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring, to_timestamp, hour

import seaborn as sns
import matplotlib.pyplot as plt
import re

In [44]:
spark = SparkSession.builder \
    .appName("SPARK TRAB - QUESTION 1 C") \
    .getOrCreate()

In [45]:
df_orig = spark.read.option("header", "false").option("delimiter", "\t").csv("./data/debate-tweets.tsv")

## Objetivo
O objetivo desse Notebook é, a partir dos dados de tweets brasileiros de 2014 durante o período de eleições, responder a seguinte pergunta:

"Qual o número de tweets por hora a cada dia?"

In [46]:
df = df_orig.select("_c0", "_c1", "_c7")

df = df.withColumnRenamed("_c0", "id") \
                     .withColumnRenamed("_c1", "content") \
                     .withColumnRenamed("_c7", "day_and_hour_string")

df.show()

+------------------+--------------------+--------------------+
|                id|             content| day_and_hour_string|
+------------------+--------------------+--------------------+
|522394422710136832|@anacddd verdade,...|Wed Oct 15 14:31:...|
|522394422806581248|              Que ñ*|Wed Oct 15 14:31:...|
|522394422731100160| Vou quebrar a Bruna|Wed Oct 15 14:31:...|
|522394422810783745|agora vou p segun...|Wed Oct 15 14:31:...|
|522394423137943553|Me sinto tão bem ...|Wed Oct 15 14:31:...|
|522394423188271104|Eu estou aqui, de...|Wed Oct 15 14:31:...|
|522394423238606848|Quando vai embora...|Wed Oct 15 14:31:...|
|522394423528022016|@paynecaralhudo k...|Wed Oct 15 14:31:...|
|522394423632875521|Conceição da Barr...|Wed Oct 15 14:31:...|
|522394424010362881| @Maniavato te amo ♥|Wed Oct 15 14:31:...|
|522394424048091138|Alg me curtindo rs ♡|Wed Oct 15 14:31:...|
|522394424010358784|@MiiluAA No, porq...|Wed Oct 15 14:31:...|
|522394423741906944|#EMABiggestFansJu...|Wed Oct 15 14:

In [47]:
df = df.withColumn("hour_of_day_string", substring(df["day_and_hour_string"], 12, 8))
# Não consegui fazer a conversão direta para timestamp então capturei apenas a 
df = df.withColumn("timestamp_col", to_timestamp(df["hour_of_day_string"], "HH:mm:ss"))

df = df.withColumn("hour_of_day", hour(df["timestamp_col"]))

df = df.select("content", "timestamp_col", "hour_of_day")

df.show()

+--------------------+-------------------+-----------+
|             content|      timestamp_col|hour_of_day|
+--------------------+-------------------+-----------+
|@anacddd verdade,...|1970-01-01 14:31:50|         14|
|              Que ñ*|1970-01-01 14:31:50|         14|
| Vou quebrar a Bruna|1970-01-01 14:31:50|         14|
|agora vou p segun...|1970-01-01 14:31:50|         14|
|Me sinto tão bem ...|1970-01-01 14:31:50|         14|
|Eu estou aqui, de...|1970-01-01 14:31:50|         14|
|Quando vai embora...|1970-01-01 14:31:50|         14|
|@paynecaralhudo k...|1970-01-01 14:31:50|         14|
|Conceição da Barr...|1970-01-01 14:31:50|         14|
| @Maniavato te amo ♥|1970-01-01 14:31:50|         14|
|Alg me curtindo rs ♡|1970-01-01 14:31:50|         14|
|@MiiluAA No, porq...|1970-01-01 14:31:50|         14|
|#EMABiggestFansJu...|1970-01-01 14:31:50|         14|
|@raizabatista dev...|1970-01-01 14:31:51|         14|
|Me senti ate d fe...|1970-01-01 14:31:51|         14|
|qual o se

In [48]:
tweets_per_hour = df.select("content", "hour_of_day").groupBy("hour_of_day").count().dropna()

tweets_per_hour.show()

+-----------+------+
|hour_of_day| count|
+-----------+------+
|         16|381171|
|         15|381868|
|         17|357316|
|         14|308855|
|         18|337544|
|         22|390701|
|         20|335181|
|         19|320465|
|         23|408723|
|         21|358587|
|          1|582339|
|          2|608575|
|          0|470279|
|          3|492956|
|          5|207486|
|          4|331966|
|         12|176828|
|         13|230080|
|          6|125781|
|          9| 69421|
+-----------+------+


In [49]:
tweets_per_hour_pd = tweets_per_hour.toPandas()

In [50]:
# Configurar o estilo do Seaborn e Matplotlib para o modo noturno
sns.set(style="darkgrid", rc={"axes.facecolor": "#252434", "figure.facecolor": "#202124", "grid.color": "#37474f", "axes.edgecolor": "#202124"})
plt.figure(figsize=(10, 6), facecolor='#202124')

# Criar o gráfico de linhas
sns.lineplot(x='hour_of_day', y='count', data=tweets_per_hour_pd, marker='o', color='#ff3d2b', linewidth=2.5)

for x, y in zip(tweets_per_hour_pd['hour_of_day'], tweets_per_hour_pd['count']):
    plt.annotate(f'{y/1000:.1f}K', (x, y), textcoords="offset points", xytext=(0,10), ha='center', color='white', fontsize=8, rotation=20)

# Configurar os rótulos e o título do gráfico
plt.xlabel('Hora do Dia', color='#ffffff')
plt.ylabel('Número de Tweets', color='#ffffff')
plt.title('Número de Tweets por Hora do Dia - 15 a 20 de Outubro de 2014', color='#ffffff')
plt.xticks(range(0, 24), color='#ffffff')
plt.yticks(color='#ffffff')

# Exibir o gráfico
plt.savefig('./outcome/Q1/number_tweets_per_hour.pdf', facecolor='#202124', bbox_inches='tight')
plt.close()

In [51]:
spark.stop()