In [27]:
!pip install pyspark



In [28]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [29]:
spark = SparkSession.builder.getOrCreate()

In [30]:
# Leia o arquivo 'videos-preparados.snappy.parquet' no dataframe 'df_video'

df_video = spark.read.option('header', 'true').parquet('videos-preparados.snappy.parquet')

In [31]:
# Calcule a quantidade de registros para cada valor único da coluna "Keyword"

df_keywords_count = df_video.groupBy("Keyword").count()

df_keywords_count.show()

+----------------+-----+
|         Keyword|count|
+----------------+-----+
|computer science|   48|
|            lofi|   40|
|         finance|   39|
|             cnn|   50|
|           apple|   42|
|            news|   39|
|         mukbang|   45|
|       education|   24|
|       interview|   50|
|          crypto|   50|
|   mathchemistry|   15|
|            food|   48|
|    data science|   50|
|        trolling|   50|
|        tutorial|   50|
|      literature|   46|
|             sat|   49|
|         history|   49|
|           cubes|   49|
|           music|   46|
+----------------+-----+
only showing top 20 rows


In [32]:
# Calcule a média da coluna "Interaction" para cada valor único da coluna 'Keyword'

from pyspark.sql.functions import mean

df_keyword_mean = df_video.groupBy("Keyword").agg(mean("Interaction").alias("media_interaction"))

df_keyword_mean.show()

+----------------+--------------------+
|         Keyword|   media_interaction|
+----------------+--------------------+
|computer science|  1226793.0208333333|
|            lofi|         4167085.875|
|         finance|   708542.9487179487|
|             cnn|           570650.86|
|           apple|1.0873628214285715E7|
|            news|  251688.71794871794|
|         mukbang|1.1053630377777778E7|
|       education|         2750838.625|
|       interview|          3044867.04|
|          crypto|            413676.2|
|   mathchemistry|  3427342.7333333334|
|            food|   5352944.104166667|
|    data science|           562465.28|
|        trolling|          1484584.88|
|        tutorial|           6936688.3|
|      literature|            881726.5|
|             sat|           1098927.0|
|         history| 1.565269257142857E7|
|           cubes|1.5043961224489795E7|
|           music|2.9691370304347824E7|
+----------------+--------------------+
only showing top 20 rows


In [33]:
# Calcule o valor máximo da coluna "Interaction" para cada valor único da coluna 'Keyword' e nomeie de 'Rank Interactions', em seguida ordene pela nova coluna em ordem decrescente

from pyspark.sql.functions import max

# Agrupa por 'Keyword' e calcula o máximo de 'Interaction', nomeando a coluna como 'Rank Interactions'
df_max_interaction = df_video.groupBy("Keyword").agg(
    max("Interaction").alias("Rank Interactions")
)

# Ordena pela coluna 'Rank Interactions' em ordem decrescente
df_max_interaction_sorted = df_max_interaction.orderBy(
    "Rank Interactions", ascending=False
)

df_max_interaction_sorted.show()

+--------+-----------------+
| Keyword|Rank Interactions|
+--------+-----------------+
| animals|       1593623628|
|   music|        922551152|
|     bed|        532691631|
| history|        440187490|
|   apple|        429916936|
| mrbeast|        300397699|
|  google|        239385460|
|business|        210025196|
|   cubes|        170925917|
|  sports|        106924567|
| mukbang|         87433858|
|    lofi|         86445177|
|tutorial|         69616442|
|  movies|         65253870|
|  marvel|         56247330|
|  how-to|         53053975|
|    food|         48754479|
| physics|         43463298|
|    asmr|         34411125|
|nintendo|         32268486|
+--------+-----------------+
only showing top 20 rows


In [34]:
# Calcule a média e a variância da coluna 'Views' para cada valor único da coluna 'Keyword'

from pyspark.sql.functions import mean, variance

# Agrupa por 'Keyword' e calcula média e variância de 'Views'
df_views_stats = df_video.groupBy("Keyword").agg(
    mean("Views").alias("Views_mean"),
    variance("Views").alias("Views_variance")
)

df_views_stats.show()

+----------------+--------------------+--------------------+
|         Keyword|          Views_mean|      Views_variance|
+----------------+--------------------+--------------------+
|computer science|  1191958.7083333333| 2.81219868165102E12|
|            lofi|           4089363.0|1.846209641476677...|
|         finance|   694223.4358974359|3.304483175097042...|
|             cnn|           554240.38|1.563423618468118...|
|           apple|1.0746930452380951E7|4.299927977442589E15|
|            news|   247492.1794871795|1.067512576672564...|
|         mukbang|1.0904772355555555E7|5.586073238973179...|
|       education|  2684432.8333333335|1.833572249339214...|
|       interview|          2966111.86|1.819220996034335E13|
|          crypto|           404608.22|3.513691634369074E12|
|   mathchemistry|  3328125.2666666666|2.491467065256849...|
|            food|          5252406.25|7.326374128154842E13|
|    data science|           544771.98|5.479336525349994...|
|        trolling|      

In [35]:
# Calcule a média, o valor mínimo e o valor máximo de 'Views' para cada valor único da coluna 'Keyword', sem casas decimais

from pyspark.sql.functions import mean, min, max, round

# Agrupa por 'Keyword' e calcula média, mínimo e máximo de 'Views', arredondando os resultados para zero casas decimais
df_views_stats = df_video.groupBy("Keyword").agg(
    round(mean("Views"), 0).alias("Views_mean"),
    round(min("Views"), 0).alias("Views_min"),
    round(max("Views"), 0).alias("Views_max")
)

df_views_stats.show()

+----------------+-----------+---------+---------+
|         Keyword| Views_mean|Views_min|Views_max|
+----------------+-----------+---------+---------+
|computer science|  1191959.0|    16115|  7004107|
|            lofi|  4089363.0|     6817| 84747957|
|         finance|   694223.0|     1195|  9450554|
|             cnn|   554240.0|    51269|  1889320|
|           apple| 1.074693E7|     1954|425478119|
|            news|   247492.0|    10642|  1465011|
|         mukbang|1.0904772E7|     3618| 86169225|
|       education|  2684433.0|     6611| 17103736|
|       interview|  2966112.0|     2587| 22529756|
|          crypto|   404608.0|     1599| 11805668|
|   mathchemistry|  3328125.0|       25| 18496859|
|            food|  5252406.0|    47430| 48018833|
|    data science|   544772.0|      911|  3069097|
|        trolling|  1420141.0|     5388| 14286302|
|        tutorial|  6761032.0|    19323| 68512549|
|      literature|   863021.0|     2847|  4231789|
|             sat|  1065869.0| 

In [36]:
# Mostre o primeiro e o último 'Published At' para cada valor único da coluna 'Keyword'

from pyspark.sql.functions import first, last

# Agrupa por 'Keyword' e obtém o primeiro e o último 'Published At'
df_published_range = df_video.groupBy("Keyword").agg(
    first("Published At").alias("First_Published_At"),
    last("Published At").alias("Last_Published_At")
)

# Ordena pelo campo 'Published At' (crescente)
df_sorted = df_video.orderBy("Published At")

# Agora aplica o groupBy normalmente usando o df_sorted
df_published_range = df_sorted.groupBy("Keyword").agg(
    first("Published At").alias("First_Published_At"),
    last("Published At").alias("Last_Published_At")
)

df_published_range.show()

+----------------+------------------+-----------------+
|         Keyword|First_Published_At|Last_Published_At|
+----------------+------------------+-----------------+
|computer science|        2009-08-20|       2022-08-12|
|            lofi|        2019-12-08|       2022-08-24|
|         finance|        2012-11-27|       2022-08-24|
|             cnn|        2022-07-14|       2022-08-24|
|           apple|        2016-11-02|       2022-08-24|
|            news|        2022-08-18|       2022-08-24|
|       education|        2008-07-25|       2022-08-24|
|         mukbang|        2020-02-29|       2022-08-24|
|       interview|        2016-01-05|       2022-08-24|
|          crypto|        2022-03-11|       2022-08-24|
|   mathchemistry|        2013-04-15|       2022-05-03|
|            food|        2017-05-31|       2022-08-24|
|    data science|        2018-06-23|       2022-08-24|
|        trolling|        2020-06-14|       2022-08-24|
|        tutorial|        2017-02-01|       2022

In [37]:
# Conte todos os 'title' de forma normal e todos os únicos e verifique se há diferença

from pyspark.sql.functions import count, countDistinct

# Conta o total de títulos (inclui repetições)
total_titles = df_video.select(count("title").alias("Total_titles")).collect()[0]["Total_titles"]

# Conta o número de títulos únicos (sem repetições)
unique_titles = df_video.select(countDistinct("title").alias("Unique_titles")).collect()[0]["Unique_titles"]

print("Total de títulos:", total_titles)
print("Títulos únicos:", unique_titles)

if total_titles != unique_titles:
    print("Existem títulos repetidos na base de dados.")
else:
    print("Todos os títulos são únicos.")

Total de títulos: 1869
Títulos únicos: 1854
Existem títulos repetidos na base de dados.


In [38]:
# Mostre a quantidade de registros ordenados por ano em ordem ascendente



from pyspark.sql.functions import year, count

# Adiciona uma coluna "Year" extraindo o ano de 'Published At'
df_with_year = df_video.withColumn("Year", year("Published At"))

# Agrupa por 'Year' e conta os registros, depois ordena em ordem crescente pelo ano
df_by_year = df_with_year.groupBy("Year").agg(
    count("*").alias("Total_Registros")
).orderBy("Year", ascending=True)

df_by_year.show()

+----+---------------+
|Year|Total_Registros|
+----+---------------+
|2007|              2|
|2008|              1|
|2009|              9|
|2010|              6|
|2011|              4|
|2012|             12|
|2013|              6|
|2014|             10|
|2015|             15|
|2016|             34|
|2017|             47|
|2018|             57|
|2019|             86|
|2020|            158|
|2021|            229|
|2022|           1193|
+----+---------------+



In [39]:
# Mostre a quantidade de registros ordenados por ano e mês em ordem ascendente

from pyspark.sql.functions import year, month, count

# Adiciona as colunas 'Year' e 'Month'
df_with_year_month = df_video.withColumn("Year", year("Published At")) \
                       .withColumn("Month", month("Published At"))

# Agrupa por 'Year' e 'Month', conta os registros e ordena em ordem crescente
df_by_year_month = df_with_year_month.groupBy("Year", "Month") \
    .agg(count("*").alias("Total_Registros")) \
    .orderBy("Year", "Month", ascending=[True, True])

df_by_year_month.show()

+----+-----+---------------+
|Year|Month|Total_Registros|
+----+-----+---------------+
|2007|    7|              1|
|2007|   12|              1|
|2008|    7|              1|
|2009|    2|              2|
|2009|    6|              2|
|2009|    7|              1|
|2009|    8|              1|
|2009|   10|              1|
|2009|   12|              2|
|2010|    3|              1|
|2010|    5|              2|
|2010|    6|              1|
|2010|    9|              1|
|2010|   10|              1|
|2011|    2|              1|
|2011|    5|              1|
|2011|    9|              1|
|2011|   10|              1|
|2012|    1|              1|
|2012|    2|              3|
+----+-----+---------------+
only showing top 20 rows


In [40]:
from pyspark.sql.functions import year, avg
from pyspark.sql.window import Window

# Adiciona uma coluna 'Year' extraída de 'Published At'
df_with_year = df_video.withColumn("Year", year("Published At"))

# Define a janela particionada por 'Keyword' e ordenada por 'Year'
window_spec = Window.partitionBy("Keyword").orderBy("Year").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calcula a média acumulativa dos 'Likes'
df_cumulative_avg = df_with_year.withColumn(
    "Cumulative_Mean_Likes",
    avg("Likes").over(window_spec)
)

# Para ver a média acumulada por ano para cada Keyword, agregue os resultados distintos
df_result = df_cumulative_avg.select("Keyword", "Year", "Cumulative_Mean_Likes").distinct().orderBy("Keyword", "Year")

df_result.show()

+-------+----+---------------------+
|Keyword|Year|Cumulative_Mean_Likes|
+-------+----+---------------------+
|animals|2009|            1357197.0|
|animals|2010|             712665.0|
|animals|2010|             587977.0|
|animals|2013|           3197276.75|
|animals|2014|            2761698.4|
|animals|2014|   3258727.8333333335|
|animals|2019|   2950868.5714285714|
|animals|2020|          2591337.125|
|animals|2020|   2304445.5555555555|
|animals|2020|            2090434.7|
|animals|2020|   1908948.7272727273|
|animals|2020|            1751698.5|
|animals|2020|   1619172.6923076923|
|animals|2020|    1508757.642857143|
|animals|2020|   1427024.0666666667|
|animals|2020|         1723934.3125|
|animals|2021|   1623192.4705882352|
|animals|2021|   1534520.7777777778|
|animals|2021|   1460098.6315789474|
|animals|2021|            1419090.8|
+-------+----+---------------------+
only showing top 20 rows
