In [46]:
// Assume you're given a table of Twitter tweet data. Write a query to obtain a histogram of tweets posted per user in 2022. Output the tweet count per user as the bucket and the number of Twitter users who fall into that bucket.

// In other words, group the users by the number of tweets they posted in 2022 and count the number of users in each group.

// Example Output:
// |tweet_bucket | users_num |
// ---------------------------
// |1            | 2         |
// |2            | 1         |

// Explanation:
// Based on the example output, there are two users who posted only one tweet in 2022, and one user who posted two tweets in 2022. The query groups the users by the number of tweets they posted and displays the number of users in each group.

// The dataset you are querying against may have different input & output - this is just an example!


import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row

// Schema of the sample tweets table; every column is nullable (the default for `add`).
// tweet_date is kept as a plain string in "MM/dd/yyyy HH:mm:ss" form.
val schema = new StructType()
  .add("tweet_id", IntegerType)
  .add("user_id", IntegerType)
  .add("msg", StringType)
  .add("tweet_date", StringType)

// Sample rows written as (tweet_id, user_id, msg, tweet_date) tuples and then wrapped in Row.
val data = Seq(
  (214252, 111, "Am considering taking Tesla private at $420. Funding secured.", "12/30/2021 00:00:00"),
  (739252, 111, "Despite the constant negative press covfefe", "01/01/2022 00:00:00"),
  (846402, 111, "Following @NickSinghTech on Twitter changed my life!", "02/14/2022 00:00:00"),
  (241425, 254, "If the salary is so competitive why won’t you tell me what it is?", "03/01/2022 00:00:00"),
  (231574, 148, "I no longer have a manager. I can't be managed", "03/23/2022 00:00:00")
).map { case (tweetId, userId, msg, tweetDate) => Row(tweetId, userId, msg, tweetDate) }

// Distribute the rows as an RDD and attach the schema to obtain the DataFrame.
val rdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rdd, schema)

// Print the full table without truncating the msg column.
df.show(false)

println("Using Dataframes -------- ")
// Histogram of tweets per user in 2022, built with the DataFrame API.
//
// tweet_date is a "MM/dd/yyyy HH:mm:ss" string. A string range check on it compares
// lexicographically (month first), so "12/30/2021 00:00:00" would sort between
// "01/01/2022 ..." and "12/31/2022 ..." and be counted as a 2022 tweet. Parse the
// string into a timestamp and filter on its calendar year instead; this also keeps
// tweets posted at any time of day on 12/31/2022.
// NOTE(review): with ANSI mode off, values that do not match the pattern parse to
// null and are dropped by the filter; confirm that is acceptable for real data.
val df1 = df
  .filter(year(to_timestamp($"tweet_date", "MM/dd/yyyy HH:mm:ss")) === 2022)
  // Step 1: number of 2022 tweets per user.
  .groupBy($"user_id")
  .agg(count("tweet_id").as("tweet_bucket"))
  // Step 2: number of users sharing each tweet count.
  .groupBy($"tweet_bucket")
  .agg(count("user_id").as("users_num"))

df1.explain()
df1.show(false)





println("Using Spark SQL -------- ")
// Expose the DataFrame to Spark SQL under the name "tweets".
df.createOrReplaceTempView("tweets")

// Histogram of tweets per user in 2022, written in Spark SQL.
//
// tweet_date is a "MM/dd/yyyy HH:mm:ss" string, so a BETWEEN on the raw value
// compares lexicographically (month first) and lets e.g. '12/30/2021 00:00:00'
// through. Parse it with to_timestamp and filter on the calendar year instead.
val df2 = spark.sql(
  """SELECT
    |  tweet_count_per_user AS tweet_bucket,
    |  COUNT(user_id) AS users_num
    |FROM (
    |  SELECT
    |    user_id,
    |    COUNT(tweet_id) AS tweet_count_per_user
    |  FROM tweets
    |  WHERE year(to_timestamp(tweet_date, 'MM/dd/yyyy HH:mm:ss')) = 2022
    |  GROUP BY user_id) AS total_tweets
    |GROUP BY tweet_count_per_user""".stripMargin)

df2.explain()
df2.show(false)



+--------+-------+-----------------------------------------------------------------+-------------------+
|tweet_id|user_id|msg                                                              |tweet_date         |
+--------+-------+-----------------------------------------------------------------+-------------------+
|214252  |111    |Am considering taking Tesla private at $420. Funding secured.    |12/30/2021 00:00:00|
|739252  |111    |Despite the constant negative press covfefe                      |01/01/2022 00:00:00|
|846402  |111    |Following @NickSinghTech on Twitter changed my life!             |02/14/2022 00:00:00|
|241425  |254    |If the salary is so competitive why won’t you tell me what it is?|03/01/2022 00:00:00|
|231574  |148    |I no longer have a manager. I can't be managed                   |03/23/2022 00:00:00|
+--------+-------+-----------------------------------------------------------------+-------------------+

Using Dataframes -------- 
== Physical Plan ==
Adaptiv

import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
schema: org.apache.spark.sql.types.StructType = StructType(StructField(tweet_id,IntegerType,true),StructField(user_id,IntegerType,true),StructField(msg,StringType,true),StructField(tweet_date,StringType,true))
data: Seq[org.apache.spark.sql.Row] = List([214252,111,Am considering taking Tesla private at $420. Funding secured.,12/30/2021 00:00:00], [739252,111,Despite the constant negative press covfefe,01/01/2022 00:00:00], [846402,111,Following @NickSinghTech on Twitter changed my life!,02/14/2022 00:00:00], [241425,254,If the salary is so competitive why won’t you tell me what it is?,03/01/2022 00:00:00], [231574,148,I no longer ha...
