In [9]:
// Write a query to identify the top 2 Power Users who sent the highest number of messages on Microsoft Teams 
// in August 2022. Display the IDs of these 2 users along with the total number of messages they sent. 
// Output the results in descending order based on the count of the messages.

// Assumption:
// No two users have sent the same number of messages in August 2022.

// Example Output:
//------------------------------
// sender_id | message_count
//-----------|-------------------
// 3601	     |     2
// 4500	     |     1

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row


val schema = StructType(Array(
  StructField("message_id", IntegerType),
  StructField("sender_id", IntegerType),
  StructField("receiver_id", IntegerType),
  StructField("content", StringType),
  StructField("sent_date", StringType)
))

val data = Seq(
  Row(901,3601,4500,"You up?","2022-08-03 00:00:00"),
  Row(902,4500,3601,"Only if you're buying","2022-08-03 00:00:00"),
  Row(743,3601,8752,"Let's take this offline","2022-06-14 00:00:00"),
  Row(922,3601,4500,"Get on the call","2022-08-10 00:00:00")
)

val rdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rdd, schema).withColumn("sent_date", to_timestamp($"sent_date"))


df.show(false)


+----------+---------+-----------+-----------------------+-------------------+
|message_id|sender_id|receiver_id|content                |sent_date          |
+----------+---------+-----------+-----------------------+-------------------+
|901       |3601     |4500       |You up?                |2022-08-03 00:00:00|
|902       |4500     |3601       |Only if you're buying  |2022-08-03 00:00:00|
|743       |3601     |8752       |Let's take this offline|2022-06-14 00:00:00|
|922       |3601     |4500       |Get on the call        |2022-08-10 00:00:00|
+----------+---------+-----------+-----------------------+-------------------+



import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
schema: org.apache.spark.sql.types.StructType = StructType(StructField(message_id,IntegerType,true),StructField(sender_id,IntegerType,true),StructField(receiver_id,IntegerType,true),StructField(content,StringType,true),StructField(sent_date,StringType,true))
data: Seq[org.apache.spark.sql.Row] = List([901,3601,4500,You up?,2022-08-03 00:00:00], [902,4500,3601,Only if you're buying,2022-08-03 00:00:00], [743,3601,8752,Let's take this offline,2022-06-14 00:00:00], [922,3601,4500,Get on the call,2022-08-10 00:00:00])
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[24] at parallelize at <console>:82
df: org.apache.spark.sql.DataFrame = [message_id: int,...


In [13]:
println("Using Dataframes -------- ")

val df1 = df.filter($"sent_date".between("2022-08-01 00:00:00", "2022-08-31 00:00:00")
                   ).groupBy($"sender_id").agg(count($"message_id").as("message_count")
                                              ).orderBy($"message_count".desc).limit(2)


df1.explain()
df1.show(false)


Using Dataframes -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=2, orderBy=[message_count#311L DESC NULLS LAST], output=[sender_id#227,message_count#311L])
   +- HashAggregate(keys=[sender_id#227], functions=[count(message_id#226)])
      +- Exchange hashpartitioning(sender_id#227, 200), ENSURE_REQUIREMENTS, [plan_id=221]
         +- HashAggregate(keys=[sender_id#227], functions=[partial_count(message_id#226)])
            +- Project [message_id#226, sender_id#227]
               +- Filter (isnotnull(sent_date#230) AND ((cast(sent_date#230 as timestamp) >= 2022-08-01 00:00:00) AND (cast(sent_date#230 as timestamp) <= 2022-08-31 00:00:00)))
                  +- Scan ExistingRDD[message_id#226,sender_id#227,receiver_id#228,content#229,sent_date#230]


+---------+-------------+
|sender_id|message_count|
+---------+-------------+
|3601     |2            |
|4500     |1            |
+---------+-------------+



df1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [sender_id: int, message_count: bigint]


In [14]:
println("Using Spark SQL -------- ")

df.createOrReplaceTempView("messages")

val df2 = spark.sql("""
    SELECT
        sender_id,
        COUNT(message_id) message_count
    FROM messages
    WHERE sent_date BETWEEN '2022-08-01 00:00:00' AND '2022-08-31 00:00:00'
    GROUP BY sender_id 
    ORDER BY message_count DESC
    LIMIT 2
""")


df2.explain()
df2.show(false)


Using Spark SQL -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=2, orderBy=[message_count#326L DESC NULLS LAST], output=[sender_id#227,message_count#326L])
   +- HashAggregate(keys=[sender_id#227], functions=[count(message_id#226)])
      +- Exchange hashpartitioning(sender_id#227, 200), ENSURE_REQUIREMENTS, [plan_id=301]
         +- HashAggregate(keys=[sender_id#227], functions=[partial_count(message_id#226)])
            +- Project [message_id#226, sender_id#227]
               +- Filter (isnotnull(sent_date#230) AND ((cast(sent_date#230 as timestamp) >= 2022-08-01 00:00:00) AND (cast(sent_date#230 as timestamp) <= 2022-08-31 00:00:00)))
                  +- Scan ExistingRDD[message_id#226,sender_id#227,receiver_id#228,content#229,sent_date#230]


+---------+-------------+
|sender_id|message_count|
+---------+-------------+
|3601     |2            |
|4500     |1            |
+---------+-------------+



df2: org.apache.spark.sql.DataFrame = [sender_id: int, message_count: bigint]
