In [2]:
// Assume you're given the table on user viewership categorised by device type where the three types are laptop, tablet, and phone.

// Write a query that calculates the total viewership for laptops and mobile devices where mobile is defined as the sum of tablet and phone viewership. 
// Output the total viewership for laptops as laptop_reviews and the total viewership for mobile devices as mobile_views.

// Effective 15 April 2023, the solution has been updated with a more concise and easy-to-understand approach.

// Example Output
// ----------------------------
// laptop_views | mobile_views
// -------------|--------------
// 2            | 3
// ----------------------------

// Explanation
// Based on the example input, there are a total of 2 laptop views and 3 mobile views.

// The dataset you are querying against may have different input & output - this is just an example!


import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.Window



val data = Seq(
  Row(123,"tablet","01/02/2022 00:00:00"),
  Row(125,"laptop","01/07/2022 00:00:00"),
  Row(128,"laptop","02/09/2022 00:00:00"),
  Row(129,"phone","02/09/2022 00:00:00"),
  Row(145,"tablet","02/24/2022 00:00:00")
)

val schema = StructType(Array(
  StructField("user_id", IntegerType),
  StructField("device_type", StringType),
  StructField("view_time", StringType)
))

val rdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rdd, schema)

df.show(false)


+-------+-----------+-------------------+
|user_id|device_type|view_time          |
+-------+-----------+-------------------+
|123    |tablet     |01/02/2022 00:00:00|
|125    |laptop     |01/07/2022 00:00:00|
|128    |laptop     |02/09/2022 00:00:00|
|129    |phone      |02/09/2022 00:00:00|
|145    |tablet     |02/24/2022 00:00:00|
+-------+-----------+-------------------+



import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.Window
data: Seq[org.apache.spark.sql.Row] = List([123,tablet,01/02/2022 00:00:00], [125,laptop,01/07/2022 00:00:00], [128,laptop,02/09/2022 00:00:00], [129,phone,02/09/2022 00:00:00], [145,tablet,02/24/2022 00:00:00])
schema: org.apache.spark.sql.types.StructType = StructType(StructField(user_id,IntegerType,true),StructField(device_type,StringType,true),StructField(view_time,StringType,true))
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[0] at parallelize at <console>:64
df: org.apache.spark.sql.DataFrame = [user_id: int, device_type: string ... 1 more field]


In [22]:
import org.apache.spark.sql.expressions.Window

println("Using Dataframes -------- ")

val df1 = df.withColumn("device_type", when($"device_type" === lit("tablet"), lit("phone")
                                  ).when($"device_type" === lit("phone"), lit("phone")).otherwise("laptop")
             ).groupBy($"device_type").count().withColumn("device_type", concat($"device_type", lit("_views")))


df1.explain()
df1.show(false)


Using Dataframes -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[device_type#347], functions=[count(1)])
   +- Exchange hashpartitioning(device_type#347, 200), ENSURE_REQUIREMENTS, [plan_id=865]
      +- HashAggregate(keys=[device_type#347], functions=[partial_count(1)])
         +- Project [CASE WHEN (device_type#4 = tablet) THEN phone WHEN (device_type#4 = phone) THEN phone ELSE laptop END AS device_type#347]
            +- Scan ExistingRDD[user_id#3,device_type#4,view_time#5]


+------------+-----+
|device_type |count|
+------------+-----+
|phone_views |3    |
|laptop_views|2    |
+------------+-----+



import org.apache.spark.sql.expressions.Window
df1: org.apache.spark.sql.DataFrame = [device_type: string, count: bigint]


In [45]:
println("Using Spark SQL  -------- ")

df.createOrReplaceTempView("viewership")

val df2 = spark.sql("""
    SELECT 
      SUM(CASE WHEN device_type='laptop' THEN 1 ELSE 0 END) AS laptop_views,
      SUM(CASE WHEN device_type='tablet' OR device_type='phone' THEN 1 ELSE 0 END) AS mobile_views
    FROM viewership
""")

df2.explain()
df2.show(false)

Using Spark SQL  -------- 
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[sum(CASE WHEN (device_type#289 = laptop) THEN 1 ELSE 0 END), sum(CASE WHEN ((device_type#289 = tablet) OR (device_type#289 = phone)) THEN 1 ELSE 0 END)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=1329]
      +- HashAggregate(keys=[], functions=[partial_sum(CASE WHEN (device_type#289 = laptop) THEN 1 ELSE 0 END), partial_sum(CASE WHEN ((device_type#289 = tablet) OR (device_type#289 = phone)) THEN 1 ELSE 0 END)])
         +- Project [device_type#289]
            +- Scan ExistingRDD[user_id#288,device_type#289,view_time#290]


+------------+------------+
|laptop_views|mobile_views|
+------------+------------+
|2           |3           |
+------------+------------+



df2: org.apache.spark.sql.DataFrame = [laptop_views: bigint, mobile_views: bigint]
