In [7]:
// Tesla is investigating production bottlenecks and they need your help to extract the relevant data. 
// Write a query to determine which parts have begun the assembly process but are not yet finished.

// Assumptions:
// parts_assembly table contains all parts currently in production, each at varying stages of the assembly process.
// An unfinished part is one that lacks a finish_date.
// This question is straightforward, so let's approach it with simplicity in both thinking and solution.

// Effective April 11th 2023, the problem statement and assumptions were updated to enhance clarity.

// Example Output
// part	assembly_step
// bumper	3
// bumper	4

// Explanation
// The bumpers in steps 3 and 4 are the only items that remain unfinished, as they lack a recorded finish date.

import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row


// Sample rows for the parts_assembly table, laid out as (part, finish_date, step).
// A null finish_date marks an assembly step that has not been completed.
val data = Seq(
  Row("battery", "01/22/2022 00:00:00", 1),
  Row("battery", "02/22/2022 00:00:00", 2),
  Row("battery", "03/22/2022 00:00:00", 3),
  Row("bumper", "01/22/2022 00:00:00", 1),
  Row("bumper", "02/22/2022 00:00:00", 2),
  Row("bumper", null, 3),
  Row("bumper", null, 4)
)

// Column layout matching the rows above; finish_date is carried as a plain string.
// Fields added this way are nullable, which the null finish_date values rely on.
val schema = new StructType()
  .add("part", StringType)
  .add("finish_date", StringType)
  .add("step", IntegerType)

// Turn the local rows into an RDD, then pair it with the schema to get a DataFrame.
val rdd = spark.sparkContext.parallelize(data)
val df = spark.createDataFrame(rdd, schema)

// Display all rows without truncating the column values.
df.show(truncate = false)



+-------+-------------------+----+
|part   |finish_date        |step|
+-------+-------------------+----+
|battery|01/22/2022 00:00:00|1   |
|battery|02/22/2022 00:00:00|2   |
|battery|03/22/2022 00:00:00|3   |
|bumper |01/22/2022 00:00:00|1   |
|bumper |02/22/2022 00:00:00|2   |
|bumper |null               |3   |
|bumper |null               |4   |
+-------+-------------------+----+



import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
data: Seq[org.apache.spark.sql.Row] = List([battery,01/22/2022 00:00:00,1], [battery,02/22/2022 00:00:00,2], [battery,03/22/2022 00:00:00,3], [bumper,01/22/2022 00:00:00,1], [bumper,02/22/2022 00:00:00,2], [bumper,null,3], [bumper,null,4])
schema: org.apache.spark.sql.types.StructType = StructType(StructField(part,StringType,true),StructField(finish_date,StringType,true),StructField(step,IntegerType,true))
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[12] at parallelize at <console>:76
df: org.apache.spark.sql.DataFrame = [part: string, finish_date: string ... 1 more field]


In [8]:
println("Using Dataframes -------- ")

// Unfinished parts: keep the rows with no finish_date, then project
// only the part name and its assembly step.
val df1 = df
  .filter(col("finish_date").isNull)
  .select("part", "step")

// Print the physical plan, followed by the untruncated result rows.
df1.explain()
df1.show(truncate = false)


Using Dataframes -------- 
== Physical Plan ==
*(1) Project [part#71, step#73]
+- *(1) Filter isnull(finish_date#72)
   +- *(1) Scan ExistingRDD[part#71,finish_date#72,step#73]


+------+----+
|part  |step|
+------+----+
|bumper|3   |
|bumper|4   |
+------+----+



df1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [part: string, step: int]


In [13]:
println("Using Spark SQL -------- ")

// Register the DataFrame under the table name referenced by the SQL text below.
df.createOrReplaceTempView("parts_assembly")

// Same question expressed in SQL: parts/steps whose finish_date is missing.
val df2 = {
  val unfinishedPartsQuery = """
    SELECT
        part,
        step
    FROM parts_assembly
    WHERE finish_date IS NULL
"""
  spark.sql(unfinishedPartsQuery)
}

// Print the physical plan, followed by the untruncated result rows.
df2.explain()
df2.show(truncate = false)


Using Spark SQL -------- 
== Physical Plan ==
*(1) Project [part#71, step#73]
+- *(1) Filter isnull(finish_date#72)
   +- *(1) Scan ExistingRDD[part#71,finish_date#72,step#73]


+------+----+
|part  |step|
+------+----+
|bumper|3   |
|bumper|4   |
+------+----+



df2: org.apache.spark.sql.DataFrame = [part: string, step: int]
