In [30]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql._
import spark.implicits._


// Sample rows of (customer, start_location, end_location), written as plain
// tuples and converted to Rows so they can be paired with an explicit schema.
val data = Seq(
  ("c1", "New York", "Lima"),
  ("c1", "London", "New York"),
  ("c1", "Lima", "Sao Paulo"),
  ("c1", "Sao Paulo", "New Delhi"),
  ("c2", "Mumbai", "Hyderabad"),
  ("c2", "Surat", "Pune"),
  ("c2", "Hyderabad", "Surat"),
  ("c3", "Kochi", "Kurnool"),
  ("c3", "Lucknow", "Agra"),
  ("c3", "Agra", "Jaipur"),
  ("c3", "Jaipur", "Kochi")
).map { case (customer, from, to) => Row(customer, from, to) }

// Three nullable string columns, in the same order as the tuple fields above.
val schema = new StructType().
  add("customer", StringType).
  add("start_location", StringType).
  add("end_location", StringType)

// Distribute the local rows as an RDD, then attach the schema to get a DataFrame.
val rdd = spark.sparkContext.parallelize(data)

val df = spark.createDataFrame(rdd, schema)

df.show()



+--------+--------------+------------+
|customer|start_location|end_location|
+--------+--------------+------------+
|      c1|      New York|        Lima|
|      c1|        London|    New York|
|      c1|          Lima|   Sao Paulo|
|      c1|     Sao Paulo|   New Delhi|
|      c2|        Mumbai|   Hyderabad|
|      c2|         Surat|        Pune|
|      c2|     Hyderabad|       Surat|
|      c3|         Kochi|     Kurnool|
|      c3|       Lucknow|        Agra|
|      c3|          Agra|      Jaipur|
|      c3|        Jaipur|       Kochi|
+--------+--------------+------------+



import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql._
import spark.implicits._
data: Seq[org.apache.spark.sql.Row] = List([c1,New York,Lima], [c1,London,New York], [c1,Lima,Sao Paulo], [c1,Sao Paulo,New Delhi], [c2,Mumbai,Hyderabad], [c2,Surat,Pune], [c2,Hyderabad,Surat], [c3,Kochi,Kurnool], [c3,Lucknow,Agra], [c3,Agra,Jaipur], [c3,Jaipur,Kochi])
schema: org.apache.spark.sql.types.StructType = StructType(StructField(customer,StringType,true),StructField(start_location,StringType,true),StructField(end_location,StringType,true))
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = ParallelCollectionRDD[86] at parallelize at <console>:66
df: org.apache.spark.sql.DataFrame = [customer: string, start_location: string ... 1 more field]


In [9]:
// Project each side of a row into a common (customer, location) shape so the
// two sides can later be stacked with a positional union.
val df_start = df.select($"customer", $"start_location".as("location"))
val df_end = df.select($"customer", $"end_location".as("location"))

// truncate = false: print full cell values.
df_start.show(truncate = false)
df_end.show(truncate = false)

+--------+---------+
|customer|location |
+--------+---------+
|c1      |New York |
|c1      |London   |
|c1      |Lima     |
|c1      |Sao Paulo|
|c2      |Mumbai   |
|c2      |Surat    |
|c2      |Hyderabad|
|c3      |Kochi    |
|c3      |Lucknow  |
|c3      |Agra     |
|c3      |Jaipur   |
+--------+---------+

+--------+---------+
|customer|location |
+--------+---------+
|c1      |Lima     |
|c1      |New York |
|c1      |Sao Paulo|
|c1      |New Delhi|
|c2      |Hyderabad|
|c2      |Pune     |
|c2      |Surat    |
|c3      |Kurnool  |
|c3      |Agra     |
|c3      |Jaipur   |
|c3      |Kochi    |
+--------+---------+



df_start: org.apache.spark.sql.DataFrame = [customer: string, location: string]
df_end: org.apache.spark.sql.DataFrame = [customer: string, location: string]


In [12]:
// Stack every start location and every end location into one
// (customer, location) table. `union` matches columns by position and keeps
// duplicates, which is what the per-location count in the next step relies on.
val df_combined: DataFrame = df_start.union(df_end)

df_combined.show(truncate = false)

+--------+---------+
|customer|location |
+--------+---------+
|c1      |New York |
|c1      |London   |
|c1      |Lima     |
|c1      |Sao Paulo|
|c2      |Mumbai   |
|c2      |Surat    |
|c2      |Hyderabad|
|c3      |Kochi    |
|c3      |Lucknow  |
|c3      |Agra     |
|c3      |Jaipur   |
|c1      |Lima     |
|c1      |New York |
|c1      |Sao Paulo|
|c1      |New Delhi|
|c2      |Hyderabad|
|c2      |Pune     |
|c2      |Surat    |
|c3      |Kurnool  |
|c3      |Agra     |
+--------+---------+
only showing top 20 rows



df_combined: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [customer: string, location: string]


In [35]:
// Keep only the (customer, location) pairs that occur exactly once across the
// combined start/end list. In the sample data these are the two ends of each
// customer's chain; every intermediate stop appears once as a start and once
// as an end and is therefore filtered out.
// NOTE(review): this relies on a terminal location appearing only once per
// customer - confirm that holds for real data.
val location_df = df_combined.
  groupBy("customer", "location").
  count().
  filter($"count" === 1).
  orderBy("customer").
  drop("count")

location_df.show(truncate = false)

+--------+---------+
|customer|location |
+--------+---------+
|c1      |London   |
|c1      |New Delhi|
|c2      |Mumbai   |
|c2      |Pune     |
|c3      |Lucknow  |
|c3      |Kurnool  |
+--------+---------+



location_df: org.apache.spark.sql.DataFrame = [customer: string, location: string]


In [38]:
// Attach to each original row the terminal location (from location_df) that it
// touches, if any. Rows that touch no terminal location are dropped by the
// inner join.
//
// The location alternatives are parenthesized on purpose: `&&` binds tighter
// than `||`, so without the parentheses the condition would read
//   (customer matches AND start matches) OR end matches
// and an end_location match would pair rows across different customers.
val final_df = df.as("a").join(location_df.as("b"),
                $"a.customer" === $"b.customer" &&
                ($"a.start_location" === $"b.location" ||
                 $"a.end_location" === $"b.location"),
                "inner"
                ).drop($"b.customer") // keep a single customer column (from "a")

final_df.show(false)

+--------+--------------+------------+---------+
|customer|start_location|end_location|location |
+--------+--------------+------------+---------+
|c1      |London        |New York    |London   |
|c1      |Sao Paulo     |New Delhi   |New Delhi|
|c2      |Mumbai        |Hyderabad   |Mumbai   |
|c2      |Surat         |Pune        |Pune     |
|c3      |Kochi         |Kurnool     |Kurnool  |
|c3      |Lucknow       |Agra        |Lucknow  |
+--------+--------------+------------+---------+



final_df: org.apache.spark.sql.DataFrame = [customer: string, start_location: string ... 2 more fields]


In [44]:
// Collapse to one row per customer. Each `when` without an `otherwise` yields
// null unless the row's matched terminal location is on that side, and `min`
// skips nulls, so each aggregate picks up the terminal location found on the
// start side or the end side respectively.
final_df.
  groupBy("customer").
  agg(
    min(when($"start_location" === $"location", $"location")).as("start_location"),
    min(when($"end_location" === $"location", $"location")).as("end_location")
  ).
  show(truncate = false)

+--------+--------------+------------+
|customer|start_location|end_location|
+--------+--------------+------------+
|c1      |London        |New Delhi   |
|c2      |Mumbai        |Pune        |
|c3      |Lucknow       |Kurnool     |
+--------+--------------+------------+

