In [0]:
"""
https://www.youtube.com/watch?v=3qEfsSC27_4
You are provided with callers' phone log history. Write a SQL query to find the callers whose first and last calls
on a given day were made to the same person.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Obtain the Spark session explicitly instead of relying on a pre-injected
# `spark` global: getOrCreate() returns the already-active session when there
# is one and builds a new one otherwise, so this cell also runs on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

# Sample call log, one row per call: (caller id, recipient id, call timestamp).
# Datecalled is kept as a string; every value uses the same fixed-width
# 'yyyy-MM-dd HH:mm:ss.SSS' layout, so string ordering matches time ordering.
df = spark.createDataFrame(
    [
        (1, 2, '2019-01-01 09:00:00.000'),
        (1, 3, '2019-01-01 17:00:00.000'),
        (1, 4, '2019-01-01 23:00:00.000'),
        (2, 5, '2019-07-05 09:00:00.000'),
        (2, 3, '2019-07-05 17:00:00.000'),
        (2, 3, '2019-07-05 17:20:00.000'),
        (2, 5, '2019-07-05 23:00:00.000'),
        (2, 3, '2019-08-01 09:00:00.000'),
        (2, 3, '2019-08-01 17:00:00.000'),
        (2, 5, '2019-08-01 19:30:00.000'),
        (2, 4, '2019-08-02 09:00:00.000'),
        (2, 5, '2019-08-02 10:00:00.000'),
        (2, 5, '2019-08-02 10:45:00.000'),
        (2, 4, '2019-08-02 11:00:00.000')
    ], ["Callerid", "Recipientid", "Datecalled"]
)

# Display the full (untruncated) rows and the inferred column types.
df.show(truncate=False)
df.printSchema()

+--------+-----------+-----------------------+
|Callerid|Recipientid|Datecalled             |
+--------+-----------+-----------------------+
|1       |2          |2019-01-01 09:00:00.000|
|1       |3          |2019-01-01 17:00:00.000|
|1       |4          |2019-01-01 23:00:00.000|
|2       |5          |2019-07-05 09:00:00.000|
|2       |3          |2019-07-05 17:00:00.000|
|2       |3          |2019-07-05 17:20:00.000|
|2       |5          |2019-07-05 23:00:00.000|
|2       |3          |2019-08-01 09:00:00.000|
|2       |3          |2019-08-01 17:00:00.000|
|2       |5          |2019-08-01 19:30:00.000|
|2       |4          |2019-08-02 09:00:00.000|
|2       |5          |2019-08-02 10:00:00.000|
|2       |5          |2019-08-02 10:45:00.000|
|2       |4          |2019-08-02 11:00:00.000|
+--------+-----------+-----------------------+

root
 |-- Callerid: long (nullable = true)
 |-- Recipientid: long (nullable = true)
 |-- Datecalled: string (nullable = true)



In [0]:
# Tag every call with the calendar day it was made on.
calls_by_day = df.withColumn("called_date", to_date(col("Datecalled")))

# All calls one caller made on one day, in chronological order.
day_in_order = Window.partitionBy("Callerid", "called_date").orderBy("Datecalled")
# Same ordering, but with the frame widened to the whole partition so that
# last() sees the final call of the day from every row, not just the current one.
whole_day = day_in_order.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# Attach the day's first and last recipient to each call row.
with_bookends = calls_by_day.select(
    "*",
    first("Recipientid").over(day_in_order).alias("first_"),
    last("Recipientid").over(whole_day).alias("last_"),
)

# Keep only caller/day partitions whose first and last recipient coincide.
matching_days = with_bookends.where(col("first_") == col("last_"))

# Collapse each surviving caller/day to one row carrying the earliest and latest
# call timestamps. `min` / `max` here are the Spark aggregates brought in by the
# star import from pyspark.sql.functions, not the Python builtins.
result = (
    matching_days
    .groupBy("Callerid", "called_date", "first_")
    .agg(
        min("Datecalled").alias("first_call"),
        max("Datecalled").alias("last_call"),
    )
    .withColumnRenamed("first_", "Recipientid")
)

result.show(truncate=False)

+--------+-----------+-----------+-----------------------+-----------------------+
|Callerid|called_date|Recipientid|first_call             |last_call              |
+--------+-----------+-----------+-----------------------+-----------------------+
|2       |2019-07-05 |5          |2019-07-05 09:00:00.000|2019-07-05 23:00:00.000|
|2       |2019-08-02 |4          |2019-08-02 09:00:00.000|2019-08-02 11:00:00.000|
+--------+-----------+-----------+-----------------------+-----------------------+



In [0]:
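# Rejected alternative, kept for reference: also grouping by "Recipientid" would
# emit one row per distinct recipient called on a qualifying day (the filter keeps
# every call of that day), instead of a single row per caller and day.
#     .groupBy("Callerid", "Recipientid", "called_date", "first_").agg(min("Datecalled"), max("Datecalled")) \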