In [0]:
"""
https://www.youtube.com/watch?v=f9zMAfuH0z4&list=PPSV
https://www.youtube.com/watch?v=JHaF3PFzOmk&list=PLiDUHlGx0KN8fBg645K15BIIP8jroHYpi&index=4
"""

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window 

schema = StructType(
    [ 
        StructField("user_id", IntegerType(), True), 
        StructField("kit_id", IntegerType(), True), 
        StructField("login_date", StringType(), True), 
        StructField("sessions_count", IntegerType(), True) 
    ]
) 

data = [ 
            (1, 2, "2016-03-01", 5), 
            (1, 2, "2016-03-02", 6), 
            (2, 3, "2017-06-25", 1), 
            (3, 1, "2016-03-02", 0), 
            (3, 4, "2018-07-03", 5) 
        ] 

inputDF = spark.createDataFrame(data, schema=schema).withColumn("login_date", col("login_date").cast("date"))

inputDF.show()
inputDF.printSchema()

+-------+------+----------+--------------+
|user_id|kit_id|login_date|sessions_count|
+-------+------+----------+--------------+
|      1|     2|2016-03-01|             5|
|      1|     2|2016-03-02|             6|
|      2|     3|2017-06-25|             1|
|      3|     1|2016-03-02|             0|
|      3|     4|2018-07-03|             5|
+-------+------+----------+--------------+

root
 |-- user_id: integer (nullable = true)
 |-- kit_id: integer (nullable = true)
 |-- login_date: date (nullable = true)
 |-- sessions_count: integer (nullable = true)



In [0]:
"""
    User activity analysis : Extract 1st login for each user
"""
# 1. Use this only when other cols are not required as groupBy will not select other cols in select projection
inputDF.groupBy("user_id").agg(min("login_date").alias("login_date")).show()

# 2. Flexible, retains all cols, this approach and spark.sql will create same query plan, all depends on how your org wants to write code which is either using Spark DF API or spark sql 
inputDF.withColumn("rn", row_number().over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .filter(col("rn") == 1) \
    .select("user_id", "login_date") \
    .show()

# 3. Spark sql equivalent to Spark DF API code, all depends on how your org wants to write code which is either using Spark DF API or spark sql 
inputDF.createOrReplaceTempView("input_tbl")
spark.sql("""
            WITH rn_table AS (
                SELECT
                    *,
                    ROW_NUMBER() OVER(PARTITION BY user_id ORDER BY login_date) AS rn
                FROM input_tbl
            )
            SELECT user_id, login_date FROM rn_table where rn=1
          """).show()

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2016-03-01|
|      2|2017-06-25|
|      3|2016-03-02|
+-------+----------+

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2016-03-01|
|      2|2017-06-25|
|      3|2016-03-02|
+-------+----------+

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2016-03-01|
|      2|2017-06-25|
|      3|2016-03-02|
+-------+----------+



In [0]:
"""
    User activity analysis : Write a solution to report for each player and date how many games played so far by the player. That is, the running sum of games played by the player.
"""

inputDF.withColumn("running_count_of_sessions", sum("sessions_count").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .select("user_id", "login_date", "running_count_of_sessions") \
    .show()


inputDF.createOrReplaceTempView("input_tbl")
spark.sql("""
          WITH CTE AS (
            SELECT 
                *, 
                SUM(sessions_count) OVER(PARTITION BY user_id ORDER BY login_date) AS running_count_of_sessions
            FROM input_tbl
          )
          SELECT user_id, login_date, running_count_of_sessions FROM CTE 
          """).show()


inputDF.withColumn("running_count_of_sessions", sum("sessions_count").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .select("user_id", "login_date", "running_count_of_sessions").explain()

spark.sql("""
          WITH CTE AS (
            SELECT 
                *, 
                SUM(sessions_count) OVER(PARTITION BY user_id ORDER BY login_date) AS running_count_of_sessions
            FROM input_tbl
          )
          SELECT user_id, login_date, running_count_of_sessions FROM CTE 
          """).explain() 


+-------+----------+-------------------------+
|user_id|login_date|running_count_of_sessions|
+-------+----------+-------------------------+
|      1|2016-03-01|                        5|
|      1|2016-03-02|                       11|
|      2|2017-06-25|                        1|
|      3|2016-03-02|                        0|
|      3|2018-07-03|                        5|
+-------+----------+-------------------------+

+-------+----------+-------------------------+
|user_id|login_date|running_count_of_sessions|
+-------+----------+-------------------------+
|      1|2016-03-01|                        5|
|      1|2016-03-02|                       11|
|      2|2017-06-25|                        1|
|      3|2016-03-02|                        0|
|      3|2018-07-03|                        5|
+-------+----------+-------------------------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Window [user_id#73, login_date#81, sum(sessions_count#76) windowspecdefinition(user_id#73, lo

In [0]:
"""
    User activity analysis : You need to find the user_ids that logged in for at least two consecutive days starting from their first login date
"""
inputDF.withColumn("first_login_date", min("login_date").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .withColumn("next_date", lead("login_date").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .filter(datediff("next_date", "first_login_date") == 1) \
    .select("user_id") \
    .show()

inputDF.createOrReplaceTempView("input_tbl")
spark.sql("""
            WITH CTE AS (
                SELECT
                    *,
                    MIN(login_date) OVER(PARTITION BY user_id ORDER BY login_date) AS first_login_date,
                    LEAD(login_date) OVER(PARTITION BY user_id ORDER BY login_date) AS next_date
                FROM input_tbl
            )
            SELECT 
                user_id 
            FROM CTE
            WHERE DATEDIFF(next_date, first_login_date) = 1
          """).show()


inputDF.withColumn("first_login_date", min("login_date").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .withColumn("next_date", lead("login_date").over(Window.partitionBy("user_id").orderBy("login_date"))) \
    .filter(datediff("next_date", "first_login_date") == 1) \
    .select("user_id").explain()

spark.sql("""
    WITH CTE AS (
        SELECT
            *,
            MIN(login_date) OVER(PARTITION BY user_id ORDER BY login_date) AS first_login_date,
            LEAD(login_date) OVER(PARTITION BY user_id ORDER BY login_date) AS next_date
        FROM input_tbl
)
SELECT 
    user_id 
FROM CTE
WHERE DATEDIFF(next_date, first_login_date) = 1
""").explain()

+-------+
|user_id|
+-------+
|      1|
+-------+

+-------+
|user_id|
+-------+
|      1|
+-------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [user_id#73]
   +- Filter ((isnotnull(next_date#1041) AND isnotnull(first_login_date#1035)) AND (datediff(next_date#1041, first_login_date#1035) = 1))
      +- Window [user_id#73, min(login_date#81) windowspecdefinition(user_id#73, login_date#81 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS first_login_date#1035, lead(login_date#81, 1, null) windowspecdefinition(user_id#73, login_date#81 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS next_date#1041], [user_id#73], [login_date#81 ASC NULLS FIRST]
         +- Sort [user_id#73 ASC NULLS FIRST, login_date#81 ASC NULLS FIRST], false, 0
            +- Exchange hashpartitioning(user_id#73, 200), ENSURE_REQUIREMENTS, [plan_id=3201]
               +- Project [user_id#73, cast(login_date#75 as date) AS login_date#81]
      