In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as f
from pyspark.sql.window import Window

In [2]:
# Obtain the SparkSession for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("Badminton Court Analysis")
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [4]:
# Schema of the login records: every column is nullable, and login_date is
# kept as a string.
schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("kit_id", IntegerType(), True)
    .add("login_date", StringType(), True)
    .add("session_count", IntegerType(), True)
)

In [5]:
# Sample rows, one tuple per login record, laid out in schema order:
# (user_id, kit_id, login_date, session_count)
data = [
    # user 1
    (1, 2, "2016-03-01", 5),
    (1, 2, "2016-03-02", 6),
    # user 2
    (2, 3, "2017-06-25", 1),
    # user 3
    (3, 1, "2016-03-02", 0),
    (3, 4, "2018-07-03", 5),
]

In [6]:
# Build the input DataFrame from the in-memory rows, applying the explicit schema
input_df = spark.createDataFrame(data=data, schema=schema)

In [7]:
# Problem: Determine the earliest login date of each user

In [8]:
# Solution in SQL

# -- Using Window function
# -- Solution: 
# WITH ranked_cte AS
# 	(
# 		SELECT
# 			*,
# 			ROW_NUMBER() OVER (PARTITION BY  user_id ORDER BY login_date) AS rn 
# 		FROM
# 			court
# 	)
# SELECT
# 	user_id,
# 	login_date AS first_login
# FROM
# 	ranked_cte
# WHERE
# 	rn = 1;

# -- Note
# -- With an aggregate / GROUP BY we lose information, because we can only select the aggregated columns and the
# -- dimension columns by which the aggregation is performed.
# -- With a window function we do not lose information, because the aggregation is computed over each window separately.

In [9]:
# Print the input rows to check the DataFrame contents before transforming them.
input_df.show()

+-------+------+----------+-------------+
|user_id|kit_id|login_date|session_count|
+-------+------+----------+-------------+
|      1|     2|2016-03-01|            5|
|      1|     2|2016-03-02|            6|
|      2|     3|2017-06-25|            1|
|      3|     1|2016-03-02|            0|
|      3|     4|2018-07-03|            5|
+-------+------+----------+-------------+



In [10]:
# Approach - 1: Using Group By

In [11]:
# One row per user: the minimum login_date within each user_id group.
# login_date is a string column, so min() compares the values as strings.
grouped_df = (
    input_df
    .groupBy("user_id")
    .agg(f.min(f.col("login_date")).alias("first_login"))
)

In [12]:
# Print the per-user earliest login produced by the group-by approach.
grouped_df.show()

+-------+-----------+
|user_id|first_login|
+-------+-----------+
|      1| 2016-03-01|
|      2| 2017-06-25|
|      3| 2016-03-02|
+-------+-----------+



In [13]:
# Approach - 2: Window function
# Number each user's rows by ascending login_date, so the earliest login gets 1.
windowSpec = Window.partitionBy("user_id").orderBy("login_date")

# row_number() (the ROW_NUMBER() of the SQL solution) assigns 1 to exactly one
# row per user. rank() would assign 1 to every row tied on the earliest
# login_date, so filtering on rnk == 1 could return duplicate rows for a user.
# Note: when rows tie on login_date, which of them receives 1 is not determined
# by this ordering; the first_login value is the same either way.
ranked_df = input_df.withColumn(
    "rnk", f.row_number().over(windowSpec)
)

In [14]:
# Print every input row together with its per-user rnk value.
ranked_df.show()

+-------+------+----------+-------------+---+
|user_id|kit_id|login_date|session_count|rnk|
+-------+------+----------+-------------+---+
|      1|     2|2016-03-01|            5|  1|
|      1|     2|2016-03-02|            6|  2|
|      2|     3|2017-06-25|            1|  1|
|      3|     1|2016-03-02|            0|  1|
|      3|     4|2018-07-03|            5|  2|
+-------+------+----------+-------------+---+



In [15]:
# Keep only the rows ranked first within each user's window.
result_df = ranked_df.where(ranked_df["rnk"] == 1)
# or
# result_df = ranked_df.filter(f.col("rnk") == 1)

In [16]:
# Print the first-ranked row of each user, with all original columns still present.
result_df.show()

+-------+------+----------+-------------+---+
|user_id|kit_id|login_date|session_count|rnk|
+-------+------+----------+-------------+---+
|      1|     2|2016-03-01|            5|  1|
|      2|     3|2017-06-25|            1|  1|
|      3|     1|2016-03-02|            0|  1|
+-------+------+----------+-------------+---+



In [17]:
# one step closer: reduce the ranked rows to the final (user_id, first_login) shape
# Filter on rnk BEFORE projecting. The select below drops rnk, so filtering
# afterwards would depend on Spark resolving a column that is no longer part
# of the selected output.
result = (
    ranked_df
    .filter(f.col("rnk") == 1)
    .select(
        f.col("user_id"),
        f.col("login_date").alias("first_login"),
    )
)

In [18]:
# Print the final per-user first_login table from the window-function approach.
result.show()

+-------+-----------+
|user_id|first_login|
+-------+-----------+
|      1| 2016-03-01|
|      2| 2017-06-25|
|      3| 2016-03-02|
+-------+-----------+



In [19]:
# # Some questions
# 1. What is the difference between SparkContext and SparkSession?
# 2. What is a transformation and what is an action?
# 3. Why are transformations lazy? Why is laziness needed, and what are its benefits?