In [0]:
# link for below exercise
# https://www.geeksforgeeks.org/pyspark-window-functions/

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

ModuleNotFoundError: No module named 'pyspark'

In [5]:
spark = SparkSession.builder.appName('Pyspark SQL Like Functions').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
sales = r"/user/storage/sales.csv"
products = r"/user/storage/products.csv"
sellers = r"/user/storage/sellers.csv"
sales_df = spark.read.csv(sales, header=True, inferSchema=True)
products_df = spark.read.csv(products, header=True, inferSchema=True)
sellers_df = spark.read.csv(sellers, header=True, inferSchema=True)

                                                                                

In [7]:
from pyspark.sql.window import Window

In [8]:
# sample data for dataframe
sampleData = (("Ram", 28, "Sales", 3000),
              ("Meena", 33, "Sales", 4600),
              ("Robin", 40, "Sales", 4100),
              ("Kunal", 25, "Finance", 3000),
              ("Ram", 28, "Sales", 3000),
              ("Srishti", 46, "Management", 3300),
              ("Jeny", 26, "Finance", 3900),
              ("Hitesh", 30, "Marketing", 3000),
              ("Kailash", 29, "Marketing", 2000),
              ("Sharad", 39, "Sales", 4100)
              )

In [9]:
sampleData

(('Ram', 28, 'Sales', 3000),
 ('Meena', 33, 'Sales', 4600),
 ('Robin', 40, 'Sales', 4100),
 ('Kunal', 25, 'Finance', 3000),
 ('Ram', 28, 'Sales', 3000),
 ('Srishti', 46, 'Management', 3300),
 ('Jeny', 26, 'Finance', 3900),
 ('Hitesh', 30, 'Marketing', 3000),
 ('Kailash', 29, 'Marketing', 2000),
 ('Sharad', 39, 'Sales', 4100))

In [10]:
# column names for dataframe
columns = ["Employee_Name", "Age",
           "Department", "Salary"]

In [11]:
columns

['Employee_Name', 'Age', 'Department', 'Salary']

In [12]:
df = spark.createDataFrame(data = sampleData, schema=columns)

In [13]:
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



                                                                                

In [14]:
# importing Window from pyspark.sql.window
 
# creating a window
# partition of dataframe
windowPartition = Window.partitionBy("Department").orderBy("Age")

In [15]:
# cume_dist() window function is used to get the cumulative distribution within a window partition. It is similar to CUME_DIST in SQL

df.withColumn('cume_dist', cume_dist().over(windowPartition)).show()

                                                                                

+-------------+---+----------+------+---------+
|Employee_Name|Age|Department|Salary|cume_dist|
+-------------+---+----------+------+---------+
|          Ram| 28|     Sales|  3000|      0.4|
|          Ram| 28|     Sales|  3000|      0.4|
|        Meena| 33|     Sales|  4600|      0.6|
|       Sharad| 39|     Sales|  4100|      0.8|
|        Robin| 40|     Sales|  4100|      1.0|
|        Kunal| 25|   Finance|  3000|      0.5|
|         Jeny| 26|   Finance|  3900|      1.0|
|      Srishti| 46|Management|  3300|      1.0|
|      Kailash| 29| Marketing|  2000|      0.5|
|       Hitesh| 30| Marketing|  3000|      1.0|
+-------------+---+----------+------+---------+



In [16]:
# registering the df dataframe as a temp table
df.registerTempTable('emp_data')
spark.sql("select * from emp_data").show()

+-------------+---+----------+------+
|Employee_Name|Age|Department|Salary|
+-------------+---+----------+------+
|          Ram| 28|     Sales|  3000|
|        Meena| 33|     Sales|  4600|
|        Robin| 40|     Sales|  4100|
|        Kunal| 25|   Finance|  3000|
|          Ram| 28|     Sales|  3000|
|      Srishti| 46|Management|  3300|
|         Jeny| 26|   Finance|  3900|
|       Hitesh| 30| Marketing|  3000|
|      Kailash| 29| Marketing|  2000|
|       Sharad| 39|     Sales|  4100|
+-------------+---+----------+------+



In [17]:
# implementation of window funtion in SQL

# row_number()
# spark.sql("""select *, row_number() over(partition by Department order by Age) as rn from emp_data""").show()

# cum_dist()
spark.sql("""select *, cume_dist() over(partition by Department order by Age) as rn from emp_data""").show()

+-------------+---+----------+------+---+
|Employee_Name|Age|Department|Salary| rn|
+-------------+---+----------+------+---+
|          Ram| 28|     Sales|  3000|0.4|
|          Ram| 28|     Sales|  3000|0.4|
|        Meena| 33|     Sales|  4600|0.6|
|       Sharad| 39|     Sales|  4100|0.8|
|        Robin| 40|     Sales|  4100|1.0|
|        Kunal| 25|   Finance|  3000|0.5|
|         Jeny| 26|   Finance|  3900|1.0|
|      Srishti| 46|Management|  3300|1.0|
|      Kailash| 29| Marketing|  2000|0.5|
|       Hitesh| 30| Marketing|  3000|1.0|
+-------------+---+----------+------+---+



In [18]:
# lag() function

# A lag() function is used to access previous rowsâ€™ data as per the defined offset value in the function

df.withColumn('lag', lag('salary', 1).over(windowPartition)).show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary| lag|
+-------------+---+----------+------+----+
|          Ram| 28|     Sales|  3000|null|
|          Ram| 28|     Sales|  3000|3000|
|        Meena| 33|     Sales|  4600|3000|
|       Sharad| 39|     Sales|  4100|4600|
|        Robin| 40|     Sales|  4100|4100|
|        Kunal| 25|   Finance|  3000|null|
|         Jeny| 26|   Finance|  3900|3000|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|null|
|       Hitesh| 30| Marketing|  3000|2000|
+-------------+---+----------+------+----+



In [19]:
# implementation in SQL

spark.sql("""select *, lag(salary,1) over(partition by Department order by Age) as rn from emp_data""").show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary|  rn|
+-------------+---+----------+------+----+
|          Ram| 28|     Sales|  3000|null|
|          Ram| 28|     Sales|  3000|3000|
|        Meena| 33|     Sales|  4600|3000|
|       Sharad| 39|     Sales|  4100|4600|
|        Robin| 40|     Sales|  4100|4100|
|        Kunal| 25|   Finance|  3000|null|
|         Jeny| 26|   Finance|  3900|3000|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|null|
|       Hitesh| 30| Marketing|  3000|2000|
+-------------+---+----------+------+----+



In [20]:
# A lead() function is used to access next rows data as per the defined offset value in the function.

df.withColumn('lag', lead('salary', 2).over(windowPartition)).show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary| lag|
+-------------+---+----------+------+----+
|          Ram| 28|     Sales|  3000|4600|
|          Ram| 28|     Sales|  3000|4100|
|        Meena| 33|     Sales|  4600|4100|
|       Sharad| 39|     Sales|  4100|null|
|        Robin| 40|     Sales|  4100|null|
|        Kunal| 25|   Finance|  3000|null|
|         Jeny| 26|   Finance|  3900|null|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|null|
|       Hitesh| 30| Marketing|  3000|null|
+-------------+---+----------+------+----+



In [21]:
# using percent function

# df.withColumn('percent_rank', percent_rank().over(windowPartition)).show()

In [22]:
# implementation in SQL

spark.sql("""select *, lead(salary,1) over(partition by Department order by Age) as rn from emp_data""").show()

+-------------+---+----------+------+----+
|Employee_Name|Age|Department|Salary|  rn|
+-------------+---+----------+------+----+
|          Ram| 28|     Sales|  3000|3000|
|          Ram| 28|     Sales|  3000|4600|
|        Meena| 33|     Sales|  4600|4100|
|       Sharad| 39|     Sales|  4100|4100|
|        Robin| 40|     Sales|  4100|null|
|        Kunal| 25|   Finance|  3000|3900|
|         Jeny| 26|   Finance|  3900|null|
|      Srishti| 46|Management|  3300|null|
|      Kailash| 29| Marketing|  2000|3000|
|       Hitesh| 30| Marketing|  3000|null|
+-------------+---+----------+------+----+



In [23]:
## Ranking functions

# sample data for dataframe
sampleData = ((101, "Ram", "Biology", 80),
              (103, "Meena", "Social Science", 78),
              (104, "Robin", "Sanskrit", 58),
              (102, "Kunal", "Phisycs", 89),
              (101, "Ram", "Biology", 80),
              (106, "Srishti", "Maths", 70),
              (108, "Jeny", "Physics", 75),
              (107, "Hitesh", "Maths", 88),
              (109, "Kailash", "Maths", 90),
              (105, "Sharad", "Social Science", 84)
              )
 
# column names for dataframe
columns = ["Roll_No", "Student_Name", "Subject", "Marks"]
 

In [24]:
df2 = spark.createDataFrame(data = sampleData, schema=columns)

df2.show()

+-------+------------+--------------+-----+
|Roll_No|Student_Name|       Subject|Marks|
+-------+------------+--------------+-----+
|    101|         Ram|       Biology|   80|
|    103|       Meena|Social Science|   78|
|    104|       Robin|      Sanskrit|   58|
|    102|       Kunal|       Phisycs|   89|
|    101|         Ram|       Biology|   80|
|    106|     Srishti|         Maths|   70|
|    108|        Jeny|       Physics|   75|
|    107|      Hitesh|         Maths|   88|
|    109|     Kailash|         Maths|   90|
|    105|      Sharad|Social Science|   84|
+-------+------------+--------------+-----+



In [25]:
windowPartition = Window.partitionBy("Subject").orderBy(col("Marks").desc())

In [26]:
# Using row_number().
# row_number() function is used to gives a sequential number to each row present in the table

df2.withColumn('row_num', row_number().over(windowPartition)).show()

+-------+------------+--------------+-----+-------+
|Roll_No|Student_Name|       Subject|Marks|row_num|
+-------+------------+--------------+-----+-------+
|    102|       Kunal|       Phisycs|   89|      1|
|    105|      Sharad|Social Science|   84|      1|
|    103|       Meena|Social Science|   78|      2|
|    109|     Kailash|         Maths|   90|      1|
|    107|      Hitesh|         Maths|   88|      2|
|    106|     Srishti|         Maths|   70|      3|
|    108|        Jeny|       Physics|   75|      1|
|    104|       Robin|      Sanskrit|   58|      1|
|    101|         Ram|       Biology|   80|      1|
|    101|         Ram|       Biology|   80|      2|
+-------+------------+--------------+-----+-------+



In [27]:
# get the top record from each department

df2.withColumn('row_num', row_number().over(windowPartition)).show()

+-------+------------+--------------+-----+-------+
|Roll_No|Student_Name|       Subject|Marks|row_num|
+-------+------------+--------------+-----+-------+
|    102|       Kunal|       Phisycs|   89|      1|
|    105|      Sharad|Social Science|   84|      1|
|    103|       Meena|Social Science|   78|      2|
|    109|     Kailash|         Maths|   90|      1|
|    107|      Hitesh|         Maths|   88|      2|
|    106|     Srishti|         Maths|   70|      3|
|    108|        Jeny|       Physics|   75|      1|
|    104|       Robin|      Sanskrit|   58|      1|
|    101|         Ram|       Biology|   80|      1|
|    101|         Ram|       Biology|   80|      2|
+-------+------------+--------------+-----+-------+



In [28]:
# Using rank()
# The rank function is used to give ranks to rows specified in the window partition.

df2.withColumn('rank', rank().over(windowPartition)).show()

+-------+------------+--------------+-----+----+
|Roll_No|Student_Name|       Subject|Marks|rank|
+-------+------------+--------------+-----+----+
|    102|       Kunal|       Phisycs|   89|   1|
|    105|      Sharad|Social Science|   84|   1|
|    103|       Meena|Social Science|   78|   2|
|    109|     Kailash|         Maths|   90|   1|
|    107|      Hitesh|         Maths|   88|   2|
|    106|     Srishti|         Maths|   70|   3|
|    108|        Jeny|       Physics|   75|   1|
|    104|       Robin|      Sanskrit|   58|   1|
|    101|         Ram|       Biology|   80|   1|
|    101|         Ram|       Biology|   80|   1|
+-------+------------+--------------+-----+----+



In [29]:
# Using percent_rank()
# This function is similar to rank() function. It also provides rank to rows but in a percentile format

df2.withColumn('percent_rank', percent_rank().over(windowPartition)).show()

+-------+------------+--------------+-----+------------+
|Roll_No|Student_Name|       Subject|Marks|percent_rank|
+-------+------------+--------------+-----+------------+
|    102|       Kunal|       Phisycs|   89|         0.0|
|    105|      Sharad|Social Science|   84|         0.0|
|    103|       Meena|Social Science|   78|         1.0|
|    109|     Kailash|         Maths|   90|         0.0|
|    107|      Hitesh|         Maths|   88|         0.5|
|    106|     Srishti|         Maths|   70|         1.0|
|    108|        Jeny|       Physics|   75|         0.0|
|    104|       Robin|      Sanskrit|   58|         0.0|
|    101|         Ram|       Biology|   80|         0.0|
|    101|         Ram|       Biology|   80|         0.0|
+-------+------------+--------------+-----+------------+



In [30]:
# Using dense_rank()
# This function is used to get the rank of each row in the form of row numbers. This is similar to rank() function, there is only one difference the rank function leaves gaps in rank when there are ties

df2.withColumn('dense_rank', dense_rank().over(windowPartition)).show()

+-------+------------+--------------+-----+----------+
|Roll_No|Student_Name|       Subject|Marks|dense_rank|
+-------+------------+--------------+-----+----------+
|    102|       Kunal|       Phisycs|   89|         1|
|    105|      Sharad|Social Science|   84|         1|
|    103|       Meena|Social Science|   78|         2|
|    109|     Kailash|         Maths|   90|         1|
|    107|      Hitesh|         Maths|   88|         2|
|    106|     Srishti|         Maths|   70|         3|
|    108|        Jeny|       Physics|   75|         1|
|    104|       Robin|      Sanskrit|   58|         1|
|    101|         Ram|       Biology|   80|         1|
|    101|         Ram|       Biology|   80|         1|
+-------+------------+--------------+-----+----------+



In [31]:
# Aggregate function

# AVERAGE, SUM, MIN, MAX

In [32]:
# sample data for dataframe
sampleData = (("Ram", "Sales", 3000),
              ("Meena", "Sales", 4600),
              ("Robin", "Sales", 4100),
              ("Kunal", "Finance", 3000),
              ("Ram", "Sales", 3000),
              ("Srishti", "Management", 3300),
              ("Jeny", "Finance", 3900),
              ("Hitesh", "Marketing", 3000),
              ("Kailash", "Marketing", 2000),
              ("Sharad", "Sales", 4100)
              )
 
# column names for dataframe
columns = ["Employee_Name", "Department", "Salary"]

In [33]:
df3 = spark.createDataFrame(data = sampleData, schema=columns)

In [34]:
df3.show()

+-------------+----------+------+
|Employee_Name|Department|Salary|
+-------------+----------+------+
|          Ram|     Sales|  3000|
|        Meena|     Sales|  4600|
|        Robin|     Sales|  4100|
|        Kunal|   Finance|  3000|
|          Ram|     Sales|  3000|
|      Srishti|Management|  3300|
|         Jeny|   Finance|  3900|
|       Hitesh| Marketing|  3000|
|      Kailash| Marketing|  2000|
|       Sharad|     Sales|  4100|
+-------------+----------+------+



In [35]:
windowPartitionAgg  = Window.partitionBy("Department")

In [36]:
# avg()

df3.withColumn('avg', avg(col('salary')).over(windowPartitionAgg)).show()

+-------------+----------+------+------+
|Employee_Name|Department|Salary|   avg|
+-------------+----------+------+------+
|       Sharad|     Sales|  4100|3760.0|
|          Ram|     Sales|  3000|3760.0|
|        Meena|     Sales|  4600|3760.0|
|        Robin|     Sales|  4100|3760.0|
|          Ram|     Sales|  3000|3760.0|
|        Kunal|   Finance|  3000|3450.0|
|         Jeny|   Finance|  3900|3450.0|
|      Srishti|Management|  3300|3300.0|
|       Hitesh| Marketing|  3000|2500.0|
|      Kailash| Marketing|  2000|2500.0|
+-------------+----------+------+------+



In [37]:
# sum()

df3.withColumn('avg', sum(col('salary')).over(windowPartitionAgg)).show()

+-------------+----------+------+-----+
|Employee_Name|Department|Salary|  avg|
+-------------+----------+------+-----+
|       Sharad|     Sales|  4100|18800|
|          Ram|     Sales|  3000|18800|
|        Meena|     Sales|  4600|18800|
|        Robin|     Sales|  4100|18800|
|          Ram|     Sales|  3000|18800|
|         Jeny|   Finance|  3900| 6900|
|        Kunal|   Finance|  3000| 6900|
|      Srishti|Management|  3300| 3300|
|       Hitesh| Marketing|  3000| 5000|
|      Kailash| Marketing|  2000| 5000|
+-------------+----------+------+-----+



In [38]:
# min()

df3.withColumn('avg', min(col('salary')).over(windowPartitionAgg)).show()

+-------------+----------+------+----+
|Employee_Name|Department|Salary| avg|
+-------------+----------+------+----+
|          Ram|     Sales|  3000|3000|
|        Meena|     Sales|  4600|3000|
|        Robin|     Sales|  4100|3000|
|          Ram|     Sales|  3000|3000|
|       Sharad|     Sales|  4100|3000|
|         Jeny|   Finance|  3900|3000|
|        Kunal|   Finance|  3000|3000|
|      Srishti|Management|  3300|3300|
|       Hitesh| Marketing|  3000|2000|
|      Kailash| Marketing|  2000|2000|
+-------------+----------+------+----+



In [39]:
# max()

df3.withColumn('avg', max(col('salary')).over(windowPartitionAgg)).show()

+-------------+----------+------+----+
|Employee_Name|Department|Salary| avg|
+-------------+----------+------+----+
|          Ram|     Sales|  3000|4600|
|        Meena|     Sales|  4600|4600|
|        Robin|     Sales|  4100|4600|
|          Ram|     Sales|  3000|4600|
|       Sharad|     Sales|  4100|4600|
|         Jeny|   Finance|  3900|3900|
|        Kunal|   Finance|  3000|3900|
|      Srishti|Management|  3300|3300|
|       Hitesh| Marketing|  3000|3000|
|      Kailash| Marketing|  2000|3000|
+-------------+----------+------+----+

