In [0]:
# Reference for the exercises below (PySpark window functions):
# https://www.geeksforgeeks.org/pyspark-window-functions/

In [0]:
from pyspark.sql.functions import *

In [0]:
# Locations of the three input CSV files in the workspace file store.
sales, products, sellers = (
    r"/FileStore/tables/sales.csv",
    r"/FileStore/tables/products.csv",
    r"/FileStore/tables/sellers.csv",
)

# Each file has a header row; column types are inferred from the data.
# NOTE(review): none of these three frames is referenced by the later cells
# visible here -- confirm they are still needed.
sales_df = spark.read.options(header=True, inferSchema=True).csv(sales)
products_df = spark.read.options(header=True, inferSchema=True).csv(products)
sellers_df = spark.read.options(header=True, inferSchema=True).csv(sellers)

In [0]:
from pyspark.sql.window import Window

In [0]:
# Employee rows: (Employee_Name, Age, Department, Salary).
# The "Ram" row appears twice, so the data contains an exact duplicate.
sampleData = (
    ("Ram", 28, "Sales", 3000),
    ("Meena", 33, "Sales", 4600),
    ("Robin", 40, "Sales", 4100),
    ("Kunal", 25, "Finance", 3000),
    ("Ram", 28, "Sales", 3000),
    ("Srishti", 46, "Management", 3300),
    ("Jeny", 26, "Finance", 3900),
    ("Hitesh", 30, "Marketing", 3000),
    ("Kailash", 29, "Marketing", 2000),
    ("Sharad", 39, "Sales", 4100),
)

In [0]:
# Echo the raw tuples as the cell output so the input rows can be inspected.
sampleData

In [0]:
# Column names, in the same order as the fields of each sampleData row.
columns = ["Employee_Name", "Age", "Department", "Salary"]

In [0]:
# Echo the column-name list as the cell output.
columns

In [0]:
# Build the employee DataFrame from the in-memory rows; `columns` supplies the
# column names and Spark infers the column types from the values.
df = spark.createDataFrame(sampleData, columns)

In [0]:
# Print the employee DataFrame (show() prints the first 20 rows by default).
df.show()

In [0]:
# Window specification used by the employee examples:
# rows are grouped by Department and ordered by Age (ascending) within each group.
windowPartition = (
    Window
    .partitionBy(col("Department"))
    .orderBy(col("Age"))
)

In [0]:
# cume_dist() gives the cumulative distribution of each row within its window
# partition: the fraction of partition rows whose ordering value is less than or
# equal to the current row's. It corresponds to CUME_DIST in SQL.
df.select(
    "*",
    cume_dist().over(windowPartition).alias('cume_dist'),
).show()

In [0]:
# Expose df to Spark SQL under the name emp_data.
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0); it overwrites an existing view of the same name,
# so this cell can be re-run safely.
df.createOrReplaceTempView('emp_data')

# Sanity check: the view returns the same rows as df.
spark.sql("select * from emp_data").show()

In [0]:
# Window functions written in Spark SQL against the emp_data view.

# row_number()
# spark.sql("""select *, row_number() over(partition by Department order by Age) as rn from emp_data""").show()

# cume_dist()
# The result is aliased cume_dist so the column header states what it holds
# (an alias such as rn would suggest a row number).
spark.sql("""select *, cume_dist() over(partition by Department order by Age) as cume_dist from emp_data""").show()

In [0]:
# lag(): value of a column from an earlier row in the same window partition.
# With offset 1 each row receives the salary of the row directly before it
# (by Age, within the Department); the first row of a partition gets null.
df.select(
    "*",
    lag('salary', 1).over(windowPartition).alias('lag'),
).show()

In [0]:
# lag() in Spark SQL: same window (partition by Department, order by Age) and
# offset 1 as the DataFrame example. The result is aliased lag, matching the
# DataFrame column name, instead of the misleading rn.
spark.sql("""select *, lag(salary,1) over(partition by Department order by Age) as lag from emp_data""").show()

In [0]:
# lead(): value of a column from a later row in the same window partition.
# With offset 2 each row receives the salary of the row two positions after it;
# rows with fewer than two following rows in their partition get null.
# The output column is named 'lead' so the header matches the function used.
df.withColumn('lead', lead('salary', 2).over(windowPartition)).show()

In [0]:
# using the percent_rank() function (the example below is currently commented out)

# df.withColumn('percent_rank', percent_rank().over(windowPartition)).show()

In [0]:
# lead() in Spark SQL over the same window (partition by Department, order by Age).
# The offset is 2 so the query mirrors the DataFrame lead('salary', 2) example,
# and the result is aliased lead rather than rn so the header says what it holds.
spark.sql("""select *, lead(salary, 2) over(partition by Department order by Age) as lead from emp_data""").show()

In [0]:
## Ranking functions

# Student rows: (Roll_No, Student_Name, Subject, Marks).
# The subject of roll no. 102 is spelled "Physics": a misspelled value would
# form its own partition when the data is later partitioned by Subject.
# The (101, "Ram", "Biology", 80) row appears twice.
sampleData = ((101, "Ram", "Biology", 80),
              (103, "Meena", "Social Science", 78),
              (104, "Robin", "Sanskrit", 58),
              (102, "Kunal", "Physics", 89),
              (101, "Ram", "Biology", 80),
              (106, "Srishti", "Maths", 70),
              (108, "Jeny", "Physics", 75),
              (107, "Hitesh", "Maths", 88),
              (109, "Kailash", "Maths", 90),
              (105, "Sharad", "Social Science", 84)
              )

# column names for dataframe, in the same order as the fields of each row
columns = ["Roll_No", "Student_Name", "Subject", "Marks"]
 

In [0]:
# Build the student DataFrame from the current sampleData / columns and print it.
df2 = spark.createDataFrame(sampleData, columns)
df2.show()

In [0]:
# Window for the ranking examples: one partition per Subject, highest Marks first.
# NOTE: this rebinds windowPartition, replacing the Department/Age specification
# defined earlier in the notebook.
windowPartition = Window.partitionBy("Subject").orderBy(desc("Marks"))

In [0]:
# row_number(): assigns consecutive numbers (1, 2, 3, ...) to the rows of each
# window partition, following the window's ordering.
df2.select(
    "*",
    row_number().over(windowPartition).alias('row_num'),
).show()

In [0]:
# Top record per subject: number the rows of each Subject partition by Marks
# (highest first, per windowPartition) and keep only the row numbered 1.
# When several rows tie for the highest Marks, row_number() picks one of them
# arbitrarily.
(
    df2
    .withColumn('row_num', row_number().over(windowPartition))
    .filter(col('row_num') == 1)
    .show()
)

In [0]:
# rank(): ranks the rows of each window partition by the window's ordering.
# Tied rows share a rank and the following ranks are skipped (e.g. 1, 1, 3).
df2.select(
    "*",
    rank().over(windowPartition).alias('rank'),
).show()

In [0]:
# percent_rank(): like rank(), but expressed as a relative rank between 0 and 1,
# i.e. (rank - 1) / (rows in partition - 1).
df2.select(
    "*",
    percent_rank().over(windowPartition).alias('percent_rank'),
).show()

In [0]:
# dense_rank(): ranks the rows of each window partition like rank(), with one
# difference: after a tie it continues with the next consecutive rank, whereas
# rank() leaves gaps in the sequence.
df2.select(
    "*",
    dense_rank().over(windowPartition).alias('dense_rank'),
).show()

In [0]:
# Aggregate functions applied over a window

# avg(), sum(), min(), max()

In [0]:
# Employee rows for the aggregate examples: (Employee_Name, Department, Salary).
# The ("Ram", "Sales", 3000) row appears twice.
sampleData = (
    ("Ram", "Sales", 3000),
    ("Meena", "Sales", 4600),
    ("Robin", "Sales", 4100),
    ("Kunal", "Finance", 3000),
    ("Ram", "Sales", 3000),
    ("Srishti", "Management", 3300),
    ("Jeny", "Finance", 3900),
    ("Hitesh", "Marketing", 3000),
    ("Kailash", "Marketing", 2000),
    ("Sharad", "Sales", 4100),
)

# Column names, in the same order as the fields of each row.
columns = ["Employee_Name", "Department", "Salary"]

In [0]:
# Build the DataFrame for the aggregate examples from the current sampleData / columns.
df3 = spark.createDataFrame(sampleData, columns)

In [0]:
# Print the aggregate-example DataFrame (show() prints the first 20 rows by default).
df3.show()

In [0]:
# Window for the aggregate examples: one partition per Department and no ordering,
# so each aggregate is computed over the whole partition.
windowPartitionAgg = Window.partitionBy(col("Department"))

In [0]:
# avg(): average salary of the row's Department, repeated on every row of that
# Department.
df3.select(
    "*",
    avg('salary').over(windowPartitionAgg).alias('avg'),
).show()

In [0]:
# sum(): total salary of the row's Department, repeated on every row of that
# Department. The output column is named 'sum' so the header matches the
# aggregate being shown.
# `sum` here is the Spark column function brought in by the star import of
# pyspark.sql.functions, not the Python builtin.
df3.withColumn('sum', sum(col('salary')).over(windowPartitionAgg)).show()

In [0]:
# min(): lowest salary of the row's Department, repeated on every row of that
# Department. The output column is named 'min' so the header matches the
# aggregate being shown.
# `min` here is the Spark column function brought in by the star import of
# pyspark.sql.functions, not the Python builtin.
df3.withColumn('min', min(col('salary')).over(windowPartitionAgg)).show()

In [0]:
# max(): highest salary of the row's Department, repeated on every row of that
# Department. The output column is named 'max' so the header matches the
# aggregate being shown.
# `max` here is the Spark column function brought in by the star import of
# pyspark.sql.functions, not the Python builtin.
df3.withColumn('max', max(col('salary')).over(windowPartitionAgg)).show()