In [1]:
# Bootstrap cell: runs before any of the pyspark imports in the cells below.
# NOTE(review): presumably findspark.init() locates the local Spark installation
# and makes the pyspark package importable — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration built in one fluent expression: the master is "local[4]"
# and the application is named "WindowFunction".
# Further settings can be added with config.set("property", "value").
config = SparkConf().setMaster("local[4]").setAppName("WindowFunction")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = SparkSession.builder.config(conf=config).getOrCreate()

# SparkContext that belongs to the session, kept under the short name `sc`.
sc = spark.sparkContext

22/05/10 00:07:00 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/10 00:07:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/10 00:07:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/10 00:07:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/10 00:07:12 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/10 00:07:12 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/10 00:07:12 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
2

In [16]:
# Sample employee records as (name, dept, salary) tuples.
# ("James", "Sales", 3000) is listed twice, so the data contains fully
# identical rows in addition to rows that merely share a salary.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
    ("Joe", "Sales", 4200),
    ("Venkat", "Sales", 4000),
]

# Column names are supplied explicitly; the column types are inferred from the
# tuples (see the printed schema).
empDf = spark.createDataFrame(data, schema=["name", "dept", "salary"])
empDf.printSchema()
empDf.show()

# Last expression of the cell: number of partitions of the underlying RDD.
empDf.rdd.getNumPartitions()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)

+-------+---------+------+
|   name|     dept|salary|
+-------+---------+------+
|  James|    Sales|  3000|
|Michael|    Sales|  4600|
| Robert|    Sales|  4100|
|  Maria|  Finance|  3000|
|  James|    Sales|  3000|
|  Scott|  Finance|  3300|
|    Jen|  Finance|  3900|
|   Jeff|Marketing|  3000|
|  Kumar|Marketing|  2000|
|   Saif|    Sales|  4100|
|    Joe|    Sales|  4200|
| Venkat|    Sales|  4000|
+-------+---------+------+



4

In [4]:
# Show how the rows are spread over the RDD partitions: the result is a list
# holding one inner list of Row objects per partition.
empDf.rdd.glom().collect()

                                                                                

[[Row(name='James', dept='Sales', salary=3000),
  Row(name='Michael', dept='Sales', salary=4600)],
 [Row(name='Robert', dept='Sales', salary=4100),
  Row(name='Maria', dept='Finance', salary=3000)],
 [Row(name='James', dept='Sales', salary=3000),
  Row(name='Scott', dept='Finance', salary=3300)],
 [Row(name='Jen', dept='Finance', salary=3900),
  Row(name='Jeff', dept='Marketing', salary=3000),
  Row(name='Kumar', dept='Marketing', salary=2000),
  Row(name='Saif', dept='Sales', salary=4100)]]

In [5]:
# Write the employees as CSV with a header row, partitioned on disk by the
# "dept" column.
#
# mode("overwrite") replaces the output of an earlier run. Without it the
# writer uses its default save mode, which raises an error when the target path
# already exists, so this cell could only ever be executed once (it would break
# Restart Kernel -> Run All).
#
# NOTE(review): the absolute path is machine-specific; consider moving it to a
# configurable constant.
(
    empDf.write
    .mode("overwrite")
    .option("header", True)
    .partitionBy("dept")
    .csv("/home/ubuntu/employees")
)

                                                                                

In [10]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Window specification: rows are grouped per department and, inside each
# department, sorted by salary from lowest to highest.
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy("salary")
)

# Apply the specification to the DataFrame: row_number() numbers the rows of
# every department 1, 2, 3, ... following the window order. Rows with the same
# salary still receive different numbers.
# NOTE(review): the relative order of rows with equal salary is presumably not
# guaranteed — add a tie-breaker column to orderBy if that matters.
df = empDf.withColumn("slno", row_number().over(windowSpec))

df.printSchema()
df.show()

# Keep only the row numbered 1 in each department (a lowest-salary row).
df.where(df["slno"] == 1).show()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- slno: integer (nullable = true)



                                                                                

+-------+---------+------+----+
|   name|     dept|salary|slno|
+-------+---------+------+----+
|  James|    Sales|  3000|   1|
|  James|    Sales|  3000|   2|
| Robert|    Sales|  4100|   3|
|   Saif|    Sales|  4100|   4|
|Michael|    Sales|  4600|   5|
|  Maria|  Finance|  3000|   1|
|  Scott|  Finance|  3300|   2|
|    Jen|  Finance|  3900|   3|
|  Kumar|Marketing|  2000|   1|
|   Jeff|Marketing|  3000|   2|
+-------+---------+------+----+





+-----+---------+------+----+
| name|     dept|salary|slno|
+-----+---------+------+----+
|James|    Sales|  3000|   1|
|Maria|  Finance|  3000|   1|
|Kumar|Marketing|  2000|   1|
+-----+---------+------+----+



                                                                                

In [11]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

# rank() with gaps, salaries in ASCENDING order inside each department.
# Tied rows share a rank and the ranks that follow a tie are skipped:
#
#   salary  rank
#   3000    1
#   3000    1
#   4100    3   <- gap: rank 2 is not used
#
# The lowest salary of a department therefore gets rank 1.
windowSpec = Window.partitionBy("dept").orderBy("salary")

df = empDf.withColumn("rank", rank().over(windowSpec))
df.show()

                                                                                

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|  James|    Sales|  3000|   1|
|  James|    Sales|  3000|   1|
| Robert|    Sales|  4100|   3|
|   Saif|    Sales|  4100|   3|
|Michael|    Sales|  4600|   5|
|  Maria|  Finance|  3000|   1|
|  Scott|  Finance|  3300|   2|
|    Jen|  Finance|  3900|   3|
|  Kumar|Marketing|  2000|   1|
|   Jeff|Marketing|  3000|   2|
+-------+---------+------+----+



                                                                                

In [12]:
from pyspark.sql.functions import desc, rank
from pyspark.sql.window import Window

# rank() leaves a gap after ties. The window sorts salaries in descending
# order, so the highest salary of each department gets rank 1.
"""
score  rank
90      1
90      1
89      3  [gap, 2 not included]
"""
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy(desc("salary"))
)

# All original columns plus the computed "rank" column appended at the end.
df = empDf.select("*", rank().over(windowSpec).alias("rank"))
df.show()

                                                                                

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600|   1|
| Robert|    Sales|  4100|   2|
|   Saif|    Sales|  4100|   2|
|  James|    Sales|  3000|   4|
|  James|    Sales|  3000|   4|
|    Jen|  Finance|  3900|   1|
|  Scott|  Finance|  3300|   2|
|  Maria|  Finance|  3000|   3|
|   Jeff|Marketing|  3000|   1|
|  Kumar|Marketing|  2000|   2|
+-------+---------+------+----+



                                                                                

In [13]:
from pyspark.sql.functions import dense_rank, desc
from pyspark.sql.window import Window

# dense_rank(): tied rows share a rank and the next distinct value gets the
# directly following rank, so no rank numbers are skipped.
"""
score  rank
90      1
90      1
89      2  
"""
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy(desc("salary"))
)

# All original columns plus the computed "rank" column appended at the end.
df = empDf.select("*", dense_rank().over(windowSpec).alias("rank"))
df.show()

                                                                                

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600|   1|
| Robert|    Sales|  4100|   2|
|   Saif|    Sales|  4100|   2|
|  James|    Sales|  3000|   3|
|  James|    Sales|  3000|   3|
|    Jen|  Finance|  3900|   1|
|  Scott|  Finance|  3300|   2|
|  Maria|  Finance|  3000|   3|
|   Jeff|Marketing|  3000|   1|
|  Kumar|Marketing|  2000|   2|
+-------+---------+------+----+



                                                                                

In [14]:
from pyspark.sql.functions import desc, percent_rank
from pyspark.sql.window import Window

# percent_rank(): relative rank of a row inside its department, as a fraction
# between 0 and 1:
#
#     percent_rank = (rank - 1) / (rows_in_partition - 1)
#
# Example for a department with five rows, salaries in descending order:
#
#   salary  rank  percent_rank
#   4600    1     0.0
#   4100    2     0.25
#   4100    2     0.25
#   3000    4     0.75
#   3000    4     0.75
windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

# The result column keeps the name "rank" even though it holds a fraction.
df = empDf.withColumn("rank", percent_rank().over(windowSpec))
df.show()

                                                                                

+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600| 0.0|
| Robert|    Sales|  4100|0.25|
|   Saif|    Sales|  4100|0.25|
|  James|    Sales|  3000|0.75|
|  James|    Sales|  3000|0.75|
|    Jen|  Finance|  3900| 0.0|
|  Scott|  Finance|  3300| 0.5|
|  Maria|  Finance|  3000| 1.0|
|   Jeff|Marketing|  3000| 0.0|
|  Kumar|Marketing|  2000| 1.0|
+-------+---------+------+----+



                                                                                

In [19]:
from pyspark.sql.functions import desc, ntile
from pyspark.sql.window import Window

# ntile(4): splits the ordered rows of each department into 4 buckets that are
# as equal in size as possible and labels every row with its bucket number.
#   - seven rows  -> bucket sizes 2, 2, 2, 1 (buckets 1..4)
#   - three rows  -> one row each in buckets 1, 2, 3 (bucket 4 stays unused)
# Because bucket membership depends only on the row position, rows with the
# same salary can end up in different buckets.
windowSpec = Window.partitionBy("dept").orderBy(desc("salary"))

# The result column keeps the name "rank"; it holds the bucket number.
df = empDf.withColumn("rank", ntile(4).over(windowSpec))
df.show()



+-------+---------+------+----+
|   name|     dept|salary|rank|
+-------+---------+------+----+
|Michael|    Sales|  4600|   1|
|    Joe|    Sales|  4200|   1|
| Robert|    Sales|  4100|   2|
|   Saif|    Sales|  4100|   2|
| Venkat|    Sales|  4000|   3|
|  James|    Sales|  3000|   3|
|  James|    Sales|  3000|   4|
|    Jen|  Finance|  3900|   1|
|  Scott|  Finance|  3300|   2|
|  Maria|  Finance|  3000|   3|
|   Jeff|Marketing|  3000|   1|
|  Kumar|Marketing|  2000|   2|
+-------+---------+------+----+



                                                                                

In [20]:
# Analytic functions
#
# cume_dist(): cumulative distribution of a row within its department.
# Like the rank functions it follows the window ordering; the value is a
# fraction bounded between 0 and 1:
#
#     (rows ordered before the current row + rows tied with it, itself included)
#     / (rows in the department)
#
# Side note kept from the author (a relative gain written as a fraction; this
# is not a cume_dist computation):
#   10 USD per share  => 13 USD per share   = 3 USD per share,  30 % gain -> .3
#   100 USD per share => 110 USD per share  = 10 USD per share, 10 % gain -> .1
from pyspark.sql.functions import cume_dist, desc
from pyspark.sql.window import Window

# Per department, highest salary first.
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy(desc("salary"))
)

# All original columns plus the computed "cume_dist" column appended at the end.
df = empDf.select("*", cume_dist().over(windowSpec).alias("cume_dist"))
df.show()

                                                                                

+-------+---------+------+-------------------+
|   name|     dept|salary|          cume_dist|
+-------+---------+------+-------------------+
|Michael|    Sales|  4600|0.14285714285714285|
|    Joe|    Sales|  4200| 0.2857142857142857|
| Robert|    Sales|  4100| 0.5714285714285714|
|   Saif|    Sales|  4100| 0.5714285714285714|
| Venkat|    Sales|  4000| 0.7142857142857143|
|  James|    Sales|  3000|                1.0|
|  James|    Sales|  3000|                1.0|
|    Jen|  Finance|  3900| 0.3333333333333333|
|  Scott|  Finance|  3300| 0.6666666666666666|
|  Maria|  Finance|  3000|                1.0|
|   Jeff|Marketing|  3000|                0.5|
|  Kumar|Marketing|  2000|                1.0|
+-------+---------+------+-------------------+



                                                                                

In [23]:
# lag(): value taken from an earlier row of the same window partition.
# (desc is imported here but not used in this cell.)
from pyspark.sql.functions import desc, lag
from pyspark.sql.window import Window

# Per department, salaries in ascending order.
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy("salary")
)

# Offset 1 = salary of the row directly before the current one. The first row
# of every department has no predecessor and gets null.
df = empDf.select("*", lag("salary", 1).over(windowSpec).alias("lag"))
df.show()

+-------+---------+------+----+
|   name|     dept|salary| lag|
+-------+---------+------+----+
|  James|    Sales|  3000|null|
|  James|    Sales|  3000|3000|
| Venkat|    Sales|  4000|3000|
| Robert|    Sales|  4100|4000|
|   Saif|    Sales|  4100|4100|
|    Joe|    Sales|  4200|4100|
|Michael|    Sales|  4600|4200|
|  Maria|  Finance|  3000|null|
|  Scott|  Finance|  3300|3000|
|    Jen|  Finance|  3900|3300|
|  Kumar|Marketing|  2000|null|
|   Jeff|Marketing|  3000|2000|
+-------+---------+------+----+



In [26]:
# lead(): value taken from a later row of the same window partition.
# (desc is imported here but not used in this cell.)
from pyspark.sql.functions import desc, lead
from pyspark.sql.window import Window

# Per department, salaries in ascending order.
windowSpec = (
    Window
    .partitionBy("dept")
    .orderBy("salary")
)

# Offset 1 = salary of the row directly after the current one. The last row of
# every department has no successor and gets null.
df = empDf.select("*", lead("salary", 1).over(windowSpec).alias("lead"))
df.show()

+-------+---------+------+----+
|   name|     dept|salary|lead|
+-------+---------+------+----+
|  James|    Sales|  3000|3000|
|  James|    Sales|  3000|4000|
| Venkat|    Sales|  4000|4100|
| Robert|    Sales|  4100|4100|
|   Saif|    Sales|  4100|4200|
|    Joe|    Sales|  4200|4600|
|Michael|    Sales|  4600|null|
|  Maria|  Finance|  3000|3300|
|  Scott|  Finance|  3300|3900|
|    Jen|  Finance|  3900|null|
|  Kumar|Marketing|  2000|3000|
|   Jeff|Marketing|  3000|null|
+-------+---------+------+----+



In [27]:
# Aggregate functions (min, max, avg, count, sum) evaluated over a window.

# NOTE: this import rebinds the names min, max and sum in the notebook
# namespace to the Spark column functions for every cell executed afterwards.
from pyspark.sql.functions import avg, col, count, max, min, sum
from pyspark.sql.window import Window

# No orderBy here: the window is the whole department, so every row of a
# department carries the same aggregate values.
windowSpec = Window.partitionBy("dept")

# One projection: all original columns followed by the five aggregates, in the
# order min, max, avg, count, sum.
df = empDf.select(
    "*",
    min(col("salary")).over(windowSpec).alias("min"),
    max(col("salary")).over(windowSpec).alias("max"),
    avg(col("salary")).over(windowSpec).alias("avg"),
    count(col("salary")).over(windowSpec).alias("count"),
    sum(col("salary")).over(windowSpec).alias("sum"),
)

df.show()



+-------+---------+------+----+----+------------------+-----+-----+
|   name|     dept|salary| min| max|               avg|count|  sum|
+-------+---------+------+----+----+------------------+-----+-----+
|  James|    Sales|  3000|3000|4600|3857.1428571428573|    7|27000|
|Michael|    Sales|  4600|3000|4600|3857.1428571428573|    7|27000|
| Robert|    Sales|  4100|3000|4600|3857.1428571428573|    7|27000|
|  James|    Sales|  3000|3000|4600|3857.1428571428573|    7|27000|
|   Saif|    Sales|  4100|3000|4600|3857.1428571428573|    7|27000|
|    Joe|    Sales|  4200|3000|4600|3857.1428571428573|    7|27000|
| Venkat|    Sales|  4000|3000|4600|3857.1428571428573|    7|27000|
|  Maria|  Finance|  3000|3000|3900|            3400.0|    3|10200|
|  Scott|  Finance|  3300|3000|3900|            3400.0|    3|10200|
|    Jen|  Finance|  3900|3000|3900|            3400.0|    3|10200|
|   Jeff|Marketing|  3000|2000|3000|            2500.0|    2| 5000|
|  Kumar|Marketing|  2000|2000|3000|            

                                                                                