In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the session that every later cell goes through.
spark = (
    SparkSession.builder
    .appName("SparkApp-DF-Imp-Functions")
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [24]:
# Single-row placeholder DataFrame (one `id` column) used to evaluate
# column expressions in the cells that follow.
df = spark.range(0, 1)
df.show()

+---+
| id|
+---+
|  0|
+---+



### Date and Time Functions

#### current_date()

In [25]:
from pyspark.sql.functions import current_date

# Build the projection once, then show both its value and its schema.
today_df = df.select(current_date())
today_df.show()
today_df.printSchema()

+--------------+
|current_date()|
+--------------+
|    2024-12-02|
+--------------+

root
 |-- current_date(): date (nullable = false)



#### to_date()
Converts a date stored as a string into a value of date type.

In [26]:
from pyspark.sql.functions import lit
# A bare literal is a string column; printing the schema makes that visible.
literal_df = df.select(lit("2024-12-02"))
literal_df.printSchema()

root
 |-- 2024-12-02: string (nullable = false)



In [28]:
from pyspark.sql.functions import to_date, lit

# Parse the string literal into a date column using an explicit pattern.
parsed_date = to_date(col=lit("2024-12-02"), format="yyyy-MM-dd")
format_date_df = df.select(parsed_date.alias("formatted_date"))
format_date_df.show()
format_date_df.printSchema()

+--------------+
|formatted_date|
+--------------+
|    2024-12-02|
+--------------+

root
 |-- formatted_date: date (nullable = true)



#### date_format()
Returns the date as a string, formatted according to the specified pattern.

In [29]:
from pyspark.sql.functions import date_format, lit

# Render the literal date as an "MM/dd/yyyy" string.
us_style_date = date_format(date=lit("2024-12-02"), format="MM/dd/yyyy")
df.select(us_style_date.alias("formatted_date")).show()

+--------------+
|formatted_date|
+--------------+
|    12/02/2024|
+--------------+



In [35]:
# Same formatting, applied to the existing date column instead of a literal.
new_format_col = date_format(date="formatted_date", format="MM/dd/yyyy").alias("new_format")
df = format_date_df.select(new_format_col)
df.show()
df.printSchema()

+----------+
|new_format|
+----------+
|12/02/2024|
+----------+

root
 |-- new_format: string (nullable = true)



#### Periods

In [43]:
# Two (d1, d2) string pairs: d2 falls after d1 in the first row and before it in the second.
data = [
    ("2024-12-01", "2024-12-14"),
    ("2024-12-01", "2024-07-17"),
]

df = spark.createDataFrame(data, schema=["d1", "d2"])
df.show()

+----------+----------+
|        d1|        d2|
+----------+----------+
|2024-12-01|2024-12-14|
|2024-12-01|2024-07-17|
+----------+----------+



In [44]:
from pyspark.sql.functions import datediff

# Days from d1 (start) to d2 (end); the result is negative when d2 precedes d1.
days_apart = datediff(start="d1", end="d2").alias("datediff")
df.select("d1", "d2", days_apart).show()

+----------+----------+--------+
|        d1|        d2|datediff|
+----------+----------+--------+
|2024-12-01|2024-12-14|      13|
|2024-12-01|2024-07-17|    -137|
+----------+----------+--------+



In [48]:
from pyspark.sql.functions import date_add

# Shift d2 forward by ten days.
d2_plus_ten = date_add(start="d2", days=10).alias("add_d2_days")
df.select("d2", d2_plus_ten).show()

+----------+-----------+
|        d2|add_d2_days|
+----------+-----------+
|2024-12-14| 2024-12-24|
|2024-07-17| 2024-07-27|
+----------+-----------+



In [51]:
from pyspark.sql.functions import date_sub

# Shift d2 back by ten days.
d2_minus_ten = date_sub(start="d2", days=10).alias("d2_date_sub")
df.select("d2", d2_minus_ten).show()

+----------+-----------+
|        d2|d2_date_sub|
+----------+-----------+
|2024-12-14| 2024-12-04|
|2024-07-17| 2024-07-07|
+----------+-----------+



In [49]:
from pyspark.sql.functions import months_between

# Fractional, signed number of months from d2 to d1 (positive when d1 is later).
months_gap = months_between(date1="d1", date2="d2").alias("months_between")
df.select("d1", "d2", months_gap).show()

+----------+----------+--------------+
|        d1|        d2|months_between|
+----------+----------+--------------+
|2024-12-01|2024-12-14|   -0.41935484|
|2024-12-01|2024-07-17|    4.48387097|
+----------+----------+--------------+



In [None]:
from pyspark.sql.functions import add_months

# Shift d2 forward by five calendar months.
d2_plus_five_months = add_months(start="d2", months=5).alias("add_d2_months")
df.select("d2", d2_plus_five_months).show()

+----------+-------------+
|        d2|add_d2_months|
+----------+-------------+
|2024-12-14|   2025-05-14|
|2024-07-17|   2024-12-17|
+----------+-------------+



#### Timestamp

In [53]:
# Reset df to a fresh single-row DataFrame for the timestamp examples.
df = spark.range(0, 1)

In [55]:
from pyspark.sql.functions import current_timestamp

# truncate=False keeps the full fractional-second value visible.
now_df = df.select(current_timestamp())
now_df.show(truncate=False)

+--------------------------+
|current_timestamp()       |
+--------------------------+
|2024-12-02 11:28:03.794376|
+--------------------------+



In [59]:
# The literal is still a plain string here, as the printed schema shows.
ts_literal = lit("2024-12-02 11:28:03")
df = df.select(ts_literal)
df.printSchema()

root
 |-- 2024-12-02 11:28:03: string (nullable = false)



In [62]:
from pyspark.sql.functions import to_timestamp

# Parse the string literal into a timestamp column named "timestamp".
parsed_ts = to_timestamp(lit("2024-12-02 11:28:03")).alias("timestamp")
df = df.select(parsed_ts)
df.show(truncate=False)
df.printSchema()

+-------------------+
|timestamp          |
+-------------------+
|2024-12-02 11:28:03|
+-------------------+

root
 |-- timestamp: timestamp (nullable = true)



In [None]:
from pyspark.sql.functions import day, year, month

# Show each date component of the timestamp in its own table, in year/month/day order.
for extract_part in (year, month, day):
    df.select(extract_part("timestamp")).show(truncate=False)

+---------------+
|year(timestamp)|
+---------------+
|2024           |
+---------------+

+----------------+
|month(timestamp)|
+----------------+
|12              |
+----------------+

+--------------+
|day(timestamp)|
+--------------+
|2             |
+--------------+



In [66]:
from pyspark.sql.functions import second

# Extract the seconds component of the timestamp.
seconds_df = df.select(second("timestamp"))
seconds_df.show(truncate=False)

+-----------------+
|second(timestamp)|
+-----------------+
|3                |
+-----------------+



### Aggregate Functions
- count, count_distinct
- sum, avg
- min, max
- collect_list and collect_set

In [67]:
# Load the employee CSV: the first line supplies the column names and
# column types are inferred from the data.
df = spark.read.csv(
    "resources/in/employee/employee_data_1.csv",
    header=True,
    inferSchema=True,
)
df.show()

+---+--------------+-----------+------+
| ID|          Name| Department|Salary|
+---+--------------+-----------+------+
|  1|      John Doe|Engineering| 50000|
|  2|    Jane Smith|  Marketing| 45000|
|  3|     Jim Brown|      Sales| 40000|
|  4|  Jackie White|         HR| 42000|
|  5|   Emily Davis|Engineering| 60000|
|  6| Michael Scott| Management| 75000|
|  7|    Pam Beesly|  Reception| 35000|
|  8|Dwight Schrute|      Sales| 50000|
|  9| Angela Martin| Accounting| 48000|
| 10|  Kevin Malone| Accounting| 45000|
| 11|Oscar Martinez| Accounting| 47000|
| 12|Stanley Hudson|      Sales| 46000|
+---+--------------+-----------+------+



In [77]:
from pyspark.sql.functions import count

# Count of ID values, reported as the employee headcount.
employee_count = count(col="ID").alias("Total_No_Of_Employees")
df.select(employee_count).show()

+---------------------+
|Total_No_Of_Employees|
+---------------------+
|                   12|
+---------------------+



In [81]:
from pyspark.sql.functions import count_distinct

# Number of different Department values.
department_count = count_distinct(col="Department").alias("Total_Departments")
df.select(department_count).show()

+-----------------+
|Total_Departments|
+-----------------+
|                7|
+-----------------+



In [78]:
# Import the functions module under a namespace rather than importing `min`
# and `max` by name: a bare `from pyspark.sql.functions import min, max`
# replaces Python's builtins for the rest of the kernel session, so any later
# `min(1, 2)` / `max(values)` would hit the Spark column functions instead.
from pyspark.sql import functions as F

# Each aggregate collapses the whole DataFrame to a single-row result.
df.select(F.avg(col="Salary").alias("Average_Salary")).show()
df.select(F.min(col="Salary").alias("Minimum_Salary")).show()
df.select(F.max(col="Salary").alias("Maximum_Salary")).show()

+------------------+
|    Average_Salary|
+------------------+
|48583.333333333336|
+------------------+

+--------------+
|Minimum_Salary|
+--------------+
|         35000|
+--------------+

+--------------+
|Maximum_Salary|
+--------------+
|         75000|
+--------------+



In [79]:
# Use the namespaced form so Python's builtin `sum` is not shadowed by the
# Spark column function for the rest of the kernel session.
from pyspark.sql import functions as F

# Total of all salaries, reported as the overall budget.
df.select(F.sum(col="Salary").alias("Total_Budget")).show()

+------------+
|Total_Budget|
+------------+
|      583000|
+------------+



#### collect_list()
Returns all values of a column as a list, including duplicates.

In [None]:
from pyspark.sql.functions import collect_list

# Gather every Name value into one array; truncate=False shows the whole list.
all_names = collect_list(col="Name").alias("All_Employees")
df.select(all_names).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|All_Employees                                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[John Doe, Jane Smith, Jim Brown, Jackie White, Emily Davis, Michael Scott, Pam Beesly, Dwight Schrute, Angela Martin, Kevin Malone, Oscar Martinez, Stanley Hudson]|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+



#### collect_set()
Returns only the distinct values of a column, as a set (element order is not guaranteed).

In [75]:
from pyspark.sql.functions import collect_set

# Gather the distinct Department values into one array.
unique_departments = collect_set(col="Department").alias("Departments")
df.select(unique_departments).show(truncate=False)

+----------------------------------------------------------------------+
|Departments                                                           |
+----------------------------------------------------------------------+
|[Accounting, Management, Reception, HR, Sales, Marketing, Engineering]|
+----------------------------------------------------------------------+



### Window Functions

Window functions are applied over a window (a subset) of records. The data is partitioned by the partitioning column(s), and the rows within each partition are ordered by the ordering column(s).

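- rank: Assigns ranks starting from 1 within each partition. Tied rows receive the same rank, and the ranks that follow are skipped (e.g. 1, 1, 3).
- dense_rank: Assigns ranks starting from 1 within each partition. Tied rows receive the same rank, but no ranks are skipped (e.g. 1, 1, 2).
- row_number: Assigns sequential numbers from 1 to n within each partition, even when rows are tied.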

In [87]:
# Small employee sample for the window-function examples. Paul and Emily share
# the same salary within IT, which is what makes ranking ties visible below.
columns = ["id", "name", "salary", "department"]
data = [
    (1, "Paul", 32000, "IT"),
    (2, "Angela", 45000, "HR"),
    (3, "John", 38000, "IT"),
    (4, "Micheal", 36000, "HR"),
    (5, "Oscar", 34000, "HR"),
    (6, "Emily", 32000, "IT"),
    (7, "Pam", 24000, "Reception"),
    (8, "Dwight", 42000, "IT"),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-------+------+----------+
| id|   name|salary|department|
+---+-------+------+----------+
|  1|   Paul| 32000|        IT|
|  2| Angela| 45000|        HR|
|  3|   John| 38000|        IT|
|  4|Micheal| 36000|        HR|
|  5|  Oscar| 34000|        HR|
|  6|  Emily| 32000|        IT|
|  7|    Pam| 24000| Reception|
|  8| Dwight| 42000|        IT|
+---+-------+------+----------+



In [None]:
from pyspark.sql.window import Window

# One window per department, with rows ordered by ascending salary inside it.
window = (
    Window
    .partitionBy("department")
    .orderBy("salary")
)

In [90]:
from pyspark.sql.functions import row_number

# row_number() numbers rows by their position in the window ordering. Ordering
# by salary alone leaves tied rows (Paul and Emily, both 32000 in IT) in an
# arbitrary order, so which one gets 1 and which gets 2 could differ between
# runs. Adding the unique `id` as a tie-breaker makes the numbering
# deterministic. The shared `window` is left untouched so that the rank()
# and dense_rank() cells still demonstrate tie handling.
row_number_window = Window.partitionBy("department").orderBy("salary", "id")
df.withColumn("row_number", row_number().over(row_number_window)).show()

+---+-------+------+----------+----------+
| id|   name|salary|department|row_number|
+---+-------+------+----------+----------+
|  5|  Oscar| 34000|        HR|         1|
|  4|Micheal| 36000|        HR|         2|
|  2| Angela| 45000|        HR|         3|
|  1|   Paul| 32000|        IT|         1|
|  6|  Emily| 32000|        IT|         2|
|  3|   John| 38000|        IT|         3|
|  8| Dwight| 42000|        IT|         4|
|  7|    Pam| 24000| Reception|         1|
+---+-------+------+----------+----------+



In [91]:
from pyspark.sql.functions import rank

# rank(): tied salaries share a rank and the next rank is skipped (1, 1, 3, ...).
ranked_df = df.withColumn("rank", rank().over(window))
ranked_df.show()

+---+-------+------+----------+----+
| id|   name|salary|department|rank|
+---+-------+------+----------+----+
|  5|  Oscar| 34000|        HR|   1|
|  4|Micheal| 36000|        HR|   2|
|  2| Angela| 45000|        HR|   3|
|  1|   Paul| 32000|        IT|   1|
|  6|  Emily| 32000|        IT|   1|
|  3|   John| 38000|        IT|   3|
|  8| Dwight| 42000|        IT|   4|
|  7|    Pam| 24000| Reception|   1|
+---+-------+------+----------+----+



In [92]:
from pyspark.sql.functions import dense_rank

# dense_rank(): tied salaries share a rank and no rank is skipped (1, 1, 2, ...).
dense_ranked_df = df.withColumn("rank", dense_rank().over(window))
dense_ranked_df.show()

+---+-------+------+----------+----+
| id|   name|salary|department|rank|
+---+-------+------+----------+----+
|  5|  Oscar| 34000|        HR|   1|
|  4|Micheal| 36000|        HR|   2|
|  2| Angela| 45000|        HR|   3|
|  1|   Paul| 32000|        IT|   1|
|  6|  Emily| 32000|        IT|   1|
|  3|   John| 38000|        IT|   2|
|  8| Dwight| 42000|        IT|   3|
|  7|    Pam| 24000| Reception|   1|
+---+-------+------+----------+----+



In [93]:
# Shut the session down now that the last example has run.
spark.stop()