In [2]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Describe a Spark application called "Trying" that targets the "local" master,
# then start the SparkContext from that configuration.
sparkConf = SparkConf()
sparkConf.setAppName("Trying")
sparkConf.setMaster("local")
sc = SparkContext(conf=sparkConf)

In [4]:
# getOrCreate() returns an already-running session when one exists; otherwise it
# builds a new one from the options set on the builder.
# NOTE(review): a SparkContext named "Trying" is started earlier in this notebook;
# confirm whether the 'yettodecide' app name actually takes effect.
spark = (
    SparkSession.builder
    .appName('yettodecide')
    .master('local')
    .getOrCreate()
)

### Date/Time Functions

In [3]:
# Single-row frame; both dates are supplied as yyyy-MM-dd strings, so the schema
# printed below reports two string columns.
df_1 = spark.createDataFrame(
    data=[("2021-01-15", "2021-02-15")],
    schema=["start_dt", "end_dt"],
)
df_1.show()
df_1.printSchema()

+----------+----------+
|  start_dt|    end_dt|
+----------+----------+
|2021-01-15|2021-02-15|
+----------+----------+

root
 |-- start_dt: string (nullable = true)
 |-- end_dt: string (nullable = true)



In [13]:
from pyspark.sql.functions import date_format

# Keep both original columns and add start_dt rendered as dd/MM/yyyy text.
df_2 = df_1.select(
    "*",
    date_format(df_1["start_dt"], "dd/MM/yyyy").alias("dt_format"),
)
df_2.show()
df_2.printSchema()

+----------+----------+----------+
|  start_dt|    end_dt| dt_format|
+----------+----------+----------+
|2021-01-15|2021-02-15|15/01/2021|
+----------+----------+----------+

root
 |-- start_dt: string (nullable = true)
 |-- end_dt: string (nullable = true)
 |-- dt_format: string (nullable = true)



In [22]:
from pyspark.sql.functions import (
    add_months,
    current_date,
    date_add,
    date_sub,
    datediff,
)

# Each entry is one derived column. Every one is shown next to start_dt and
# end_dt in its own table, in the order listed.
for derived_col in (
    current_date().alias("cur_dt"),
    date_add("start_dt", 2).alias("add_2_days"),
    date_sub("start_dt", 2).alias("sub_2_days"),
    datediff("end_dt", "start_dt").alias("sub_2_dates"),  # end_dt minus start_dt, in days
    add_months("start_dt", 2).alias("add_2_months"),
    add_months("start_dt", 2 * 12).alias("add_2_Yrs"),  # two years written as 24 months
):
    df_2.select("start_dt", "end_dt", derived_col).show()

+----------+----------+----------+
|  start_dt|    end_dt|    cur_dt|
+----------+----------+----------+
|2021-01-15|2021-02-15|2022-01-27|
+----------+----------+----------+

+----------+----------+----------+
|  start_dt|    end_dt|add_2_days|
+----------+----------+----------+
|2021-01-15|2021-02-15|2021-01-17|
+----------+----------+----------+

+----------+----------+----------+
|  start_dt|    end_dt|sub_2_days|
+----------+----------+----------+
|2021-01-15|2021-02-15|2021-01-13|
+----------+----------+----------+

+----------+----------+-----------+
|  start_dt|    end_dt|sub_2_dates|
+----------+----------+-----------+
|2021-01-15|2021-02-15|         31|
+----------+----------+-----------+

+----------+----------+------------+
|  start_dt|    end_dt|add_2_months|
+----------+----------+------------+
|2021-01-15|2021-02-15|  2021-03-15|
+----------+----------+------------+

+----------+----------+----------+
|  start_dt|    end_dt| add_2_Yrs|
+----------+----------+----------+
|2021-01-15|2021-02-15|2023-01-15|
+----------+----------+----------+


In [39]:
from pyspark.sql.functions import year, month, dayofmonth, weekofyear, dayofweek, dayofyear,last_day,months_between,next_day,quarter,trunc

In [25]:
# Break start_dt into its calendar components, one output column per extractor.
df_2.select(
    "start_dt",
    "end_dt",
    *[
        extract("start_dt").alias(label)
        for label, extract in (
            ("Year", year),
            ("Month", month),
            ("Day", dayofmonth),
            ("Week_of_Year", weekofyear),
            ("Day_of_Week", dayofweek),
            ("Day_of_Year", dayofyear),
        )
    ],
).show()

+----------+----------+----+-----+---+------------+-----------+-----------+
|  start_dt|    end_dt|Year|Month|Day|Week_of_Year|Day_of_Week|Day_of_Year|
+----------+----------+----+-----+---+------------+-----------+-----------+
|2021-01-15|2021-02-15|2021|    1| 15|           2|          6|         15|
+----------+----------+----+-----+---+------------+-----------+-----------+



In [28]:
# last_day: the final calendar day of the month that start_dt falls in.
df_2.select(
    "start_dt",
    "end_dt",
    last_day(df_2["start_dt"]).alias("Last_Day"),
).show()

+----------+----------+----------+
|  start_dt|    end_dt|  Last_Day|
+----------+----------+----------+
|2021-01-15|2021-02-15|2021-01-31|
+----------+----------+----------+



In [31]:
# months_between: number of months from start_dt up to end_dt (first argument
# minus second argument).
df_2.select(
    "start_dt",
    "end_dt",
    months_between(df_2["end_dt"], df_2["start_dt"]).alias("Months_Betwn"),
).show()

+----------+----------+------------+
|  start_dt|    end_dt|Months_Betwn|
+----------+----------+------------+
|2021-01-15|2021-02-15|         1.0|
+----------+----------+------------+



In [34]:
# next_day: the first Monday that comes after start_dt.
df_2.select(
    "start_dt",
    "end_dt",
    next_day(df_2["start_dt"], "Mon").alias("Next_Monday"),
).show()

+----------+----------+-----------+
|  start_dt|    end_dt|Next_Monday|
+----------+----------+-----------+
|2021-01-15|2021-02-15| 2021-01-18|
+----------+----------+-----------+



In [37]:
# quarter: which quarter of the year start_dt belongs to.
df_2.select(
    "start_dt",
    "end_dt",
    quarter(df_2["start_dt"]).alias("Quarter_of_Year"),
).show()

+----------+----------+---------------+
|  start_dt|    end_dt|Quarter_of_Year|
+----------+----------+---------------+
|2021-01-15|2021-02-15|              1|
+----------+----------+---------------+



In [40]:
 ### Truncate Date to Year, Month
# start_dt is cut back to the first day of its year, end_dt to the first day of
# its month.
df_2.select(
    "start_dt",
    "end_dt",
    trunc(df_2["start_dt"], "year").alias("Trunc_Year"),
    trunc(df_2["end_dt"], "month").alias("Trunc_Month"),
).show()

+----------+----------+----------+-----------+
|  start_dt|    end_dt|Trunc_Year|Trunc_Month|
+----------+----------+----------+-----------+
|2021-01-15|2021-02-15|2021-01-01| 2021-02-01|
+----------+----------+----------+-----------+



## Aggregate Functions

In [42]:
# Bring the start_dt column back as a Python list of Row objects.
df_2.select("start_dt").collect()

[Row(start_dt='2021-01-15')]

#### approx_count_distinct()
Returns the approximate number of distinct items in a group.<br>
approxDistinctCount = agg_df.select(approx_count_distinct("salary")) <br>
approxDistinctCount.show() 

#### avg (average)
avgSal = agg_df.select(avg("salary"))  #  sum, min, max <br>
avgSal.show() 

### collect_list
Returns all values from an input column, duplicates included.<br>
agg_df.select(collect_list("salary")).show(truncate=False)  <br>

----

### collect_set
The collect_set() function returns the values of an input column with duplicates removed.<br>

agg_df.select(collect_set("salary")).show(truncate=False) 

----

### countDistinct
The countDistinct() function returns the number of distinct values in a column, or of distinct value combinations when several columns are passed.

df2 = agg_df.select(countDistinct("department", "salary"))
df2.show(truncate=False) 

----

### count
cnt = agg_df.count()<br>
print(cnt) 

----

###  first/last


agg_df.select(first("salary")).show(truncate=False)<br>
agg_df.select(last("salary")).show(truncate=False) 

----

### sumDistinct

agg_df.select(sumDistinct("salary")).show(truncate=False)




## Window Functions

<ul>
    <li>ranking functions</li>
    <li>analytic functions</li>
    <li>aggregate functions</li>
</ul>

<pre>
    windowSpec = Window.partitionBy("department").orderBy("salary") 
</pre>

In [19]:
# Ten (employee_name, department, salary) records. Note that the
# ("James", "Sales", 3000) record appears twice.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]

# Column types are inferred from the tuples; the schema argument only names them.
df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [22]:
from pyspark.sql.window import Window

# Rows are grouped per department and, inside each group, ordered by salary.
windowSpec = (
    Window
    .partitionBy("department")
    .orderBy("salary")
)

In [23]:
# row_number: sequential position of each row inside its department window;
# rows with equal salaries still receive different numbers.
from pyspark.sql.functions import row_number

df.select(
    "*",
    row_number().over(windowSpec).alias("row_number"),
).show(truncate=False)

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
+-------------+----------+------+----------+



In [26]:
# rank: equal salaries share a rank, and the ranks that follow a tie are skipped
# (1, 1, 3, ...).
from pyspark.sql.functions import rank

df.select(
    "*",
    rank().over(windowSpec).alias("rank"),
).show()

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
+-------------+----------+------+----+



In [28]:
# dense_rank: equal salaries share a rank and no rank is skipped afterwards
# (1, 1, 2, ...).
from pyspark.sql.functions import dense_rank

df.select(
    "*",
    dense_rank().over(windowSpec).alias("dense_rank"),
).show()

+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|         Saif|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
+-------------+----------+------+----------+



In [29]:
# ntile(2): split every department window into two ordered buckets and report
# which bucket each row lands in.
from pyspark.sql.functions import ntile

df.select(
    "*",
    ntile(2).over(windowSpec).alias("ntile"),
).show()

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    1|
|         Saif|     Sales|  4100|    2|
|      Michael|     Sales|  4600|    2|
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    1|
|          Jen|   Finance|  3900|    2|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
+-------------+----------+------+-----+



In [31]:
# lag / lead: the salary found two rows earlier (lag) or two rows later (lead)
# inside the department window; null when no such row exists.
from pyspark.sql.functions import lag, lead

df.select(
    "*",
    lag("salary", 2).over(windowSpec).alias("lag"),
).show()

df.select(
    "*",
    lead("salary", 2).over(windowSpec).alias("lead"),
).show()

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+

+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+

## Window Aggregate Functions

In [36]:
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col
from pyspark.sql.window import Window

# sum/min/max are reached through the F alias instead of being imported by name:
# `from pyspark.sql.functions import sum, min, max` would replace Python's
# builtins of the same names for every cell that runs afterwards.

# No orderBy here, so each aggregate is computed over the whole department.
windowSpecAgg = Window.partitionBy("department")

# Every row of a department carries the same aggregate values; keeping only the
# row numbered 1 (per windowSpec) leaves one summary row per department.
(
    df.withColumn("row", F.row_number().over(windowSpec))
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
    .withColumn("sum", F.sum(col("salary")).over(windowSpecAgg))
    .withColumn("min", F.min(col("salary")).over(windowSpecAgg))
    .withColumn("max", F.max(col("salary")).over(windowSpecAgg))
    .where(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)

+----------+------+-----+----+----+
|department|   avg|  sum| min| max|
+----------+------+-----+----+----+
|     Sales|3760.0|18800|3000|4600|
|   Finance|3400.0|10200|3000|3900|
| Marketing|2500.0| 5000|2000|3000|
+----------+------+-----+----+----+

