In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active SparkSession if one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

Create DataFrames

In [14]:
### emp1 DataFrame
# One row per employee: first name, last name, a list of languages and a dict
# of properties. Some rows carry None, empty-string or empty-dict values
# alongside the populated ones.
data = (
    ('Alicia', 'Joseph', ['Java', 'Scala', 'Spark'], {'hair': 'black', 'eye': 'brown'}),
    ('Robert', 'Gee', ['Spark', 'Java'], {'hair': 'brown', 'eye': None}),
    ('Mike', 'Bianca', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('John', 'Kumar', None, None),
    ('Jeff', 'L', ['1', '2'], {}),
)
schema = ('FirstName', 'LastName', 'Languages', 'properties')
emp1 = spark.createDataFrame(data=data, schema=schema)

### emp2 DataFrame
# One row per person: a name followed by three integer scores.
data = (
    ('Robert', 35, 40, 40),
    ('Ram', 31, 33, 29),
    ('John', 95, 89, 91),
)
schema = ('name', 'score1', 'score2', 'score3')
emp2 = spark.createDataFrame(data=data, schema=schema)

### emp3 DataFrame
# One row per employee: a name and two groups of three scores each.
# NOTE(review): the nested score groups are Python tuples, not lists. PySpark
# schema inference appears to map nested tuples to struct columns and lists to
# array columns -- confirm the resulting column type matches the `score_arr*`
# names before using array functions on them.
emp3 = spark.createDataFrame(
    data=(
        ('John', (10, 20, 20), (25, 11, 10)),
        ('Robert', (15, 13, 55), (5, None, 29)),
        ('James', (11, 13, 45), (5, 89, 79)),
    ),
    schema=('empName', 'score_arr1', 'score_arr2'),
)

### df DataFrame
# Single-row frame whose only column, `data`, is an array of two structs,
# built directly with Spark SQL.
df = spark.sql(
    "SELECT array(struct(1, 'a'), struct(2, 'b')) as data"
)

size(col):
- Returns the length of the array or map stored in the column.
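- Returns -1 when the array or map itself is null (default settings; it returns null instead when `spark.sql.legacy.sizeOfNull` is false or `spark.sql.ansi.enabled` is true). Entries whose value is null are still counted.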

In [18]:
# Show the array length and the map entry count for every employee.
emp1.select(
    'FirstName',
    size('Languages'),
    size('properties'),
).show()

+---------+---------------+----------------+
|FirstName|size(Languages)|size(properties)|
+---------+---------------+----------------+
|   Alicia|              3|               2|
|   Robert|              2|               2|
|     Mike|              2|               2|
|     John|             -1|              -1|
|     Jeff|              2|               0|
+---------+---------------+----------------+



element_at(col,extraction):
- Returns the element of the array at the given index in `extraction` if col is an array. The index is 1-based.
- Returns value for the given key in extraction if col is map.

In [30]:
(
    emp1
    .select(
        'FirstName',
        element_at('Languages', 2),       # 1-based index: second array element
        col('Languages')[0],              # bracket indexing is 0-based: first array element
        element_at('properties', 'eye'),  # map lookup by key
        col('properties.eye'),            # dot-path access to the same map key
    )
    .show()
)

+---------+------------------------+------------+---------------------------+-----+
|FirstName|element_at(Languages, 2)|Languages[0]|element_at(properties, eye)|  eye|
+---------+------------------------+------------+---------------------------+-----+
|   Alicia|                   Scala|        Java|                      brown|brown|
|   Robert|                    Java|       Spark|                       null| null|
|     Mike|                        |      CSharp|                           |     |
|     John|                    null|        null|                       null| null|
|     Jeff|                       2|           1|                       null| null|
+---------+------------------------+------------+---------------------------+-----+



array(*cols):
- Creates a new array column.

array_max(col), array_min(col):
- Returns maximum or minimum values of an array column.

array_distinct(col):
- Returns distinct values of an array column.

array_repeat(col,count):
- Creates an array containing `col` repeated `count` times.

In [36]:
# Pack the three score columns into one array, then derive its maximum,
# its distinct values, and an array holding three copies of it.
(
    emp2
    .withColumn('array_elements', array('score1', 'score2', 'score3'))
    .withColumn('array_max', array_max('array_elements'))
    .withColumn('array_distinct', array_distinct('array_elements'))
    .withColumn('array_repeat', array_repeat('array_elements', 3))
    .show(truncate=False)
)

+------+------+------+------+--------------+---------+--------------+------------------------------------------+
|name  |score1|score2|score3|array_elements|array_max|array_distinct|array_repeat                              |
+------+------+------+------+--------------+---------+--------------+------------------------------------------+
|Robert|35    |40    |40    |[35, 40, 40]  |40       |[35, 40]      |[[35, 40, 40], [35, 40, 40], [35, 40, 40]]|
|Ram   |31    |33    |29    |[31, 33, 29]  |33       |[31, 33, 29]  |[[31, 33, 29], [31, 33, 29], [31, 33, 29]]|
|John  |95    |89    |91    |[95, 89, 91]  |95       |[95, 89, 91]  |[[95, 89, 91], [95, 89, 91], [95, 89, 91]]|
+------+------+------+------+--------------+---------+--------------+------------------------------------------+



slice(col,start,length):
- Returns an array containing all the elements in `col` from index `start` (1-based; a negative `start` counts from the end) for length `length`.
- col is Array Type.

array_position(col,value):
- Locates the position of the first occurrence of the given value in the given array.
- Positions are 1-based; returns 0 when the value is not found in the array.

array_remove(col,element):
- Removes all elements equal to `element` from the given array.

array_sort(col):
- Sorts the input array in ascending order.
- The elements of the input array must be orderable.
- Null elements will be placed at the end of the returned array.

sort_array(col,asc=True):
- Sorts the input array in ascending or descending order according to the natural ordering of the array elements.
- Null elements will be placed at the beginning of the returned array in ascending order or at the end of the returned array in descending order.


In [49]:
# Apply each array helper to the Languages column and show the results
# side by side with the original columns.
(
    emp1
    .withColumn('slice_array', slice('Languages', 3, 1))  # one element starting at position 3
    .withColumn('array_position', array_position('Languages', 'Spark'))
    .withColumn('array_remove', array_remove('Languages', 'Spark'))
    .withColumn('array_sort', array_sort('Languages'))
    .withColumn('sort_array', sort_array('Languages', asc=False))  # descending order
    .show(truncate=False)
)

+---------+--------+--------------------+-----------------------------+-----------+--------------+-------------+--------------------+--------------------+
|FirstName|LastName|Languages           |properties                   |slice_array|array_position|array_remove |array_sort          |sort_array          |
+---------+--------+--------------------+-----------------------------+-----------+--------------+-------------+--------------------+--------------------+
|Alicia   |Joseph  |[Java, Scala, Spark]|{eye -> brown, hair -> black}|[Spark]    |3             |[Java, Scala]|[Java, Scala, Spark]|[Spark, Scala, Java]|
|Robert   |Gee     |[Spark, Java]       |{eye -> null, hair -> brown} |[]         |1             |[Java]       |[Java, Spark]       |[Spark, Java]       |
|Mike     |Bianca  |[CSharp, ]          |{eye -> , hair -> red}       |[]         |0             |[CSharp, ]   |[, CSharp]          |[CSharp, ]          |
|John     |Kumar   |null                |null                         