In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active SparkSession if there is one, otherwise start a new one.
# No extra builder options (app name, master, config) are set here.
spark = SparkSession.builder.getOrCreate()

Create DataFrame

In [5]:
# Column names, in the same order as the fields of each record below.
schema = ("empname", "dept", "state", "salary", "age")

# Sample employee records: (empname, dept, state, salary, age).
# Note that the "Sugie" record is listed twice.
data = (
    ("James", "Sales", "NY", 9000, 34),
    ("Alicia", "Sales", "NY", 8600, 56),
    ("Robert", "Sales", "CA", 8100, 30),
    ("John", "Sales", "AZ", 8600, 30),
    ("Ross", "Sales", "AZ", 8100, 33),
    ("Khaty", "Sales", "AZ", 1000, 39),
    ("Lisa", "Finance", "CA", 9000, 24),
    ("Deja", "Finance", "CA", 9900, 40),
    ("Sugie", "Finance", "NY", 8300, 36),
    ("Ram", "Finance", "NY", 7900, 53),
    ("Sugie", "Finance", "NY", 8300, 36),
    ("Kyle", "Marketing", "CA", 8000, 25),
    ("Reid", "Marketing", "NY", 9100, 50),
)

# Only column names are supplied, so the column types come from the
# Python values in the records.
df = spark.createDataFrame(data, schema=schema)
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  9000| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 30|
|   Ross|    Sales|   AZ|  8100| 33|
|  Khaty|    Sales|   AZ|  1000| 39|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|  Sugie|  Finance|   NY|  8300| 36|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



monotonically_increasing_id()

Note: the generated IDs are unique and monotonically increasing, but not consecutive (there are gaps between values).

In [27]:
# Append an "id" column of generated row ids and display the result.
# The ids are unique and increasing, but they are not consecutive.
(
    df
    .withColumn("id", monotonically_increasing_id())
    .show()
)

+-------+---------+-----+------+---+-----------+
|empname|     dept|state|salary|age|         id|
+-------+---------+-----+------+---+-----------+
|  James|    Sales|   NY|  9000| 34|          0|
| Alicia|    Sales|   NY|  8600| 56| 8589934592|
| Robert|    Sales|   CA|  8100| 30|17179869184|
|   John|    Sales|   AZ|  8600| 30|25769803776|
|   Ross|    Sales|   AZ|  8100| 33|34359738368|
|  Khaty|    Sales|   AZ|  1000| 39|42949672960|
|   Lisa|  Finance|   CA|  9000| 24|51539607552|
|   Deja|  Finance|   CA|  9900| 40|60129542144|
|  Sugie|  Finance|   NY|  8300| 36|68719476736|
|    Ram|  Finance|   NY|  7900| 53|77309411328|
|  Sugie|  Finance|   NY|  8300| 36|85899345920|
|   Kyle|Marketing|   CA|  8000| 25|94489280512|
|   Reid|Marketing|   NY|  9100| 50|94489280513|
+-------+---------+-----+------+---+-----------+



lit()
- Creates a column that holds the same constant (literal) value in every row

In [43]:
# Append a "country" column whose value is the literal "USA" on every row.
(
    df
    .withColumn("country", lit("USA"))
    .show()
)

+-------+---------+-----+------+---+-------+
|empname|     dept|state|salary|age|country|
+-------+---------+-----+------+---+-------+
|  James|    Sales|   NY|  9000| 34|    USA|
| Alicia|    Sales|   NY|  8600| 56|    USA|
| Robert|    Sales|   CA|  8100| 30|    USA|
|   John|    Sales|   AZ|  8600| 30|    USA|
|   Ross|    Sales|   AZ|  8100| 33|    USA|
|  Khaty|    Sales|   AZ|  1000| 39|    USA|
|   Lisa|  Finance|   CA|  9000| 24|    USA|
|   Deja|  Finance|   CA|  9900| 40|    USA|
|  Sugie|  Finance|   NY|  8300| 36|    USA|
|    Ram|  Finance|   NY|  7900| 53|    USA|
|  Sugie|  Finance|   NY|  8300| 36|    USA|
|   Kyle|Marketing|   CA|  8000| 25|    USA|
|   Reid|Marketing|   NY|  9100| 50|    USA|
+-------+---------+-----+------+---+-------+



concat()

In [47]:
# Join salary and age into a single string column, separated by "|",
# and display only the first row.
(
    df
    .select(
        concat("salary", lit("|"), "age").alias("concat_salary_age")
    )
    .show(1)
)

+-----------------+
|concat_salary_age|
+-----------------+
|          9000|34|
+-----------------+
only showing top 1 row



expr(str)

In [72]:
# expr() parses a SQL expression string into a Column, so SQL functions,
# operators and CASE expressions can be used directly.
(
    df
    # SQL function call.
    .withColumn("empname_len", expr("length(empname)"))
    # SQL string concatenation with ||.
    # NOTE: a hyphenated column name such as "emp-dept" has to be
    # backtick-quoted if it is referenced inside a later SQL expression.
    .withColumn("emp-dept", expr("empname || '-' || dept"))
    # Conditional logic via CASE WHEN.
    .withColumn(
        "age_desc",
        expr("CASE WHEN age > 50 then 'SENIOR' else 'ADULT' END"),
    )
    # withColumn() already names the resulting column, so no "as age_10"
    # alias is needed inside the expression string.
    .withColumn("age_10", expr("age + 10"))
    .show()
)

+-------+---------+-----+------+---+-----------+--------------+--------+------+
|empname|     dept|state|salary|age|empname_len|      emp-dept|age_desc|age_10|
+-------+---------+-----+------+---+-----------+--------------+--------+------+
|  James|    Sales|   NY|  9000| 34|          5|   James-Sales|   ADULT|    44|
| Alicia|    Sales|   NY|  8600| 56|          6|  Alicia-Sales|  SENIOR|    66|
| Robert|    Sales|   CA|  8100| 30|          6|  Robert-Sales|   ADULT|    40|
|   John|    Sales|   AZ|  8600| 30|          4|    John-Sales|   ADULT|    40|
|   Ross|    Sales|   AZ|  8100| 33|          4|    Ross-Sales|   ADULT|    43|
|  Khaty|    Sales|   AZ|  1000| 39|          5|   Khaty-Sales|   ADULT|    49|
|   Lisa|  Finance|   CA|  9000| 24|          4|  Lisa-Finance|   ADULT|    34|
|   Deja|  Finance|   CA|  9900| 40|          4|  Deja-Finance|   ADULT|    50|
|  Sugie|  Finance|   NY|  8300| 36|          5| Sugie-Finance|   ADULT|    46|
|    Ram|  Finance|   NY|  7900| 53|    

spark_partition_id()

In [76]:
# A notebook cell only displays the value of its last expression, so the
# partition count has to be printed explicitly to appear in the output.
print(df.rdd.getNumPartitions())

# Tag every row with the id of the partition it belongs to.
(
    df
    .withColumn("partition_id", spark_partition_id())
    .show()
)

+-------+---------+-----+------+---+------------+
|empname|     dept|state|salary|age|partition_id|
+-------+---------+-----+------+---+------------+
|  James|    Sales|   NY|  9000| 34|           0|
| Alicia|    Sales|   NY|  8600| 56|           1|
| Robert|    Sales|   CA|  8100| 30|           2|
|   John|    Sales|   AZ|  8600| 30|           3|
|   Ross|    Sales|   AZ|  8100| 33|           4|
|  Khaty|    Sales|   AZ|  1000| 39|           5|
|   Lisa|  Finance|   CA|  9000| 24|           6|
|   Deja|  Finance|   CA|  9900| 40|           7|
|  Sugie|  Finance|   NY|  8300| 36|           8|
|    Ram|  Finance|   NY|  7900| 53|           9|
|  Sugie|  Finance|   NY|  8300| 36|          10|
|   Kyle|Marketing|   CA|  8000| 25|          11|
|   Reid|Marketing|   NY|  9100| 50|          11|
+-------+---------+-----+------+---+------------+

