In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the already-active SparkSession if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

Create DataFrame

In [8]:
# Column names and types come from this explicit DDL schema string.
orders_schema = "order_id INT, order_date STRING, order_customer_id INT, order_status STRING"
orders_path = "/home/phillipefs/spark_dev/pyspark-end-to-end-developer/0 - PracticeFiles/Orders"

df_ord = spark.read.schema(orders_schema).csv(orders_path)

# Preview the first three rows without truncating long cell values.
df_ord.show(n=3, truncate=False)

+--------+---------------------+-----------------+---------------+
|order_id|order_date           |order_customer_id|order_status   |
+--------+---------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |
+--------+---------------------+-----------------+---------------+
only showing top 3 rows



split()

In [12]:
# Split order_date on '-' into an array column, then take element 0 as the year.
(
    df_ord
    .withColumn("split_date", split(col("order_date"), "-"))
    .withColumn("year", col("split_date").getItem(0))
    .show(n=3, truncate=False)
)

+--------+---------------------+-----------------+---------------+-------------------------+----+
|order_id|order_date           |order_customer_id|order_status   |split_date               |year|
+--------+---------------------+-----------------+---------------+-------------------------+----+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |[2013, 07, 25 00:00:00.0]|2013|
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|[2013, 07, 25 00:00:00.0]|2013|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |[2013, 07, 25 00:00:00.0]|2013|
+--------+---------------------+-----------------+---------------+-------------------------+----+
only showing top 3 rows



In [14]:
# One-row frame whose single string alternates letters and runs of digits.
df = spark.createDataFrame(data=[("ab12cd23fe23",)], schema=["s"])

# The split pattern is a regex: every run of digits is a delimiter. Because the
# input ends with a digit run, the resulting array ends with an empty string.
df.select(split(col("s"), "[0-9]+")).show()

+--------------------+
|split(s, [0-9]+, -1)|
+--------------------+
|      [ab, cd, fe, ]|
+--------------------+



length(col)

In [17]:
# Add the character length of each order_status value as a new column.
status_length = length(col("order_status"))
df_ord.withColumn("size_status", status_length).show(n=3)

+--------+--------------------+-----------------+---------------+-----------+
|order_id|          order_date|order_customer_id|   order_status|size_status|
+--------+--------------------+-----------------+---------------+-----------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|          6|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|         15|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|          8|
+--------+--------------------+-----------------+---------------+-----------+
only showing top 3 rows



lower(col), upper(col), initcap(col)

In [18]:
# Show order_status in three casings next to the original columns:
# all lower-case, all upper-case, and initcap (first letter of each word upper-cased).
status = col("order_status")
df_ord.select(
    "*",
    lower(status).alias("lower_status"),
    upper(status).alias("upper_status"),
    initcap(status).alias("initcap_status"),
).show(n=3)

+--------+--------------------+-----------------+---------------+---------------+---------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|   lower_status|   upper_status| initcap_status|
+--------+--------------------+-----------------+---------------+---------------+---------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|         closed|         CLOSED|         Closed|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|pending_payment|PENDING_PAYMENT|Pending_payment|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|       complete|       COMPLETE|       Complete|
+--------+--------------------+-----------------+---------------+---------------+---------------+---------------+
only showing top 3 rows



ltrim(col), rtrim(col), trim(col)

In [23]:
# Sample strings with leading and trailing spaces to make the trimming visible.
df = spark.createDataFrame(
    data=[("     spark ",), ("     developer       ",)],
    schema=["col1"],
)

# ltrim strips leading spaces, rtrim strips trailing spaces, trim strips both.
(
    df
    .withColumn("l_trim", ltrim(col("col1")))
    .withColumn("r_trim", rtrim(col("col1")))
    .withColumn("trim", trim(col("col1")))
    .show(truncate=False)
)

+---------------------+----------------+--------------+---------+
|col1                 |l_trim          |r_trim        |trim     |
+---------------------+----------------+--------------+---------+
|     spark           |spark           |     spark    |spark    |
|     developer       |developer       |     developer|developer|
+---------------------+----------------+--------------+---------+



lpad()

In [29]:
# Left-pad the customer id with '0' characters up to a total width of 10.
padded_customer_id = lpad(col("order_customer_id"), 10, "0")
df_ord.withColumn("pading", padded_customer_id).show(n=3, truncate=False)

+--------+---------------------+-----------------+---------------+----------+
|order_id|order_date           |order_customer_id|order_status   |pading    |
+--------+---------------------+-----------------+---------------+----------+
|1       |2013-07-25 00:00:00.0|11599            |CLOSED         |0000011599|
|2       |2013-07-25 00:00:00.0|256              |PENDING_PAYMENT|0000000256|
|3       |2013-07-25 00:00:00.0|12111            |COMPLETE       |0000012111|
+--------+---------------------+-----------------+---------------+----------+
only showing top 3 rows



concat(*cols) and concat_ws(sep, *cols)

In [34]:
# concat joins the values directly; concat_ws puts the given separator between them.
joined_plain = concat("order_id", "order_status")
joined_spaced = concat_ws(" ", "order_id", "order_status")

(
    df_ord
    .withColumn("concat", joined_plain)
    .withColumn("concat_ws", joined_spaced)
    .show(n=3)
)

+--------+--------------------+-----------------+---------------+----------------+-----------------+
|order_id|          order_date|order_customer_id|   order_status|          concat|        concat_ws|
+--------+--------------------+-----------------+---------------+----------------+-----------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|         1CLOSED|         1 CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|2PENDING_PAYMENT|2 PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|       3COMPLETE|       3 COMPLETE|
+--------+--------------------+-----------------+---------------+----------------+-----------------+
only showing top 3 rows



- substring(str, pos, len)
- substring_index(str, delim, count)
- instr(str, substr)
- locate(substr, str, pos=1)

In [59]:
# Substring extraction and position lookup on the order columns.
# All positions used by these functions are 1-based.
(
    df_ord
    # First four characters of order_date. The start position is 1, not 0:
    # Spark's substring is 1-based (it happens to treat 0 like 1, but that
    # is not the documented convention).
    .withColumn("order_year", substring("order_date", 1, 4))
    # Everything before the first '-' delimiter (count=1).
    .withColumn("dummy", substring_index("order_date", "-", 1))
    # 1-based position of the first "O" in order_status; 0 when it does not occur.
    .withColumn("instr", instr(col("order_status"), "O"))
    # 1-based position of the first "0" in order_date, searching from position 3.
    .withColumn("locate", locate("0", col("order_date"), 3))
    .show(3)
)

+--------+--------------------+-----------------+---------------+----------+-----+-----+------+
|order_id|          order_date|order_customer_id|   order_status|order_year|dummy|instr|locate|
+--------+--------------------+-----------------+---------------+----------+-----+-----+------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|      2013| 2013|    3|     6|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|      2013| 2013|    0|     6|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|      2013| 2013|    2|     6|
+--------+--------------------+-----------------+---------------+----------+-----+-----+------+
only showing top 3 rows

