In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain a SparkSession through the builder's getOrCreate() entry point.
spark = SparkSession.builder.getOrCreate()

# Keep a handle to the session's SparkContext.
sc = spark.sparkContext

Create DataFrames

In [18]:
# Location of the Orders CSV data. The default is the original local path;
# set PYSPARK_PRACTICE_ORDERS_PATH to run the notebook on another machine
# without editing this cell.
ORDERS_PATH = os.environ.get(
    "PYSPARK_PRACTICE_ORDERS_PATH",
    "/home/phillipefs/spark_dev/pyspark-end-to-end-developer/0 - PracticeFiles/Orders",
)

# Explicit DDL schema for the Orders data; order_date is read as a plain string.
ORDERS_SCHEMA = (
    "order_id INT, order_date STRING, "
    "order_customer_id INT, order_status STRING"
)

df_ord = spark.read.csv(ORDERS_PATH, schema=ORDERS_SCHEMA)

# Small in-memory frame with one exact duplicate (Robert) and one pair that
# differs only in score3 (Ram).
data = (
    ('Robert', 35, 40, 40),
    ('Robert', 35, 40, 40),
    ('Ram', 31, 33, 29),
    ('Ram', 31, 33, 91),
)
df_emp = spark.createDataFrame(
    data=data,
    schema=['name', 'score1', 'score2', 'score3'],
)

#### select(*cols)
- select one or more columns
- Can apply necessary functions on the selected columns

In [24]:
# Show the raw status next to a lower-cased copy of it.
(
    df_ord
    .select(
        'order_status',
        lower('order_status').alias('order_status_lower'),
    )
    .show(2)
)

+---------------+------------------+
|   order_status|order_status_lower|
+---------------+------------------+
|         CLOSED|            closed|
|PENDING_PAYMENT|   pending_payment|
+---------------+------------------+
only showing top 2 rows



#### selectExpr(*expr)
- This is a variant of select that accepts SQL expressions
- If we want to use a function that is available in SQL but not among Spark's built-in functions, we can use selectExpr

In [30]:
# Both the function call and the alias are written as SQL text.
(
    df_ord
    .selectExpr(
        'order_date',
        'substring(order_date, 1, 4) as year',
    )
    .show(1, truncate=False)
)

+---------------------+----+
|order_date           |year|
+---------------------+----+
|2013-07-25 00:00:00.0|2013|
+---------------------+----+
only showing top 1 row



In [37]:
# Start from a single-row frame, then let stack() spread its six values
# across three rows, which yields two columns.
df = spark.range(1)
(
    df
    .selectExpr("stack(3, 1, 2, 3, 4, 5, 6)")
    .show()
)

+----+----+
|col0|col1|
+----+----+
|   1|   2|
|   3|   4|
|   5|   6|
+----+----+



#### withColumn(colName, col)
- Applies a transformation to a single column.
- The first argument is the name of the resulting column. If that name matches an existing column, the transformation replaces that column; otherwise a new column is added.


In [39]:
# Add an integer `year` column taken from the first four characters of order_date.
(
    df_ord
    .withColumn(
        "year",
        substring("order_date", 1, 4).cast('int'),
    )
    .show(1)
)

+--------+--------------------+-----------------+------------+----+
|order_id|          order_date|order_customer_id|order_status|year|
+--------+--------------------+-----------------+------------+----+
|       1|2013-07-25 00:00:...|            11599|      CLOSED|2013|
+--------+--------------------+-----------------+------------+----+
only showing top 1 row



#### withColumnRenamed(existingCol, newCol)

In [41]:
# Rename order_customer_id to customer_id; the other columns are unchanged.
(
    df_ord
    .withColumnRenamed("order_customer_id", "customer_id")
    .show(1)
)

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-25 00:00:...|      11599|      CLOSED|
+--------+--------------------+-----------+------------+
only showing top 1 row



#### drop(*cols)
- Drop one or more columns

In [42]:
# Remove two columns in one call, leaving only the id columns.
(
    df_ord
    .drop("order_status", "order_date")
    .show(1)
)

+--------+-----------------+
|order_id|order_customer_id|
+--------+-----------------+
|       1|            11599|
+--------+-----------------+
only showing top 1 row



#### dropDuplicates(subset=None)
- Drops duplicate rows
- Optionally considers only a subset of the columns when comparing rows.

In [43]:
# Display the sample frame as-is.
(
    df_emp
    .show()
)

+------+------+------+------+
|  name|score1|score2|score3|
+------+------+------+------+
|Robert|    35|    40|    40|
|Robert|    35|    40|    40|
|   Ram|    31|    33|    29|
|   Ram|    31|    33|    91|
+------+------+------+------+



In [46]:
# With no subset given, rows are compared on every column, so only rows that
# match on all columns are collapsed.
(
    df_emp
    .dropDuplicates()
    .show()
)

+------+------+------+------+
|  name|score1|score2|score3|
+------+------+------+------+
|Robert|    35|    40|    40|
|   Ram|    31|    33|    29|
|   Ram|    31|    33|    91|
+------+------+------+------+



In [47]:
# Compare rows on name/score1/score2 only; score3 is not part of the match.
# NOTE(review): which of the matching rows is kept does not appear to be
# guaranteed; confirm before relying on the score3 value that is displayed.
(
    df_emp
    .dropDuplicates(subset=['name','score1','score2'])
    .show()
)

+------+------+------+------+
|  name|score1|score2|score3|
+------+------+------+------+
|Robert|    35|    40|    40|
|   Ram|    31|    33|    29|
+------+------+------+------+

