In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, round, floor, ceil, lower, upper

In [2]:
spark = SparkSession.builder. \
appName("DF Practice"). \
getOrCreate()

In [3]:
df = spark.read.csv("./dataset/product.csv", header=True, inferSchema=True)

In [4]:
df.count()

5

In [5]:
df.show(5)

+----------+-------------------+-----------------+-------------+--------+
|product_id|product_category_id|     product_name|product_price|quantity|
+----------+-------------------+-----------------+-------------+--------+
|         1|                  2|  Quest Q64 10 FT|        59.98|       1|
|         2|                  2|  Nike Men Finger|       129.99|       5|
|         3|                  2|       Nike Adult|        89.99|       2|
|         4|                  2| Under Armour Men|        89.99|      10|
|         5|                  2|Under Armour Kids|       199.99|       1|
+----------+-------------------+-----------------+-------------+--------+



### Thêm cột total_price

In [6]:
df.withColumn("total_price", col("product_price") * col("quantity")).show()

+----------+-------------------+-----------------+-------------+--------+-----------+
|product_id|product_category_id|     product_name|product_price|quantity|total_price|
+----------+-------------------+-----------------+-------------+--------+-----------+
|         1|                  2|  Quest Q64 10 FT|        59.98|       1|      59.98|
|         2|                  2|  Nike Men Finger|       129.99|       5|     649.95|
|         3|                  2|       Nike Adult|        89.99|       2|     179.98|
|         4|                  2| Under Armour Men|        89.99|      10|      899.9|
|         5|                  2|Under Armour Kids|       199.99|       1|     199.99|
+----------+-------------------+-----------------+-------------+--------+-----------+



In [7]:
df.select("*", expr("product_price * quantity as total_price")).show()

+----------+-------------------+-----------------+-------------+--------+-----------+
|product_id|product_category_id|     product_name|product_price|quantity|total_price|
+----------+-------------------+-----------------+-------------+--------+-----------+
|         1|                  2|  Quest Q64 10 FT|        59.98|       1|      59.98|
|         2|                  2|  Nike Men Finger|       129.99|       5|     649.95|
|         3|                  2|       Nike Adult|        89.99|       2|     179.98|
|         4|                  2| Under Armour Men|        89.99|      10|      899.9|
|         5|                  2|Under Armour Kids|       199.99|       1|     199.99|
+----------+-------------------+-----------------+-------------+--------+-----------+



In [8]:
df.selectExpr("*", "product_price * quantity as total_price").show()

+----------+-------------------+-----------------+-------------+--------+-----------+
|product_id|product_category_id|     product_name|product_price|quantity|total_price|
+----------+-------------------+-----------------+-------------+--------+-----------+
|         1|                  2|  Quest Q64 10 FT|        59.98|       1|      59.98|
|         2|                  2|  Nike Men Finger|       129.99|       5|     649.95|
|         3|                  2|       Nike Adult|        89.99|       2|     179.98|
|         4|                  2| Under Armour Men|        89.99|      10|      899.9|
|         5|                  2|Under Armour Kids|       199.99|       1|     199.99|
+----------+-------------------+-----------------+-------------+--------+-----------+



### Sửa giá trị trong cột product_price

In [9]:
df.withColumn("product_price",
              when(lower(col("product_name")).contains("Nike"), round(col("product_price") * 1.2, 2))
              .when(lower(col("product_name")).contains("Armour"), round(col("product_price") * 1.5, 2))
              .otherwise(col("product_price"))
              ).show()

+----------+-------------------+-----------------+-------------+--------+
|product_id|product_category_id|     product_name|product_price|quantity|
+----------+-------------------+-----------------+-------------+--------+
|         1|                  2|  Quest Q64 10 FT|        59.98|       1|
|         2|                  2|  Nike Men Finger|       129.99|       5|
|         3|                  2|       Nike Adult|        89.99|       2|
|         4|                  2| Under Armour Men|        89.99|      10|
|         5|                  2|Under Armour Kids|       199.99|       1|
+----------+-------------------+-----------------+-------------+--------+



In [10]:
df.selectExpr("*", 
              """
            ROUND(
            CASE
            WHEN LOWER(product_name) LIKE '%Nike%' THEN product_price * 1.2
            WHEN LOWER(product_name) LIKE '%Armour%' THEN product_price * 1.5 
            ELSE product_price
            END, 2) AS new_price             
""").drop("product_price").withColumnRenamed("new_price", "product_price").show()

+----------+-------------------+-----------------+--------+-------------+
|product_id|product_category_id|     product_name|quantity|product_price|
+----------+-------------------+-----------------+--------+-------------+
|         1|                  2|  Quest Q64 10 FT|       1|        59.98|
|         2|                  2|  Nike Men Finger|       5|       129.99|
|         3|                  2|       Nike Adult|       2|        89.99|
|         4|                  2| Under Armour Men|      10|        89.99|
|         5|                  2|Under Armour Kids|       1|       199.99|
+----------+-------------------+-----------------+--------+-------------+

