In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions

In [10]:
# Obtain the session for this notebook; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DF Basics")
    .getOrCreate()
)

spark

In [11]:
# Load the raw orders file. It is read with header=false, so Spark assigns
# positional column names; column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", "false")
    .option("inferSchema", "true")
    .load("./dataset/orders_sh.csv")
)

In [12]:
# Bare expression: the notebook echoes the DataFrame's repr (column names and types).
df

DataFrame[_c0: int, _c1: timestamp, _c2: int, _c3: string]

In [13]:
# Swap the positional default names (_c0 .. _c3) for meaningful ones.
# toDF takes the new names positionally, one per existing column.
headers = [
    "order_id",
    "order_date",
    "cust_id",
    "order_status",
]
df = df.toDF(*headers)
df

DataFrame[order_id: int, order_date: timestamp, cust_id: int, order_status: string]

In [14]:
# Preview the first five rows of the renamed DataFrame.
df.show(n=5)

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
+--------+-------------------+-------+---------------+
only showing top 5 rows



In [15]:
# Print each column's name, type, and nullability as a tree.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Add a new column

In [16]:
# Append a string-typed copy of cust_id; the original integer column is kept.
df_newCol = df.withColumn(
    "cust_id_String",
    functions.col("cust_id").cast("string"),
)
df_newCol.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)
 |-- cust_id_String: string (nullable = true)



### Rename a column

In [17]:
# Rename the derived string column; every other column is left as is.
df_renamedCol = df_newCol.withColumnRenamed(
    "cust_id_String",
    "customer_id",
)
df_renamedCol.show(n=5)

+--------+-------------------+-------+---------------+-----------+
|order_id|         order_date|cust_id|   order_status|customer_id|
+--------+-------------------+-------+---------------+-----------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|      11599|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|        256|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|      12111|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|       8827|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|      11318|
+--------+-------------------+-------+---------------+-----------+
only showing top 5 rows



### Select or drop columns

In [18]:
# Keep only the listed columns (allow-list approach).
df_select = df_renamedCol.select(
    "order_id",
    "order_status",
)
df_select.show(n=5)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
|       3|       COMPLETE|
|       4|         CLOSED|
|       5|       COMPLETE|
+--------+---------------+
only showing top 5 rows



In [19]:
# Remove the listed columns (deny-list approach); what remains here is
# order_id and order_status.
df_drop = df_renamedCol.drop(
    "order_date",
    "cust_id",
    "customer_id",
)
df_drop.show(n=5)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
|       3|       COMPLETE|
|       4|         CLOSED|
|       5|       COMPLETE|
+--------+---------------+
only showing top 5 rows



### Write the DataFrame to a CSV file

In [20]:
# Target path for the CSV output.
directory_path = "./dataset/write_CSV"

# Write with a header row; "overwrite" mode replaces any existing output
# at this path.
(
    df_select.write
    .mode("overwrite")
    .option("header", True)
    .csv(directory_path)
)