In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used below to build a per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
# NOTE(review): 'spark.ui.port' = '0' presumably lets Spark choose a free
# UI port -- confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
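# The DataFrame reader uses the builder pattern: each call returns the reader
# so the steps can be chained:
# reader -> format -> header option -> schema option -> load(filepath)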

In [3]:
# Load the orders CSV: the first line is treated as the header and column
# types are inferred from the data.
# NOTE(review): the path is absolute and sits under /user/itv019463, not under
# the `username` resolved when the session was created.
orders_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/user/itv019463/TrendyTechBigData/Week5/data/ordersWithHeader.csv")
)

In [4]:
# Preview the first 20 rows (show()'s default row count, spelled out here).
orders_df.show(n=20)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [5]:
# Print each column's name, inferred type and nullability as a tree.
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
# Rename order_status to status; withColumnRenamed returns a new DataFrame,
# so orders_df itself is left unchanged.
df1 = orders_df.withColumnRenamed(existing="order_status", new="status")

In [7]:
# Preview the first 5 rows to confirm the column is now called "status".
df1.show(n=5)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|         status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
+--------+--------------------+-----------+---------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import *

In [9]:
# Add order_date_new, a timestamp parsed from the string column order_date.
# The original order_date column is kept alongside it.
df2 = df1.withColumn(
    "order_date_new",
    to_timestamp(col("order_date")),
)

In [10]:
# Print the schema to check the type of the newly added order_date_new column.
df2.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date_new: timestamp (nullable = true)



In [11]:
# Preview the first 5 rows, with order_date and order_date_new side by side.
df2.show(n=5)

+--------+--------------------+-----------+---------------+-------------------+
|order_id|          order_date|customer_id|         status|     order_date_new|
+--------+--------------------+-----------+---------------+-------------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|2013-07-25 00:00:00|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|2013-07-25 00:00:00|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|2013-07-25 00:00:00|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|2013-07-25 00:00:00|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|2013-07-25 00:00:00|
+--------+--------------------+-----------+---------------+-------------------+
only showing top 5 rows



In [12]:
# Stop the Spark session created in the first cell.
spark.stop()