In [1]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
# The OS login name decides where the SQL warehouse directory lives.
username = getpass.getuser()
spark = (
    SparkSession.builder
    # Port 0 lets Spark pick any free port for the UI (see Spark configuration docs).
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load the orders CSV through the generic reader API:
#   header      -> take column names from the first line of the file
#   inferSchema -> let Spark work out each column's type from the data
orders_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/itv019463/TrendyTechBigData/Week5/data/ordersWithHeader.csv")
)

In [3]:
# Preview the first 5 rows (long values are truncated in the default display).
orders_df.show(n=5)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
+--------+--------------------+-----------+---------------+
only showing top 5 rows



In [4]:
# Keep only the rows whose customer_id is 11599; the predicate is given as a
# SQL-expression string.
filtered_df = orders_df.where("customer_id = 11599")

In [5]:
# Display the filtered rows with the default settings; long values such as
# order_date are cut off with "..." in this view.
filtered_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|       1|2013-07-25 00:00:...|      11599|      CLOSED|
|   11397|2013-10-03 00:00:...|      11599|    COMPLETE|
|   23908|2013-12-20 00:00:...|      11599|    COMPLETE|
|   53545|2014-06-27 00:00:...|      11599|     PENDING|
|   59911|2013-10-17 00:00:...|      11599|  PROCESSING|
+--------+--------------------+-----------+------------+



In [6]:
# Same rows again, but with truncation off so every value is printed in full.
filtered_df.show(truncate=False)

+--------+---------------------+-----------+------------+
|order_id|order_date           |customer_id|order_status|
+--------+---------------------+-----------+------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED      |
|11397   |2013-10-03 00:00:00.0|11599      |COMPLETE    |
|23908   |2013-12-20 00:00:00.0|11599      |COMPLETE    |
|53545   |2014-06-27 00:00:00.0|11599      |PENDING     |
|59911   |2013-10-17 00:00:00.0|11599      |PROCESSING  |
+--------+---------------------+-----------+------------+



In [7]:
# Same predicate expressed with filter() instead of where(); PySpark documents
# where() as an alias of filter(). Note that this rebinds filtered_df.
filtered_df = orders_df.filter("customer_id = 11599")

In [8]:
# Print the filter() result without truncating any column value.
filtered_df.show(truncate=False)

+--------+---------------------+-----------+------------+
|order_id|order_date           |customer_id|order_status|
+--------+---------------------+-----------+------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED      |
|11397   |2013-10-03 00:00:00.0|11599      |COMPLETE    |
|23908   |2013-12-20 00:00:00.0|11599      |COMPLETE    |
|53545   |2014-06-27 00:00:00.0|11599      |PENDING     |
|59911   |2013-10-17 00:00:00.0|11599      |PROCESSING  |
+--------+---------------------+-----------+------------+



In [4]:
# Shortcut reader: csv() takes the path and the CSV options as keyword
# arguments instead of chaining format()/option()/load().
orders_df_2 = spark.read.csv(
    "/user/itv019463/data/ordersWithHeader.csv",
    header="true",
    inferSchema="true",
)

In [5]:
# Preview the first 5 rows loaded through the csv() shortcut.
orders_df_2.show(n=5)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
+--------+--------------------+-----------+---------------+
only showing top 5 rows



In [6]:
# Load the orders dataset from JSON. No schema or options are passed here, so
# column names and types come from whatever Spark derives from the file.
orders_df_json = spark.read.json("/public/trendytech/datasets/orders.json")

In [7]:
# Preview the first 5 rows of the JSON-backed DataFrame.
orders_df_json.show(n=5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



In [9]:
# Print each column's name, data type and nullability for the JSON-backed DataFrame.
orders_df_json.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [10]:
# Load the orders dataset from Parquet. The trailing "/*" is a glob over the
# entries directly under the ordersparquet directory.
orders_df_parquet = spark.read.parquet("/public/trendytech/datasets/ordersparquet/*")

In [11]:
# Preview the first 5 rows of the Parquet-backed DataFrame.
orders_df_parquet.show(n=5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



In [12]:
# Print each column's name, data type and nullability for the Parquet-backed DataFrame.
orders_df_parquet.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [14]:
# Load the orders dataset from ORC; the path given is the ordersorc location
# itself, with no glob pattern.
orders_df_orc = spark.read.orc("/public/trendytech/datasets/ordersorc")

In [15]:
# Preview the first 5 rows of the ORC-backed DataFrame.
orders_df_orc.show(n=5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



In [16]:
# Print each column's name, data type and nullability for the ORC-backed DataFrame.
orders_df_orc.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)

