In [1]:
from pyspark.sql import SparkSession
import getpass
# OS account name of whoever runs the notebook; used to build the per-user
# warehouse path below.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN.
# The warehouse directory is set to /user/<username>/warehouse.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# List only the databases whose name starts with the current account name.
# Reuses `username` (already used for the warehouse path) instead of
# repeating the account ID as a literal, so the cell works for any user.
spark.sql("show databases").filter(f"namespace like '{username}%'").show()

+----------------+
|       namespace|
+----------------+
|itv019463_retail|
+----------------+



In [3]:
# Switch the session's current database to itv019463_retail so the following
# statements can refer to its tables without a database prefix.
spark.sql("use itv019463_retail")

In [4]:
# List the tables of the current database.
spark.sql("show tables").show()

+----------------+----------+-----------+
|        database| tableName|isTemporary|
+----------------+----------+-----------+
|itv019463_retail|orders_ext|      false|
+----------------+----------+-----------+



In [5]:
# Print the column list plus the detailed table information for orders_ext;
# truncation is disabled so long values are shown in full.
spark.sql("describe extended orders_ext").show(
    truncate=False
)

+----------------------------+-------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                |comment|
+----------------------------+-------------------------------------------------------------------------+-------+
|order_id                    |int                                                                      |null   |
|order_date                  |string                                                                   |null   |
|customer_id                 |int                                                                      |null   |
|order_status                |string                                                                   |null   |
|                            |                                                                         |       |
|# Detailed Table Information|                                                                  

In [6]:
# Shell escape: list the contents of the Week5 data directory on HDFS.
! hadoop fs -ls /user/itv019463/TrendyTechBigData/Week5/data/

Found 1 items
-rw-r--r--   3 itv019463 supergroup    2999989 2025-06-06 07:51 /user/itv019463/TrendyTechBigData/Week5/data/ordersWithHeader.csv


In [7]:
# Build a DataFrame from a SQL query against the orders_ext table.
# NOTE(review): the displayed result starts with a row whose string columns
# hold the column names and whose int columns are null. That looks like the
# CSV header line being read as data; confirm against the table definition.
dataframe_from_spark_sql = spark.sql("select * from orders_ext")

In [8]:
# Print the leading rows of the query result as a text table.
dataframe_from_spark_sql.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|    null|          order_date|       null|   order_status|
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       91

In [9]:
# Alternative to a SQL string: load the table directly by its
# database-qualified name.
dataframe_from_spark_sql_2 = spark.table("itv019463_retail.orders_ext")

In [10]:
# Print the leading rows; the output matches the SQL-based DataFrame.
dataframe_from_spark_sql_2.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|    null|          order_date|       null|   order_status|
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       91

In [11]:
# One-argument form: a single-column DataFrame named `id` holding 0 through 4
# (the end value 5 is excluded).
spark.range(5)

id
0
1
2
3
4


In [12]:
# Start 0, end 8 (excluded), step 2 -> ids 0, 2, 4, 6.
spark.range(0, 8, 2)

id
0
2
4
6


In [13]:
! hadoop fs -cat /public/trendytech/retail_db/orders/part-00000 | head

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
cat: Unable to write to output stream.


In [14]:
# Three sample order records as plain tuples, in the same field order as the
# raw orders file: (order id, order timestamp, customer id, order status).
orders_list = [
    (1, "2013-07-25 00:00:00.0", 11599, "CLOSED"),
    (2, "2013-07-25 00:00:00.0", 256, "PENDING_PAYMENT"),
    (3, "2013-07-25 00:00:00.0", 12111, "COMPLETE"),
]

In [15]:
# Create a DataFrame from the local list without any schema: the columns get
# the positional names _1 .. _4 and their types are inferred from the values.
orders_raw_df = spark.createDataFrame(orders_list)

In [16]:
# Print the rows; note the auto-generated column names.
orders_raw_df.show()

+---+--------------------+-----+---------------+
| _1|                  _2|   _3|             _4|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
+---+--------------------+-----+---------------+



In [17]:
# Print the inferred schema: the Python ints became long, the strs string.
orders_raw_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



In [18]:
# Rebuild the DataFrame and replace the positional column names with
# descriptive ones; the column types are still inferred from the data.
orders_raw_df = (
    spark.createDataFrame(orders_list)
    .toDF("order_id", "order_date", "customer_id", "order_status")
)

In [19]:
# Print the rows, now under the descriptive column names.
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [20]:
# Print the schema: the names changed, the inferred types (long/string) did not.
orders_raw_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [21]:
# Column names only (no types), listed in the same order as the tuple fields.
orders_schema = [
    "order_id",
    "order_date",
    "cust_id",
    "order_status",
]

In [22]:
# Pass the list of column names as the schema argument: the names are applied
# at creation time while the types are still inferred from the data.
df = spark.createDataFrame(orders_list, orders_schema)

In [23]:
# Print the rows under the names supplied via the schema list.
df.show()

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|   order_status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
+--------+--------------------+-------+---------------+



In [24]:
# Print the schema: the supplied names with inferred long/string types.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [25]:
# Schema as a single "name type, name type, ..." string, giving both the
# column names and their types (cust_id is declared as int here).
orders_schema = (
    "order_id long, "
    "order_date string, "
    "cust_id int, "
    "order_status string"
)

In [26]:
# orders_schema is now the "name type, ..." string, so both the column names
# and the column types are taken from it instead of being inferred.
df = spark.createDataFrame(orders_list, orders_schema)

In [27]:
# Print the rows; the displayed values are the same as with the name-only schema.
df.show()

+--------+--------------------+-------+---------------+
|order_id|          order_date|cust_id|   order_status|
+--------+--------------------+-------+---------------+
|       1|2013-07-25 00:00:...|  11599|         CLOSED|
|       2|2013-07-25 00:00:...|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|  12111|       COMPLETE|
+--------+--------------------+-------+---------------+



In [28]:
# Print the schema: cust_id is now integer, as declared in the schema string,
# rather than the long that inference produced earlier.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- order_status: string (nullable = true)

