# Staging Zone
Store the datasets in the staging zone, using a suitable schema, file format, and partitioning scheme.

In [0]:
# Display the `spark` session object to confirm that it is available.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# it is injected by the notebook runtime; confirm before running elsewhere.
spark

In [0]:
%fs ls /FileStore/tables/customers.json

path,name,size,modificationTime
dbfs:/FileStore/tables/customers.json,customers.json,15815082,1704000459000


In [0]:
# Load the customer records from the JSON file; no schema is supplied,
# so Spark infers the column types from the data.
custj = spark.read.format("json").load("/FileStore/tables/customers.json")
custj

DataFrame[City: string, County: string, Customer Since: string, E Mail: string, Gender: string, Place Name: string, Region: string, State: string, Zip: bigint, age: double, cust_id: double, full_name: string]

In [0]:
# Load the orders CSV: the first row holds the column names, and column
# types are inferred from the data rather than declared up front.
orderc = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/orders.csv")
)
orderc

DataFrame[order_id: string, order_date: date, status: string, item_id: double, qty_ordered: double, price: double, value: double, discount_amount: double, total: double, category: string, payment_method: string, cust_id: double, year: int, month: string]

In [0]:
# Inner equi-join of customers and orders on the shared `cust_id` key;
# joining by column name keeps a single `cust_id` column in the result.
joined_df = custj.join(orderc, on="cust_id", how="inner")

In [0]:
# Preview the first 20 joined rows (long cell values are truncated).
# NOTE(review): the preview includes customer names and e-mail addresses —
# consider clearing this output before sharing the notebook.
joined_df.show(n=20, truncate=True)

+-------+------+--------+--------------+--------------------+------+----------+------+-----+-----+----+-----------+---------+----------+--------------+--------+-----------+-----+------+-----------------+---------+-----------------+--------------+----+--------+
|cust_id|  City|  County|Customer Since|              E Mail|Gender|Place Name|Region|State|  Zip| age|  full_name| order_id|order_date|        status| item_id|qty_ordered|price| value|  discount_amount|    total|         category|payment_method|year|   month|
+-------+------+--------+--------------+--------------------+------+----------+------+-----+-----+----+-----------+---------+----------+--------------+--------+-----------+-----+------+-----------------+---------+-----------------+--------------+----+--------+
|60124.0|Vinson|  Harmon|     8/22/2006|jani.titus@gmail.com|     F|    Vinson| South|   OK|73571|43.0|Titus, Jani|100354678|2020-10-01|      received|574772.0|       21.0| 89.9|1798.0|              0.0|   1798.0|    

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, LongType, DateType, IntegerType
from pyspark.sql.functions import col

In [0]:
# Target schema for the joined customers + orders dataset.
# Customer attributes come first, followed by the order attributes; every
# field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        # --- customer attributes ---
        ("cust_id", DoubleType()),
        ("City", StringType()),
        ("County", StringType()),
        ("Customer Since", StringType()),
        ("E Mail", StringType()),
        ("Gender", StringType()),
        ("Place Name", StringType()),
        ("Region", StringType()),
        ("State", StringType()),
        ("Zip", LongType()),
        ("age", DoubleType()),
        ("full_name", StringType()),
        # --- order attributes ---
        ("order_id", StringType()),
        ("order_date", DateType()),
        ("status", StringType()),
        ("item_id", DoubleType()),
        ("qty_ordered", DoubleType()),
        ("price", DoubleType()),
        ("value", DoubleType()),
        ("discount_amount", DoubleType()),
        ("total", DoubleType()),
        ("category", StringType()),
        ("payment_method", StringType()),
        ("year", IntegerType()),
        ("month", StringType()),
    ]
])


In [0]:
# Project joined_df onto the declared schema: select the columns in schema
# order and cast each one to its declared data type.
joined_df = joined_df.select(
    *(col(field.name).cast(field.dataType) for field in schema.fields)
)

In [0]:
# Persist the typed, joined DataFrame as Parquet, replacing whatever already
# exists at the target path.
# NOTE(review): the path looks like a placeholder and the write is
# unpartitioned — confirm the real target location and whether partition
# columns are wanted.
landing_zone_path = "/path/to/landing/zone"
joined_df.write.mode("overwrite").parquet(landing_zone_path)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1899057920654282>, line 4[0m
[1;32m      1[0m [43mjoined_df[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mpartitionBy[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mtransaction_date[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mcategory[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mRegion[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m [49m[43m\[49m
[1;32m      2[0m [43m               [49m[38;5;241;43m.[39;49m[43mformat[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mparquet[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m [49m[43m\[49m
[1;32m      3[0m [43m               [49m[38;5;241;43m.[39;49m[43mmode[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43m