
# Staging Zone

Read the data from the staging tables.

In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, LongType, DateType, IntegerType
import pyspark.sql.functions as F

# Switch Spark's datetime parsing/formatting to the LEGACY policy (the pre-Spark-3.0 behaviour).
# NOTE(review): no date-string parsing is visible in this part of the notebook — confirm this
# session-wide setting is still required before keeping it.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [0]:
%sql

-- Make staging_assignment the session's default database, then list the tables it holds

USE staging_assignment;
SHOW TABLES;

database,tableName,isTemporary
staging_assignment,customers,False
staging_assignment,orders,False



# Curated Zone

In [0]:
%sql 

-- Recreate the curated database from scratch.
-- WARNING: DROP ... CASCADE also removes every table inside curated_assignment,
-- so re-running this cell discards all curated tables written so far.

DROP DATABASE IF EXISTS curated_assignment CASCADE;

CREATE DATABASE curated_assignment; 


## Read Staging Tables

In [0]:
# Load both staging tables as Spark DataFrames and mark them for caching,
# since they are reused by the cells below.

customers_df = spark.read.table("staging_assignment.customers").cache()
orders_df = spark.read.table("staging_assignment.orders").cache()

# Echo the orders frame so the cell output shows its schema
orders_df

Out[8]: DataFrame[Order_Id: int, Order_Date: date, Status: string, Item_Id: int, Qty_Ordered: double, Price: double, Value: double, Discount_Amount: double, Total: double, Category: string, Payment_Method: string, Cust_Id: int, Year: int, Month: string]

In [0]:
# Preview the first three customer rows
display(customers_df.take(3))

City,County,Customer_Since,E_Mail,Gender,Place_Name,Region,State,Zip,Age,Cust_Id,Full_Name
Eagletown,McCurtain,2016-10-12,michelle.stabler@yahoo.com,F,Eagletown,South,OK,74734,34,87387,"Stabler, Michelle"
Belmont,Sabine,2016-05-17,peter.perreira@ntlworld.com,M,Belmont,South,LA,71406,62,87393,"Perreira, Peter"
Nelsonia,Accomack,2016-10-29,miss.dale@ibm.com,F,Nelsonia,South,VA,23414,69,87404,"Dale, Miss"


In [0]:
# Preview the first three order rows
display(orders_df.take(3))

Order_Id,Order_Date,Status,Item_Id,Qty_Ordered,Price,Value,Discount_Amount,Total,Category,Payment_Method,Cust_Id,Year,Month
100496528,2021-04-29,canceled,804885,3.0,50.0,100.0,0.0,100.0,Others,Payaxis,57890,2021,Apr-2021
100464507,2021-03-29,canceled,762085,3.0,600.0,1200.0,0.0,1200.0,Others,Payaxis,91123,2021,Mar-2021
100464508,2021-03-29,canceled,762086,2.0,400.0,400.0,0.0,400.0,Others,Payaxis,91123,2021,Mar-2021



## Define Joint Dataframe

In [0]:
# Build a joint table of order and customer data and derive the customer segment column.
# The left join keeps every order row; the segment is an age bucket:
#   <= 20 young, 21-35 adults, 36-55 middle-ages, > 55 old.
# The branches are evaluated in order, so each one only needs its upper bound.
# With no otherwise() branch, a row whose Age is null gets a null Customer_Segment.

order_customer_df = (
    orders_df
    .join(customers_df, on=["Cust_Id"], how="left")
    .withColumn(
        "Customer_Segment",
        F.when(F.col("Age") <= 20, "young")
         .when(F.col("Age") <= 35, "adults")
         .when(F.col("Age") <= 55, "middle-ages")
         .when(F.col("Age") > 55, "old"),
    )
)

In [0]:
# Preview the first 30 rows of the joined order/customer frame
display(order_customer_df.take(30))

Cust_Id,Order_Id,Order_Date,Status,Item_Id,Qty_Ordered,Price,Value,Discount_Amount,Total,Category,Payment_Method,Year,Month,City,County,Customer_Since,E_Mail,Gender,Place_Name,Region,State,Zip,Age,Full_Name,Customer_Segment
57890,100496528,2021-04-29,canceled,804885,3.0,50.0,100.0,0.0,100.0,Others,Payaxis,2021,Apr-2021,Port Heiden,Lake and Peninsula,2017-01-09,debroah.beem@gmail.com,F,Port Heiden,West,AK,99549,40,"Beem, Debroah",middle-ages
91123,100464507,2021-03-29,canceled,762085,3.0,600.0,1200.0,0.0,1200.0,Others,Payaxis,2021,Mar-2021,Hueysville,Floyd,2004-06-20,sydney.winsor@hotmail.com,M,Hueysville,South,KY,41640,64,"Winsor, Sydney",old
91123,100464508,2021-03-29,canceled,762086,2.0,400.0,400.0,0.0,400.0,Others,Payaxis,2021,Mar-2021,Hueysville,Floyd,2004-06-20,sydney.winsor@hotmail.com,M,Hueysville,South,KY,41640,64,"Winsor, Sydney",old
91129,100502506,2021-04-30,canceled,811088,5.0,50.0,200.0,0.0,200.0,Others,Easypay_MA,2021,Apr-2021,Lincoln,Alcona,2017-06-08,veta.shockey@msn.com,F,Lincoln,Midwest,MI,48742,45,"Shockey, Veta",middle-ages
91141,100464565,2021-03-29,canceled,762176,2.0,600.0,600.0,0.0,600.0,Others,Payaxis,2021,Mar-2021,Topeka,Shawnee,2004-01-26,jonie.yetter@yahoo.com,F,Topeka,Midwest,KS,66610,71,"Yetter, Jonie",old
91146,100464576,2021-03-29,canceled,762201,2.0,400.0,400.0,0.0,400.0,Others,easypay_voucher,2021,Mar-2021,Voca,McCulloch,2010-02-28,buford.sternberg@sbcglobal.net,M,Voca,South,TX,76887,22,"Sternberg, Buford",adults
91177,100464659,2021-03-30,canceled,762338,5.0,600.0,2400.0,0.0,2400.0,Others,easypay_voucher,2021,Mar-2021,Tacoma,Pierce,2016-09-09,randall.briner@rediffmail.com,M,Tacoma,West,WA,98442,74,"Briner, Randall",old
91184,100464689,2021-03-30,complete,762376,1.0,600.0,0.0,0.0,0.0,Others,Payaxis,2021,Mar-2021,North Waterford,Oxford,2008-12-10,terrell.nordquist@bellsouth.net,M,North Waterford,Northeast,ME,4267,42,"Nordquist, Terrell",middle-ages
91184,100472406,2021-04-07,complete,773660,1.0,600.0,0.0,0.0,0.0,Others,Payaxis,2021,Apr-2021,North Waterford,Oxford,2008-12-10,terrell.nordquist@bellsouth.net,M,North Waterford,Northeast,ME,4267,42,"Nordquist, Terrell",middle-ages
91186,100478999,2021-04-17,canceled,783346,6.0,100.0,500.0,0.0,500.0,Others,Payaxis,2021,Apr-2021,Boutte,St. Charles,2006-05-19,billy.shute@hotmail.com,M,Boutte,South,LA,70039,19,"Shute, Billy",young


In [0]:
# Count rows per (Order_Id, Status) pair; a count above 1 means the order spans several rows
display(order_customer_df.groupBy("Order_Id", "Status").count())

Order_Id,Status,count
100464875,canceled,1
100469210,canceled,2
100481367,canceled,1
100500507,canceled,1
100473167,canceled,2
100472971,complete,1
100473437,complete,1
100474054,canceled,1
100496105,canceled,1
100484518,canceled,1


In [0]:
# Spot-check a single order (id 100555558) to see all of its rows
display(order_customer_df.filter(F.col("Order_Id") == 100555558))

Cust_Id,Order_Id,Order_Date,Status,Item_Id,Qty_Ordered,Price,Value,Discount_Amount,Total,Category,Payment_Method,Year,Month,City,County,Customer_Since,E_Mail,Gender,Place_Name,Region,State,Zip,Age,Full_Name,Customer_Segment
110568,100555558,2021-09-08,canceled,893733,2.0,29.9,29.9,0.0,29.9,Mobiles & Tablets,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893732,2.0,12.9,12.9,0.0,12.9,Women's Fashion,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893713,6.0,3.0,15.0,0.0,15.0,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893714,2.0,57.0,57.0,0.0,57.0,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893715,4.0,5.2,15.6,0.0,15.6,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893717,2.0,7.6,7.6,0.0,7.6,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893719,2.0,28.7,28.7,0.0,28.7,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893720,2.0,11.1,11.1,0.0,11.1,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893721,3.0,4.8,9.6,0.0,9.6,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages
110568,100555558,2021-09-08,canceled,893722,2.0,18.3,18.3,0.0,18.3,Superstore,bankalfalah,2021,Sep-2021,East Greenbush,Rensselaer,2016-08-11,gregory.ludlow@aol.com,M,East Greenbush,Northeast,NY,12061,51,"Ludlow, Gregory",middle-ages


In [0]:
# List the column names of the joined frame (the aggregation column lists below are subsets of these)
order_customer_df.columns

Out[15]: ['Cust_Id',
 'Order_Id',
 'Order_Date',
 'Status',
 'Item_Id',
 'Qty_Ordered',
 'Price',
 'Value',
 'Discount_Amount',
 'Total',
 'Category',
 'Payment_Method',
 'Year',
 'Month',
 'City',
 'County',
 'Customer_Since',
 'E_Mail',
 'Gender',
 'Place_Name',
 'Region',
 'State',
 'Zip',
 'Age',
 'Full_Name',
 'Customer_Segment']


## Revenue Pipeline

In [0]:
# Grouping dimensions for the revenue aggregation

revenue_columns = [
    "Status",
    "Category",
    "Payment_Method",
    "Year",
    "Month",
    "City",
    "County",
    "Gender",
    "Region",
    "State",
    "Customer_Segment",
]

In [0]:
# Aggregate revenue: sum of Total for every combination of the revenue dimensions

revenue_agg_df = (
    order_customer_df
    .groupBy(revenue_columns)
    .agg(F.sum("Total").alias("Revenue"))
)

In [0]:
# Preview the first 20 aggregated revenue rows
display(revenue_agg_df.take(20))

Status,Category,Payment_Method,Year,Month,City,County,Gender,Region,State,Customer_Segment,Revenue
canceled,Others,Easypay,2021,Apr-2021,Gaines,Genesee,F,Midwest,MI,old,800.0
complete,Others,Easypay,2021,Apr-2021,Atlantic,Carteret,M,South,NC,middle-ages,360.0
canceled,Others,Payaxis,2021,Apr-2021,Baltimore,Baltimore (city),F,South,MD,middle-ages,0.0
canceled,Others,jazzwallet,2021,Apr-2021,Orange,Orange,F,West,CA,middle-ages,100.0
canceled,Others,Payaxis,2021,Apr-2021,York New Salem,York,F,Northeast,PA,old,600.0
received,Others,Payaxis,2021,Apr-2021,Sailor Springs,Clay,M,Midwest,IL,adults,5200.0
canceled,Others,Easypay,2021,Apr-2021,West Willow,Lancaster,M,Northeast,PA,middle-ages,360.0
canceled,Others,Easypay,2021,Apr-2021,Mamaroneck,Westchester,M,Northeast,NY,old,4000.0
canceled,Others,Payaxis,2021,Apr-2021,Rochester,Monroe,M,Northeast,NY,adults,0.0
canceled,Others,Payaxis,2021,Apr-2021,Lyon,Coahoma,M,South,MS,adults,400.0


In [0]:
# Persist the revenue aggregate as a Delta table in the curated database.
# Overwrite mode plus overwriteSchema replaces both data and schema of an existing table;
# the table is partitioned by Category.

(
    revenue_agg_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("Category")
    .saveAsTable("curated_assignment.revenue_agg")
)

In [0]:
%sql

-- Switch to the curated database and list the tables written so far.
-- NOTE(review): the recorded output of this cell already lists customer_agg, although this
-- notebook only writes that table in the later "Customer Pipeline" section and drops the
-- database near the top. On a fresh top-to-bottom run only revenue_agg should appear here;
-- the cells were apparently executed out of order.
USE curated_assignment;
SHOW TABLES;

database,tableName,isTemporary
curated_assignment,customer_agg,False
curated_assignment,revenue_agg,False



## Customer Pipeline

In [0]:
# Grouping dimensions for the customer aggregation:
# the customer identity columns followed by the same dimensions used for revenue

customer_columns = [
    "Cust_Id",
    "Full_Name",
    "Status",
    "Category",
    "Payment_Method",
    "Year",
    "Month",
    "City",
    "County",
    "Gender",
    "Region",
    "State",
    "Customer_Segment",
]

In [0]:
# Aggregate per customer: sum of Total for every combination of the customer dimensions

customer_agg_df = (
    order_customer_df
    .groupBy(customer_columns)
    .agg(F.sum("Total").alias("Revenue"))
)

In [0]:
# Persist the customer aggregate as a Delta table in the curated database.
# Overwrite mode plus overwriteSchema replaces both data and schema of an existing table;
# the table is partitioned by Category.

(
    customer_agg_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("Category")
    .saveAsTable("curated_assignment.customer_agg")
)