In [0]:
df = spark.read.csv('dbfs:/public/retail_db/orders/part-00000', schema='order_id INT, order_date DATE, order_customer_id INT, order_status STRING')

In [0]:
display(df.limit(10))

order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,4530,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,5657,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT


path,name,size,modificationTime
dbfs:/public/retail_db/orders/_SUCCESS,_SUCCESS,0,1695941472313
dbfs:/public/retail_db/orders/_committed_9103897215061897648,_committed_9103897215061897648,288,1695941471769
dbfs:/public/retail_db/orders/_started_9103897215061897648,_started_9103897215061897648,0,1695941468365
dbfs:/public/retail_db/orders/part-00000,part-00000,2999944,1695942912060


In [0]:
sales = [[1000, 12.5], [1200, 10], [750, 20]]

In [0]:
import pandas as pd

In [0]:
sales_df = pd.DataFrame(sales, columns=['sale_amount', 'comission_pct'])

In [0]:
sales_df

Unnamed: 0,sale_amount,comission_pct
0,1000,12.5
1,1200,10.0
2,750,20.0


In [0]:
sales_df.apply(lambda rec: ((rec['sale_amount'] * rec['comission_pct'])/ 100), axis=1)

0    125.0
1    120.0
2    150.0
dtype: float64

In [0]:
df = spark.read.csv('dbfs:/public/retail_db/orders/part-00000', schema='order_id INT, order_date DATE, order_customer_id INT, order_status STRING')

In [0]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [0]:
df.columns

['order_id', 'order_date', 'order_customer_id', 'order_status']

In [0]:
display(df.select('order_date' , 'order_status').distinct().orderBy('order_date', 'order_status').count())

3203

In [0]:
display(df.drop('order_customer_id').limit(10))

order_id,order_date,order_status
1,2013-07-25,CLOSED
2,2013-07-25,PENDING_PAYMENT
3,2013-07-25,COMPLETE
4,2013-07-25,CLOSED
5,2013-07-25,COMPLETE
6,2013-07-25,COMPLETE
7,2013-07-25,COMPLETE
8,2013-07-25,PROCESSING
9,2013-07-25,PENDING_PAYMENT
10,2013-07-25,PENDING_PAYMENT


In [0]:
from pyspark.sql.functions import date_format, cast

In [0]:
display(df.select('order_id', 'order_date', cast('int', date_format('order_date', 'yyyyMM'))).alias('order_month'))

order_id,order_date,"date_format(order_date, yyyyMM)"
1,2013-07-25,201307
2,2013-07-25,201307
3,2013-07-25,201307
4,2013-07-25,201307
5,2013-07-25,201307
6,2013-07-25,201307
7,2013-07-25,201307
8,2013-07-25,201307
9,2013-07-25,201307
10,2013-07-25,201307


In [0]:
display(df.withColumn('order_month', cast('int', date_format('order_date', 'yyyyMM'))).limit(10))

order_id,order_date,order_customer_id,order_status,order_month
1,2013-07-25,11599,CLOSED,201307
2,2013-07-25,256,PENDING_PAYMENT,201307
3,2013-07-25,12111,COMPLETE,201307
4,2013-07-25,8827,CLOSED,201307
5,2013-07-25,11318,COMPLETE,201307
6,2013-07-25,7130,COMPLETE,201307
7,2013-07-25,4530,COMPLETE,201307
8,2013-07-25,2911,PROCESSING,201307
9,2013-07-25,5657,PENDING_PAYMENT,201307
10,2013-07-25,5648,PENDING_PAYMENT,201307


In [0]:
display(df.drop('order_customer_id').withColumn('order_month', cast('int', date_format('order_date', 'yyyyMM'))).limit(10))

order_id,order_date,order_status,order_month
1,2013-07-25,CLOSED,201307
2,2013-07-25,PENDING_PAYMENT,201307
3,2013-07-25,COMPLETE,201307
4,2013-07-25,CLOSED,201307
5,2013-07-25,COMPLETE,201307
6,2013-07-25,COMPLETE,201307
7,2013-07-25,COMPLETE,201307
8,2013-07-25,PROCESSING,201307
9,2013-07-25,PENDING_PAYMENT,201307
10,2013-07-25,PENDING_PAYMENT,201307


In [0]:
df.write.format('delta').save('dbfs:/public/retail_db_delta/orders')

In [0]:
%fs ls dbfs:/public/retail_db_delta/orders

path,name,size,modificationTime
dbfs:/public/retail_db_delta/orders/_delta_log/,_delta_log/,0,1695945375104
dbfs:/public/retail_db_delta/orders/part-00000-0b1d4d44-0165-4eac-9271-026a5c354797-c000.snappy.parquet,part-00000-0b1d4d44-0165-4eac-9271-026a5c354797-c000.snappy.parquet,488399,1695945376269


In [0]:
display(spark.read.format('delta').load('dbfs:/public/retail_db_delta/orders').limit(10))

order_id,order_date,order_customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,4530,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,5657,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT
