In [0]:
# Register the storage-account access key for azuredataengstorage in the Spark
# session configuration. The key is read from the "accessscope" secret scope
# rather than being written into the notebook.
spark.conf.set(
    "fs.azure.account.key.azuredataengstorage.dfs.core.windows.net",
    dbutils.secrets.get(scope="accessscope", key="storageaccountaccesskey"),
)

In [0]:
# Base ABFSS folder for this project; the Raw/ and Curated/ paths used by the
# later cells are built by appending to this prefix (note the trailing slash).
location = (
    "abfss://azuredataengcontainer@azuredataengstorage.dfs.core.windows.net"
    "/Brazil_ECommerce/"
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Explicit schema for the raw CSV. Every column is nullable, and all
# date/timestamp columns are read as plain strings (they are converted to
# dates in a later cell).
raw_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        # Order details
        ("Order_Id", StringType()),
        ("Order_Status", StringType()),
        ("Order_Purchase_Timestamp", StringType()),
        ("Order_Approved_At", StringType()),
        ("Order_Delivered_Carrier_Date", StringType()),
        ("Order_Delivered_Customer_Date", StringType()),
        ("Order_Estimated_Delivery_Date", StringType()),
        ("Shipping_Limit_Date", StringType()),
        # Amounts and quantities
        ("Price", FloatType()),
        ("Freight_Value", FloatType()),
        ("Order_Item_Quantity", IntegerType()),
        # Product
        ("Product_Id", StringType()),
        ("Product_Category", StringType()),
        # Customer
        ("Customer_Id", StringType()),
        ("Customer_Unique_Id", StringType()),
        ("Customer_Zip_Code", StringType()),
        ("Customer_City", StringType()),
        ("Customer_State", StringType()),
        # Seller
        ("Seller_Id", StringType()),
        ("Seller_Zip_Code", StringType()),
        ("Seller_City", StringType()),
        ("Seller_State", StringType()),
        # Review
        ("Review_Id", StringType()),
        ("Review_Score", IntegerType()),
    ]
])

In [0]:
# Load the raw CSV using the explicit schema. With mode=DROPMALFORMED, rows
# that cannot be parsed against the schema are dropped instead of failing the
# read.
# distinct() returns a new DataFrame (DataFrames are immutable), so it is
# chained into the assignment for the de-duplication to actually take effect.
df = (
    spark.read
    .option("mode", "DROPMALFORMED")
    .option("header", True)
    .schema(raw_schema)
    .csv(location + "Raw/brazil_ecommerce_raw.csv")
    .distinct()
)

# Keep only the rows whose order status is "delivered".
df = df.filter(df["Order_Status"] == "delivered")

In [0]:
from pyspark.sql.functions import to_date,datediff,col

# Parse three of the string date columns into DateType columns (new names end
# in "_Dates"), then drop all six raw string date/timestamp columns.
# NOTE(review): the pattern "dd-MM-yyy" looks like a typo for "dd-MM-yyyy" --
# confirm against the raw file's date format before changing it.
df_dates_modified = (
    df
    .withColumn("Order_Purchased_Dates",
                to_date(col("Order_Purchase_Timestamp"), "dd-MM-yyy"))
    .withColumn("Order_Delivered_Customer_Dates",
                to_date(col("Order_Delivered_Customer_Date"), "dd-MM-yyy"))
    .withColumn("Order_Estimated_Delivery_Dates",
                to_date(col("Order_Estimated_Delivery_Date"), "dd-MM-yyy"))
    .drop(*[
        "Order_Purchase_Timestamp",
        "Order_Approved_At",
        "Order_Delivered_Carrier_Date",
        "Order_Delivered_Customer_Date",
        "Order_Estimated_Delivery_Date",
        "Shipping_Limit_Date",
    ])
)

In [0]:
# Append Total_Payment = (Price + Freight_Value) * Order_Item_Quantity as the
# last column, keeping every existing column unchanged.
unit_cost = df_dates_modified["Price"] + df_dates_modified["Freight_Value"]
df_payments = df_dates_modified.withColumn(
    "Total_Payment",
    unit_cost * df_dates_modified["Order_Item_Quantity"],
)

In [0]:
# Spark writes a directory of part files rather than a single named file, so
# the frame is written as one CSV part into a temporary folder, that part is
# copied to a stable file name, and the temporary folder is then removed.
temp_dir = location + "Curated/Temp"

# mode("overwrite") keeps the cell re-runnable: without it the write fails if
# the temporary folder was left behind by an earlier run that did not reach
# the clean-up step.
df_payments.coalesce(1).write.format("csv").mode("overwrite").save(temp_dir, header = True)

filenames = dbutils.fs.ls(temp_dir)

# Find the CSV part file among Spark's output (marker/metadata files are
# skipped by the extension check). If several match, the last one listed wins.
csv_parts = [entry.name for entry in filenames if entry.name.endswith('.csv')]
if not csv_parts:
    # Fail loudly instead of attempting to copy the directory path itself.
    raise FileNotFoundError("No .csv part file found in " + temp_dir)
name = csv_parts[-1]

dbutils.fs.cp(temp_dir + "/" + name, location + "Curated/brazil_e_commerce_curated.csv")
dbutils.fs.rm(temp_dir + "/", recurse = True)

Out[9]: True