In [108]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit,from_json,get_json_object
from pyspark.sql.types import StructType,StructField, StringType,IntegerType


spark = SparkSession.builder.appName("q1 answer").getOrCreate()

In [109]:
spark

In [110]:
schema = StructType([ \
    StructField("OrderId",StringType(),True), \
    StructField("OrderItemId",StringType(),True), \
    StructField("QuantityOrdered",IntegerType(),True), \
    StructField("ItemPrice", IntegerType(), True), \
    StructField("PromotionDiscount", StringType(), True), \
    StructField("batch_id", IntegerType(), True) \
  ])
 

In [117]:
df1 = spark.read.options(header=True, quote = '"').schema(schema).csv("order_region_a.csv")

In [118]:
df2 = spark.read.options(header=True).schema(schema).csv("order_region_b.csv")

In [119]:
df1 = df1.withColumn("region",lit("A"))
df2 = df2.withColumn("region",lit("B"))


In [121]:
df1.show(truncate=False)
df1.count()

+-------------------+-----------+---------------+---------+----------------------------+--------+------+
|OrderId            |OrderItemId|QuantityOrdered|ItemPrice|PromotionDiscount           |batch_id|region|
+-------------------+-----------+---------------+---------+----------------------------+--------+------+
|171-0001135-1657958|1.11689E+13|1              |949      |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0001497-9165123|1.97603E+13|1              |699      |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0002127-1363507|5.94976E+12|1              |399      |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0002370-0601169|5.75719E+13|1              |499      |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0004526-2028348|3.38513E+13|1              |1699     |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0004781-3853173|4.36861E+13|1              |399      |"{ ""CurrencyCode"": ""INR""|null    |A     |
|171-0004947-4305927|1.59414E+13|1              |1399  

44494

In [115]:
df2.show(truncate=False)
df2.count()

+-------------------+-----------+---------------+---------+----------------------------+--------+------+
|OrderId            |OrderItemId|QuantityOrdered|ItemPrice|PromotionDiscount           |batch_id|region|
+-------------------+-----------+---------------+---------+----------------------------+--------+------+
|171-0001135-1657958|1.11689E+13|1              |949      |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0001497-9165123|1.97603E+13|1              |699      |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0002127-1363507|5.94976E+12|1              |399      |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0002370-0601169|5.75719E+13|1              |499      |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0004526-2028348|3.38513E+13|1              |1699     |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0004781-3853173|4.36861E+13|1              |399      |"{ ""CurrencyCode"": ""INR""|null    |B     |
|171-0004947-4305927|1.59414E+13|1              |1399  

44494

In [69]:
df = df1.union(df2)

In [70]:
df.count()

88988

In [71]:
df = df.drop_duplicates(subset=['OrderId'])

In [72]:
df.count()

41107

In [73]:
df = df.withColumn("total_sales",df["QuantityOrdered"] * df["ItemPrice"])

In [None]:
df = df.filter(df["total_sales"] >= 0)

In [76]:
promoSchema = StructType([ \
    StructField("CurrencyCode",StringType(),True), \
    StructField("Amount",StringType(),True)
  ])

In [77]:
df = df.withColumn("discount",from_json(df["PromotionDiscount"],promoSchema))

In [78]:
df.show()

+-------------------+-----------+---------------+---------+--------------------+--------------------+------+-----------+------------+
|            OrderId|OrderItemId|QuantityOrdered|ItemPrice|   PromotionDiscount|            batch_id|region|total_sales|    discount|
+-------------------+-----------+---------------+---------+--------------------+--------------------+------+-----------+------------+
|171-0006030-2254725|3.14562E+13|              1|      499|"{ ""CurrencyCode...| ""Amount"": ""10...|     A|      499.0|{null, null}|
|171-0008037-3788355|2.24569E+13|              1|      499|"{ ""CurrencyCode...| ""Amount"": ""10...|     A|      499.0|{null, null}|
|171-0008662-0057942|6.63267E+13|              1|      599|"{ ""CurrencyCode...| ""Amount"": ""10...|     A|      599.0|{null, null}|
|171-0010803-5365973|2.11352E+13|              1|      299|"{ ""CurrencyCode...| ""Amount"": ""10...|     A|      299.0|{null, null}|
|171-0015668-5065935|8.91807E+12|              1|      699|"{ 

In [None]:
df = df.withColumn("net_sale",df["total_sales"] - df["PromotionDiscount"])

In [None]:
df.show(truncate=False)