**Data Integration and Optimization**

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("e-comm data integration and optimization")\
.getOrCreate()

25/06/13 11:15:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
spark

In [4]:
hdfs_path = "/data/olist/"

In [5]:
customers_df = spark.read.csv(hdfs_path + "olist_customers_dataset.csv", header = True, inferSchema= True)

                                                                                

In [6]:
product_category_df = spark.read.csv(hdfs_path + "product_category_name_translation.csv", header = True, inferSchema= True)
geolocation_df = spark.read.csv(hdfs_path + "olist_geolocation_dataset.csv", header = True, inferSchema= True)
order_items_df = spark.read.csv(hdfs_path + "olist_order_items_dataset.csv", header = True, inferSchema= True)
payments_df = spark.read.csv(hdfs_path + "olist_order_payments_dataset.csv", header = True, inferSchema= True)
reviews_df = spark.read.csv(hdfs_path + "olist_order_reviews_dataset.csv", header = True, inferSchema= True)
orders_df = spark.read.csv(hdfs_path + "olist_orders_dataset.csv", header = True, inferSchema= True)
sellers_df = spark.read.csv(hdfs_path + "olist_sellers_dataset.csv", header = True, inferSchema= True)
products_df = spark.read.csv(hdfs_path + "olist_products_dataset.csv", header = True, inferSchema= True)

                                                                                

In [7]:
orders_df.cache()
customers_df.cache()
order_items_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [8]:
order_items_joined_df = orders_df.join(order_items_df,"order_id","inner")

In [9]:
order_items_products_df = order_items_joined_df.join(products_df,"product_id","inner") 

In [10]:
orders_items_product_seller_df = order_items_products_df.join(sellers_df,"seller_id","inner")

In [11]:
full_orders_df = orders_items_product_seller_df.join(customers_df,"customer_id","inner")

In [12]:
full_orders_df.show(5)

25/06/13 11:15:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 25:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-------------+------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|           seller_id|          product_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|shipping_limit_date|price|freight_value|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height

                                                                                

In [13]:
# Geolocation Data

full_orders_df = full_orders_df.join(geolocation_df,full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix,"left")

In [14]:
full_orders_df = full_orders_df.join(reviews_df,"order_id","left")

In [15]:
full_orders_df = full_orders_df.join(payments_df,"order_id","left")

In [16]:
full_orders_df.cache()

DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [17]:
# Total Revenue per seller

from pyspark.sql.functions import *

seller_revenue_df = full_orders_df.groupBy("seller_id").agg(sum("price").alias("Total Revenue"))

In [18]:
seller_revenue_df.show(5)



+--------------------+-------------------+
|           seller_id|      Total Revenue|
+--------------------+-------------------+
|b76dba6c951ab00dc...| 302582.65999999904|
|7a67c85e85bb2ce85...|2.031279489000003E7|
|d2374cbcbb3ca4ab1...|  3375517.550000011|
|c8b0e2b0a7095e5d8...| 1573840.0600000015|
|994f04b3718c2bab3...|  661633.6000000022|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [19]:
# Total Orders Per Customer

Total_orders_per_customer = full_orders_df.groupBy("customer_id").agg(count("order_id").alias("total_orders"))



In [20]:
Total_orders_per_customer.show(1)



+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|f35e5fd801be940cb...|          65|
+--------------------+------------+
only showing top 1 row



                                                                                

In [21]:
# Average Review score per seller

average_review_score_per_seller = full_orders_df.groupBy("seller_id").agg(avg("review_score").alias("seller average score"))


In [22]:
average_review_score_per_seller.show(1)



+--------------------+--------------------+
|           seller_id|seller average score|
+--------------------+--------------------+
|7a67c85e85bb2ce85...|   4.258920734844587|
+--------------------+--------------------+
only showing top 1 row



                                                                                

In [23]:
# top customer by spending

top_customer_by_spending = full_orders_df.groupBy("customer_id").agg(sum("price").alias("Total Sales")).orderBy("Total Sales",ascending= False)



In [24]:
top_customer_by_spending.show(5)



+--------------------+-----------+
|         customer_id|Total Sales|
+--------------------+-----------+
|d3e82ccec3cb5f956...|  6662844.0|
|df55c14d1476a9a34...|  3565657.0|
|fe5113a38e3575c04...|  3293604.0|
|ec5b2ba62e5743423...|  2556120.0|
|63b964e79dee32a35...|  2501664.0|
+--------------------+-----------+
only showing top 5 rows



                                                                                

In [25]:
full_orders_df.show(1)

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+
|            order_id|         customer_id|           seller_id|          product_id|order_status|order_purchase_timestamp|  order_approved_

In [26]:
orders_items_product_seller_df = order_items_products_df.join(broadcast(sellers_df),"seller_id","inner")

In [27]:
#Top 10 Most sold products

top_10_most_sold_products = full_orders_df.groupBy("product_id").agg(count("order_id").alias("Top_10_sold_products")).orderBy("Top_10_sold_products",ascending = False).limit(10)

In [28]:
top_10_most_sold_products.show(10)



+--------------------+--------------------+
|          product_id|Top_10_sold_products|
+--------------------+--------------------+
|aca2eb7d00ea1a7b8...|               86740|
|422879e10f4668299...|               81110|
|99a4788cb24856965...|               78775|
|389d119b48cf3043d...|               60248|
|d1c427060a0f73f6b...|               59274|
|368c6c730842d7801...|               58358|
|53759a2ecddad2bb8...|               52654|
|53b36df67ebb7c415...|               52105|
|154e7e31ebfa09220...|               42700|
|3dd2a17168ec895c7...|               40787|
+--------------------+--------------------+



                                                                                

In [29]:
# Top 10 customers by spending

top_10

NameError: name 'top_10' is not defined

**window function and Ranking**

In [None]:
from pyspark.sql.window import Window

In [None]:
# Dense Rank for seller based on Revenue

window_spec = Window.partitionBy("seller_id").orderBy(desc("price"))

In [None]:
# Rank Top Selling Products per Seller

top_seller_products_df = full_orders_df.withColumn("rank",rank().over(window_spec)).filter(col("rank")<=5)

In [None]:
top_seller_products_df.select("seller_id","price","rank").show(5)