**Data Cleaning of E-Commerce data**

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("E-commerce data cleaning")\
.getOrCreate()

25/06/13 05:20:02 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
spark

In [1]:
hdfs_path = "/data/olist/"

In [2]:
customers_df = spark.read.csv(hdfs_path + "olist_customers_dataset.csv", header = True, inferSchema= True)

                                                                                

In [3]:
product_category_df = spark.read.csv(hdfs_path + "product_category_name_translation.csv", header = True, inferSchema= True)
geolocation_df = spark.read.csv(hdfs_path + "olist_geolocation_dataset.csv", header = True, inferSchema= True)
order_items_df = spark.read.csv(hdfs_path + "olist_order_items_dataset.csv", header = True, inferSchema= True)
payments_df = spark.read.csv(hdfs_path + "olist_order_payments_dataset.csv", header = True, inferSchema= True)
reviews_df = spark.read.csv(hdfs_path + "olist_order_reviews_dataset.csv", header = True, inferSchema= True)
orders_df = spark.read.csv(hdfs_path + "olist_orders_dataset.csv", header = True, inferSchema= True)
sellers_df = spark.read.csv(hdfs_path + "olist_sellers_dataset.csv", header = True, inferSchema= True)
products_df = spark.read.csv(hdfs_path + "olist_products_dataset.csv", header = True, inferSchema= True)

                                                                                

In [4]:
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [5]:
from pyspark.sql.functions import *

In [6]:
# Identify missing values

def missing_values(df,df_name):
    print(f'Missing Values in {df_name} :')
    df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]).show()

In [7]:
missing_values(customers_df,"customer")

Missing Values in customer :
+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+



In [8]:
missing_values(orders_df,"orders")

Missing Values in orders :




+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|       0|          0|           0|                       0|              160|                        1783|                         2965|                            0|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+



                                                                                

In [9]:
missing_values(order_items_df,"order Items")

Missing Values in order Items :
+--------+-------------+----------+---------+-------------------+-----+-------------+
|order_id|order_item_id|product_id|seller_id|shipping_limit_date|price|freight_value|
+--------+-------------+----------+---------+-------------------+-----+-------------+
|       0|            0|         0|        0|                  0|    0|            0|
+--------+-------------+----------+---------+-------------------+-----+-------------+



**Handle Missing values**

#1. drop missing values (for non-critical columns)

#2. Fill missing values (for numerical columns)

#3. Impute missing values (for contunious data)



In [10]:
orders_df_cleaned = orders_df.na.drop(subset=['order_id',"customer_id","order_status"])

In [11]:
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [12]:
# filling missing values

orders_df_cleaned = orders_df.fillna({"order_delivered_customer_date":"1999-12-31"})

In [13]:
orders_df_cleaned.show(10)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

**Impute Missing Values**

In [14]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=['payment_value'],outputCols=['payment_value_imputed']).setStrategy('mean')

payments_df_cleaned = imputer.fit(payments_df).transform(payments_df)

                                                          

In [15]:
payments_df_cleaned.show()

+--------------------+------------------+------------+--------------------+-------------+---------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|payment_value_imputed|
+--------------------+------------------+------------+--------------------+-------------+---------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|                99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|                24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|                65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|               107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|               128.45|
|298fcdf1f73eb413e...|                 1| credit_card|                   2|        96.12|               

In [16]:
#Standardizing the format

def print_schema(df,df_name):
    print(f'Schema of {df_name}')
    df.printSchema()

In [17]:
print_schema(orders_df,"orders")

Schema of orders
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [18]:
print_schema(customers_df,"customers")

Schema of customers
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [19]:
print_schema(payments_df,"payments")

Schema of payments
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [20]:
# To change the data type of a column

orders_df_example = orders_df_cleaned.withColumn(("order_purchase_timestamp") ,to_date(col("order_purchase_timestamp")))

In [21]:
orders_df_example .show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|              2017-10-02|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|              2018-07-24|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [22]:
# chANGING THE DATA

payments_df_cleaned = payments_df_cleaned.withColumn("payment_type",
                                                     when(col("payment_type")=="boleto","bank Transfer")
                                                     .when(col("payment_type")=="credit_card", "credit Card")
                                                     .otherwise("other"))

In [23]:
payments_df_cleaned.show()

+--------------------+------------------+-------------+--------------------+-------------+---------------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|payment_value_imputed|
+--------------------+------------------+-------------+--------------------+-------------+---------------------+
|b81ef226f3fe1789b...|                 1|  credit Card|                   8|        99.33|                99.33|
|a9810da82917af2d9...|                 1|  credit Card|                   1|        24.39|                24.39|
|25e8ea4e93396b6fa...|                 1|  credit Card|                   1|        65.71|                65.71|
|ba78997921bbcdc13...|                 1|  credit Card|                   8|       107.78|               107.78|
|42fdf880ba16b47b5...|                 1|  credit Card|                   2|       128.45|               128.45|
|298fcdf1f73eb413e...|                 1|  credit Card|                   2|        96.12|      

**Standardizing the format**

In [30]:
def print_Schema(df,df_name):
    print(f'schema of {df_name}:')
    df.printSchema()

In [31]:
print_schema(orders_df,"orders")

Schema of orders
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [32]:
print_schema(customers_df,"customer")

Schema of customer
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [33]:
print_schema(payments_df,"payments")

Schema of payments
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [35]:
customers_df_clean = customers_df.withColumn("customer_zip_code_prefix",col("customer_zip_code_prefix").cast("string"))

In [36]:
customers_df_clean.show(5)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



In [38]:
# Remove duplicate record

customers_df_clean = customers_df_clean.dropDuplicates(["customer_id"])

In [39]:
customers_df_clean.show()



+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|00012a2ce6f8dcda2...|248ffe10d632bebe4...|                    6273|              osasco|            SP|
|000161a058600d590...|b0015e09bb4b6e47c...|                   35550|         itapecerica|            MG|
|000379cdec6255224...|0b83f73b19c2019e1...|                    4841|           sao paulo|            SP|
|0004164d20a9e969a...|104bdb7e6a6cdceaa...|                   13272|            valinhos|            SP|
|000419c5494106c30...|14843983d4a159080...|                   24220|             niteroi|            RJ|
|00050bf6e01e69d5c...|e3cf594a99e810f58...|                   98700|                ijui|            RS|
|00072d033fe2e5906...|b7c13491fd2aecd93...|            

                                                                                

In [43]:
order_with_details = orders_df_cleaned.join(order_items_df,"order_id","left")\
.join(payments_df_cleaned,"order_id","left")\
.join(customers_df_clean,"customer_id","left")

In [45]:
order_with_details.show(5)

[Stage 53:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+------------------+-------------+--------------------+-------------+---------------------+--------------------+------------------------+-------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|payment_sequential| payment_type|payment_installments|payment_value|payment_value_imputed|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+---------------

                                                                                

In [48]:
order_with_total_value = order_with_details.groupBy("order_id")\
.agg(sum("payment_value").alias("Total_Order_value"))

In [50]:
order_with_total_value.show(5)



+--------------------+-----------------+
|            order_id|Total_Order_value|
+--------------------+-----------------+
|118045506e1c1dda0...|           1802.0|
|f44cb69655f8e4d13...|           164.32|
|edcc6b79e8394346b...|           162.63|
|9f98d6530155e3b38...|           316.76|
|949280c70c6d62ec9...|            49.42|
+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [51]:
# Advanced Transformation

In [57]:
quantiles = order_items_df.approxQuantile("price",[0.01,0.99],0.0)
low_cutoff,high_cutoff = quantiles[0],quantiles[1]

In [58]:
low_cutoff,high_cutoff

(9.99, 890.0)

In [55]:
order_items_df.select("price").summary().show()



+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|            112650|
|   mean|120.65373901471354|
| stddev|183.63392805026012|
|    min|              0.85|
|    25%|              39.9|
|    50%|             74.99|
|    75%|             134.9|
|    max|            6735.0|
+-------+------------------+



                                                                                

In [62]:
order_items_cleand_quantile = order_items_df.filter((col("price")>= low_cutoff) & (col("price")<= high_cutoff))

In [63]:
order_items_cleand_quantile.show(5)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+------------

In [66]:
payments_df_cleaned.select("payment_installments").summary().show(5)

+-------+--------------------+
|summary|payment_installments|
+-------+--------------------+
|  count|              103886|
|   mean|   2.853348863176944|
| stddev|  2.6870506738564925|
|    min|                   0|
|    25%|                   1|
+-------+--------------------+
only showing top 5 rows



In [71]:
product_df_cleaned = products_df.withColumn("product_size_category",
                                           when(col("product_weight_g") <500, "small")
                                           .when(col("product_weight_g").between(500,2000),"Medium")
                                                                                        .otherwise("Large"))

In [72]:
product_df_cleaned.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_size_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|                small|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|        

In [74]:
!hadoop fs -ls /data/olist

Found 9 items
-rw-r--r--   2 niteshsoni30 hadoop    9033957 2025-06-11 15:34 /data/olist/olist_customers_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop   61273883 2025-06-11 15:34 /data/olist/olist_geolocation_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop   15438671 2025-06-11 15:34 /data/olist/olist_order_items_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop    5777138 2025-06-11 15:34 /data/olist/olist_order_payments_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop   14451670 2025-06-11 15:34 /data/olist/olist_order_reviews_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop   17654914 2025-06-11 15:34 /data/olist/olist_orders_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop    2379446 2025-06-11 15:34 /data/olist/olist_products_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop     174703 2025-06-11 15:34 /data/olist/olist_sellers_dataset.csv
-rw-r--r--   2 niteshsoni30 hadoop       2613 2025-06-11 15:34 /data/olist/product_category_name_translation.csv


In [76]:
!hadoop fs -mkdir /data/olist_proc

In [77]:
order_with_details.write.mode("overwrite").parquet("/data/olist_proce/cleaned_data_parquet")

                                                                                

In [78]:
!hadoop fs -ls /data/olist_proce

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-06-13 06:19 /data/olist_proce/cleaned_data_parquet


In [81]:
order_with_details.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- payment_value_imputed: double (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- 

In [83]:
product_df_cleaned.write.mode("overwrite").parquet("/data/olist_proce/products_df_cleand_parquet")

                                                                                

In [84]:
!hadoop fs -ls /data/olist_proce

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-06-13 06:19 /data/olist_proce/cleaned_data_parquet
drwxr-xr-x   - root hadoop          0 2025-06-13 06:21 /data/olist_proce/products_df_cleand_parquet


In [85]:
product_df_cleaned.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)
 |-- product_size_category: string (nullable = false)



###CREATE EXTERNAL TABLE cleaned_payments (
    product_id STRING,
    product_category_name STRING,
    product_name_lenght STRING,
    product_description_lenght INT,
    product_photos_qty INT,
    product_weight_g INT,
    product_length_cm INT,
    product_height_cm INT,
    product_width_cm INT,
    product_size_category STRING
)
STORED AS PARQUET
LOCATION "/data/olist_proce/product_df_cleaned.parquet";###

In [None]:
spark.stop()