In [1]:
spark

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName('datacleaning')\
.getOrCreate()

25/09/15 13:29:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
from pyspark.sql.functions import *

In [3]:
hdfs_path = "/data/olist/"

In [5]:
customers_df = spark.read.csv(hdfs_path+"olist_customers_dataset.csv", header=True, inferSchema=True)
geo_df = spark.read.csv(hdfs_path+"olist_geolocation_dataset.csv", header=True, inferSchema=True)
items_df = spark.read.csv(hdfs_path+"olist_order_items_dataset.csv", header=True, inferSchema=True)
payments_df = spark.read.csv(hdfs_path+"olist_order_payments_dataset.csv", header=True, inferSchema=True)
reviews_df = spark.read.csv(hdfs_path+"olist_order_reviews_dataset.csv", header=True, inferSchema=True)
orders_df = spark.read.csv(hdfs_path+"olist_orders_dataset.csv", header=True, inferSchema=True)
products_df = spark.read.csv(hdfs_path+"olist_products_dataset.csv", header=True, inferSchema=True)
sellers_df = spark.read.csv(hdfs_path+"olist_sellers_dataset.csv", header=True, inferSchema=True)
category_name_df = spark.read.csv(hdfs_path+"product_category_name_translation.csv", header=True, inferSchema=True)

                                                                                

In [15]:
dfs = {
    "customers_df": customers_df,
    "geo_df": geo_df,
    "items_df": items_df,
    "payments_df": payments_df,
    "reviews_df": reviews_df,
    "orders_df": orders_df,
    "products_df": products_df,
    "sellers_df": sellers_df,
    "category_name_df": category_name_df
}

In [9]:
def missing_values(df, df_name):
    print(f"Missing Values for: {df_name}")
    df.select([count(when(col(c).isNull(),1)).alias(c) for c in df.columns]).show()

In [16]:
for name,df in dfs.items():
    missing_values(df, name)
    print()

Missing Values for: customers_df
+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+


Missing Values for: geo_df


                                                                                

+---------------------------+---------------+---------------+----------------+-----------------+
|geolocation_zip_code_prefix|geolocation_lat|geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+---------------+---------------+----------------+-----------------+
|                          0|              0|              0|               0|                0|
+---------------------------+---------------+---------------+----------------+-----------------+


Missing Values for: items_df
+--------+-------------+----------+---------+-------------------+-----+-------------+
|order_id|order_item_id|product_id|seller_id|shipping_limit_date|price|freight_value|
+--------+-------------+----------+---------+-------------------+-----+-------------+
|       0|            0|         0|        0|                  0|    0|            0|
+--------+-------------+----------+---------+-------------------+-----+-------------+


Missing Values for: payments_df
+--------+----------

In [17]:
orders_df_cleaned = orders_df.na.drop(subset=['order_id', 'customer_id', 'order_status'])

In [19]:
orders_df_cleaned.show(5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [20]:
orders_df_cleaned = orders_df.fillna({'order_delivered_carrier_date': '9999-12-12'})

In [26]:
orders_df_cleaned.show(10)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [27]:
payments_df.show(5)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 5 rows



In [29]:
payments_df_null = payments_df.withColumn('payment_value', when(col('payment_value')!=99.33,col('payment_value')).otherwise(lit(None)))

In [32]:
payments_df_null.show(5)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|         NULL|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 5 rows



In [35]:
from pyspark.ml.feature import Imputer

imp = Imputer(inputCols = ['payment_value'], outputCols=['payment_value_imputed']).setStrategy('median')
payments_df_cleaned = imp.fit(payments_df_null).transform(payments_df_null)

                                                                                

In [36]:
payments_df_cleaned.show(5)

+--------------------+------------------+------------+--------------------+-------------+---------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|payment_value_imputed|
+--------------------+------------------+------------+--------------------+-------------+---------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|         NULL|                100.0|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|                24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|                65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|               107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|               128.45|
+--------------------+------------------+------------+--------------------+-------------+---------------

In [38]:
payments_df_cleaned = payments_df.withColumn( 'payment_type',
    when(col('payment_type')=='boleto', 'Bank Transfer').
    when(col('payment_type')=='credit_card', 'Credit Card')
)

In [39]:
payments_df_cleaned.show(10)

+--------------------+------------------+-------------+--------------------+-------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|
+--------------------+------------------+-------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1|  Credit Card|                   8|        99.33|
|a9810da82917af2d9...|                 1|  Credit Card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1|  Credit Card|                   1|        65.71|
|ba78997921bbcdc13...|                 1|  Credit Card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1|  Credit Card|                   2|       128.45|
|298fcdf1f73eb413e...|                 1|  Credit Card|                   2|        96.12|
|771ee386b001f0620...|                 1|  Credit Card|                   1|        81.16|
|3d7239c394a212faa...|                 1|  Credit Card|                   3|        51.84|

In [41]:
customers_df.show(5)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



In [42]:
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [43]:
customers_df_cleaned = customers_df.withColumn('customer_zip_code_prefix', col('customer_zip_code_prefix').cast('string'))

In [45]:
customers_df_cleaned.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [46]:
customers_df_cleaned = customers_df.dropDuplicates(['customer_id'])

In [47]:
orders_wit_details = orders_df.join(items_df, 'order_id', 'left')\
.join(payments_df, 'order_id', 'left')\
.join(customers_df, 'customer_id', 'left')

In [49]:
orders_wit_details.show(5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+------------------+------------+--------------------+-------------+--------------------+------------------------+-------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|payment_sequential|payment_type|payment_installments|payment_value|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+------------

                                                                                

In [52]:
orders_with_total = orders_wit_details.groupBy("order_id")\
.agg(sum('payment_value').alias("Total_price_per_order")).orderBy('Total_price_per_order', ascending=False)

In [53]:
orders_with_total.show(5)



+--------------------+---------------------+
|            order_id|Total_price_per_order|
+--------------------+---------------------+
|03caa2c082116e1d3...|            109312.64|
|ab14fdcfbe524636d...|    45256.00000000001|
|1b15974a0141d54e3...|   44048.000000000015|
|2cc9089445046817a...|             36489.24|
|e8fa22c3673b1dd17...|   30185.999999999993|
+--------------------+---------------------+
only showing top 5 rows



                                                                                

In [54]:
quantiles = items_df.approxQuantile('price', [0.01, 0.99], 0.0)
low, high = quantiles[0], quantiles[1]
low, high

                                                                                

(9.99, 890.0)

In [57]:
items_df_cleaned = items_df.filter((col('price')>=low) & (col('price')<=high))

In [58]:
items_df_cleaned.show(5)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+------------

In [61]:
products_df_cleaned = products_df.withColumn(
    'product_size_category',
    when(col('product_weight_g')<500, 'Small')\
    .when(col('product_weight_g')>500, 'Medium')\
    .otherwise('Large')
)

In [62]:
products_df_cleaned.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_size_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|                Small|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|        

In [63]:
total_revenue_per_seller = items_df.groupBy('seller_id')\
.agg(sum('price').alias('Total_revenue')).orderBy('Total_revenue', ascending=False)

In [64]:
total_revenue_per_seller.show(5)



+--------------------+------------------+
|           seller_id|     Total_revenue|
+--------------------+------------------+
|4869f7a5dfa277a7d...|229472.62999999913|
|53243585a1d6dc264...| 222776.0499999998|
|4a3ca9315b744ce9f...| 200472.9199999981|
|fa1c13f2614d7b5c4...|194042.02999999968|
|7c67e1448b00f6e96...|187923.89000000118|
+--------------------+------------------+
only showing top 5 rows



                                                                                

In [65]:
!hadoop fs -ls /data/

Found 1 items
drwxr-xr-x   - nileshnandan_ts hadoop          0 2025-09-15 05:11 /data/olist


In [66]:
!hadoop fs -mkdir /data/processed

In [67]:
orders_wit_details.write.mode('overwrite').parquet('/data/processed/orders_wit_details.parquet')

                                                                                

In [68]:
!hadoop fs -ls /data/processed

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-09-15 15:47 /data/processed/orders_wit_details.parquet
