In [125]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import os
import statistics as stat
import math
from matplotlib import pyplot as plt
import datetime
import warnings

In [126]:
# Load every CSV used by this notebook, one DataFrame per file.
# There are many datasets, so they are not displayed individually here.
# All paths are relative: a `datasets/` folder must exist in the working directory.
custdb = pd.read_csv('datasets/olist_customers_dataset.csv')
# One row per purchased unit (order_id, order_item_id, product_id, seller_id, price, ...).
orderitems = pd.read_csv('datasets/olist_order_items_dataset.csv')
orderpmt = pd.read_csv('datasets/olist_order_payments_dataset.csv')
# Review score plus optional comment title/message per order.
orderreview = pd.read_csv('datasets/olist_order_reviews_dataset.csv')
# Order header: customer, order_status and the purchase/approval/delivery timestamps.
orderdet = pd.read_csv('datasets/olist_orders_dataset.csv')
# Product attributes, including the original product_category_name.
productdb = pd.read_csv('datasets/olist_products_dataset.csv')
sellerdb = pd.read_csv('datasets/olist_sellers_dataset.csv')
# Lookup from product_category_name to product_category_name_english.
prodtrans = pd.read_csv('datasets/product_category_name_translation.csv')
geolocdb = pd.read_csv('datasets/olist_geolocation_dataset.csv')

In [127]:
# Show every product row that has a missing value in at least one column
# (any column, not only the category name). Nothing is removed in this cell.
productdb.loc[productdb.isna().any(axis='columns')]

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
105,a41e356c76fab66334f36de622ecbd3a,,,,,650.0,17.0,14.0,12.0
128,d8dee61c2034d6d075997acef1870e9b,,,,,300.0,16.0,7.0,20.0
145,56139431d72cd51f19eb9f7dae4d1617,,,,,200.0,20.0,20.0,20.0
154,46b48281eb6d663ced748f324108c733,,,,,18500.0,41.0,30.0,41.0
197,5fb61f482620cb672f5e586bb132eae9,,,,,300.0,35.0,7.0,12.0
...,...,...,...,...,...,...,...,...,...
32515,b0a0c5dd78e644373b199380612c350a,,,,,1800.0,30.0,20.0,70.0
32589,10dbe0fbaa2c505123c17fdc34a63c56,,,,,800.0,30.0,10.0,23.0
32616,bd2ada37b58ae94cc838b9c0569fecd8,,,,,200.0,21.0,8.0,16.0
32772,fa51e914046aab32764c41356b9d4ea4,,,,,1300.0,45.0,16.0,45.0


In [128]:
# Replace the original category name with its English translation; only the
# English name is used from this point on.
# The inner join keeps only products whose category appears in `prodtrans`,
# so rows with a missing category name are dropped as a side effect.
# NOTE(review): a category that is present in `productdb` but absent from
# `prodtrans` would be dropped as well - confirm that this is intended.
engprod = (
    productdb
    .merge(prodtrans, on='product_category_name', how='inner')
    .drop(columns='product_category_name')
)

# Check once more for rows that still have a missing value in any column.
engprod.loc[engprod.isna().any(axis='columns')]

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
4037,09ff539a621711667c43eba6a3bd8466,60.0,865.0,3.0,,,,,baby


Since we're going to focus on the quantity of sales, we are going to keep this row even though its weight and dimensions are missing.

Now, let's clean the other datasets, starting by finding incomplete rows. Below are the datasets that I found to contain incomplete rows. The datasets that aren't listed below have complete rows, meaning they contain no NA values.

In [129]:
# ORDER REVIEWS: list the reviews with a missing value in at least one column.
orderreview.loc[orderreview.isna().any(axis='columns')]

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
...,...,...,...,...,...,...,...
99995,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09 00:00:00,2017-12-11 20:06:42
99996,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43
99997,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01 00:00:00,2018-07-02 12:59:13
99998,be360f18f5df1e0541061c87021e6d93,f8bd3f2000c28c5342fedeb5e50f2e75,1,,Solicitei a compra de uma capa de retrovisor c...,2017-12-15 00:00:00,2017-12-16 01:29:43


It is normal to have NA values in reviews because most of the time customers give a score without writing a title or a comment. So let's just keep these rows as they are.

Now let's take a look at the order status in the order details dataset.

In [173]:
# List the distinct values found in the order_status column.
orderdet.loc[:, 'order_status'].unique()

array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)

From wrangling the data, here is what we can derive regarding the normal flow of a shipment:

Created -> Invoiced -> Approved -> Processing -> Shipped -> Delivered

Note that:
- cancelation can occur anytime
- unavailable meaning the shipping could be delayed

After wrangling some more, this dataset shows the latest condition for each order_id, hence the dataset has unique records. Since our goal is to understand the purchasing behaviour, let's consider the delivered orders for now.

In [132]:
# Keep only the delivered orders, ordered by order_id, then display the ones
# that still have a missing value in any column.
# NOTE(review): `orderdets` is reassigned by the following status cells, so it
# does not keep the delivered subset after they run.
orderdets = (
    orderdet
    .loc[orderdet['order_status'].eq('delivered')]
    .sort_values('order_id')
)
orderdets.loc[orderdets.isna().any(axis='columns')]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
82868,0d3268bad9b086af767785e3f0fc0133,4f1d63d35fb7c8999853b2699f5c7649,delivered,2018-07-01 21:14:02,2018-07-01 21:29:54,2018-07-03 09:28:00,,2018-07-24 00:00:00
23156,12a95a3c06dbaec84bcfb0e2da5d228a,1e101e0daffaddce8159d25a8e53f2b2,delivered,2017-02-17 13:05:55,,2017-02-22 11:23:11,2017-03-02 11:09:19,2017-03-20 00:00:00
98038,20edc82cf5400ce95e1afacc25798b31,28c37425f1127d887d7337f284080a0f,delivered,2018-06-27 16:09:12,2018-06-27 16:29:30,2018-07-03 19:26:00,,2018-07-19 00:00:00
73222,2aa91108853cecb43c84a5dc5b277475,afeb16c7f46396c0ed54acb45ccaaa40,delivered,2017-09-29 08:52:58,2017-09-29 09:07:16,,2017-11-20 19:44:47,2017-11-14 00:00:00
84999,2babbb4b15e6d2dfe95e2de765c97bce,74bebaf46603f9340e3b50c6b086f992,delivered,2017-02-18 17:15:03,,2017-02-22 11:23:11,2017-03-03 18:43:43,2017-03-31 00:00:00
3002,2d1e2d5bf4dc7227b3bfebb81328c15f,ec05a6d8558c6455f0cbbd8a420ad34f,delivered,2017-11-28 17:44:07,2017-11-28 17:56:40,2017-11-30 18:12:23,,2017-12-18 00:00:00
92643,2d858f451373b04fb5c984a1cc2defaf,e08caf668d499a6d643dafd7c5cc498a,delivered,2017-05-25 23:22:43,2017-05-25 23:30:16,,,2017-06-23 00:00:00
43834,2ebdfc4f15f23b91474edf87475f108e,29f0540231702fda0cfdee0a310f11aa,delivered,2018-07-01 17:05:11,2018-07-01 17:15:12,2018-07-03 13:57:00,,2018-07-30 00:00:00
61743,2eecb0d85f281280f79fa00f9cec1a95,a3d3c38e58b9d2dfb9207cab690b6310,delivered,2017-02-17 17:21:55,,2017-02-22 11:42:51,2017-03-03 12:16:03,2017-03-20 00:00:00
72407,3c0b8706b065f9919d0505d3b3343881,d85919cb3c0529589c6fa617f5f43281,delivered,2017-02-17 15:53:27,,2017-02-22 11:31:30,2017-03-03 11:47:47,2017-03-23 00:00:00


In [133]:
# Keep only the orders flagged 'unavailable', ordered by order_id, then display
# the ones with a missing value in any column.
# NOTE(review): this overwrites whatever `orderdets` held before this cell.
orderdets = (
    orderdet
    .loc[orderdet['order_status'].eq('unavailable')]
    .sort_values('order_id')
)
orderdets.loc[orderdets.isna().any(axis='columns')]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
35138,0010dedd556712d7bb69a19cb7bbd37a,3a92efdb6e6163dc1734d44f2f5f6d04,unavailable,2017-10-21 19:32:06,2017-10-24 03:25:32,,,2017-11-03 00:00:00
83798,00a500bc03bc4ec968e574c2553bed4b,3d2f26eab3f79dd1fe9977f615e70c2f,unavailable,2017-11-23 10:53:01,2017-11-25 10:54:38,,,2018-01-08 00:00:00
26047,00bca4adac549020c1273714d04d0208,1541ebabf956d17f3afe883790bd7dd3,unavailable,2017-02-14 14:49:29,2017-02-14 15:03:38,,,2017-03-13 00:00:00
79792,00daac8efd71674d62356c2a306d1e4c,2b99e152f2351410951c503a2f6a2d7e,unavailable,2017-11-21 19:20:25,2017-11-21 19:29:26,,,2017-12-12 00:00:00
2971,0130f0f71fb0e831d18e6a3b33a3a50c,e56f6dff7b8ac9813ddeb2312f0d834f,unavailable,2017-05-12 10:17:23,2017-05-13 03:10:13,,,2017-05-22 00:00:00
...,...,...,...,...,...,...,...,...
88392,fdcca0e15a4d03e3fb89fb14664a3744,58ee086556a160d66dc3f6d43b4efcc3,unavailable,2017-09-11 20:02:08,2017-09-11 20:15:25,,,2017-10-02 00:00:00
61608,fddbd183387b5c9bcbafbd0fe965301f,bade46d49478a63c01dca6a9ae49dfa9,unavailable,2017-09-23 10:25:00,2017-09-26 03:24:31,,,2017-10-18 00:00:00
40833,fe87d4b944748f63ca5ed22cc55b6fb6,d992eb012e1599214218191d39c99693,unavailable,2017-12-05 09:20:36,2017-12-05 15:14:07,,,2017-12-21 00:00:00
10180,feae5ecdf2cc16c1007741be785fe3cd,ca05f8b53b1ad2a64bf34baa8aa7f4f6,unavailable,2017-11-11 16:41:06,2017-11-11 16:56:15,,,2017-11-27 00:00:00


In [134]:
# Keep only the canceled orders, ordered by order_id, then display the ones
# with a missing value in any column.
# NOTE(review): after this cell `orderdets` holds the canceled subset; confirm
# that later cells do not expect the delivered orders under this name.
orderdets = (
    orderdet
    .loc[orderdet['order_status'].eq('canceled')]
    .sort_values('order_id')
)
orderdets.loc[orderdets.isna().any(axis='columns')]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
85228,00310b0c75bb13015ec4d82d341865a4,0dad07848c618cc5a4679a1bfe1db8d2,canceled,2018-08-15 14:29:08,2018-08-15 15:04:25,,,2018-08-29 00:00:00
24643,00ae7a8b4936674ebb701d4a23719a79,a7e8a3cb55d9c9a536992c6465a46a3f,canceled,2018-05-09 14:14:06,2018-05-09 15:51:32,,,2018-05-17 00:00:00
1130,00b1cb0320190ca0daa2c88b35206009,3532ba38a3fd242259a514ac2b6ae6b6,canceled,2018-08-28 15:26:39,,,,2018-09-12 00:00:00
16536,00d0ffd14774da775ac832ba8520510f,a2f3de2a0a84803cef018776a2e76c9b,canceled,2017-12-23 22:25:00,,,,2018-01-19 00:00:00
51435,00ff0cf5583758e6964723e42f111bf4,e3626ed5d1f2e3a02d692ec948b25eeb,canceled,2017-04-10 00:45:56,2017-04-10 01:03:29,,,2017-05-04 00:00:00
...,...,...,...,...,...,...,...,...
76489,fc3c882665c98c9b737a7b1b3aa6c553,01866d949d55c81c28d255114948b72b,canceled,2018-04-17 18:41:50,2018-04-17 18:55:18,,,2018-05-04 00:00:00
33650,fd4c3a2912e854eedd463b329540da4b,d2fdd1d3725bccaa0900e360a985398f,canceled,2017-10-28 11:18:48,,,,2017-11-21 00:00:00
48488,fdbbb1715d0c62c714e2a8178b95dd54,4eee43e4ae51748483930117ac32369d,canceled,2018-05-20 21:30:58,2018-05-20 21:50:38,,,2018-05-29 00:00:00
66461,fe9aa3b22b4d65ccbaffb57984bc12fb,e14603210bf5355859bea3b45bbad044,canceled,2017-11-14 19:30:10,2017-11-14 19:50:28,,,2017-11-28 00:00:00


In [135]:
# Display all orders, whatever their status, sorted by order_id.
orderdet.sort_values('order_id')

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
85267,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29 00:00:00
71853,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,delivered,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15 00:00:00
6298,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,delivered,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05 00:00:00
22550,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,delivered,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20 00:00:00
5247,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,delivered,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17 00:00:00
...,...,...,...,...,...,...,...,...
79550,fffc94f6ce00a00581880bf54a75a037,b51593916b4b8e0d6f66f2ae24f2673d,delivered,2018-04-23 13:57:06,2018-04-25 04:11:01,2018-04-25 12:09:00,2018-05-10 22:56:40,2018-05-18 00:00:00
70155,fffcd46ef2263f404302a634eb57f7eb,84c5d4fbaf120aae381fad077416eaa0,delivered,2018-07-14 10:26:46,2018-07-17 04:31:48,2018-07-17 08:05:00,2018-07-23 20:31:55,2018-08-01 00:00:00
52699,fffce4705a9662cd70adb13d4a31832d,29309aa813182aaddc9b259e31b870e6,delivered,2017-10-23 17:07:56,2017-10-24 17:14:25,2017-10-26 15:13:14,2017-10-28 12:22:22,2017-11-10 00:00:00
59871,fffe18544ffabc95dfada21779c9644f,b5e6afd5a41800fdf401e0272ca74655,delivered,2017-08-14 23:02:59,2017-08-15 00:04:32,2017-08-15 19:02:53,2017-08-16 21:59:40,2017-08-25 00:00:00


From these extracts, we can interpret the order_status values as follows:
- Shipped: Shipped but not delivered
- Delivered: The product(s) reached the customer
- Unavailable: Unavailable for shipping
- Canceled: Order cancelation

The order items dataset has a particular way of tracking purchases. Each order_id may have multiple order_item_id rows for the same product_id. These rows are not duplicates: each one represents one unit of that product being purchased. For example, let's look at order_id 'f60ce04ff8060152c83c7c97e246d6a8', whose ten rows cover five units each of two products.

In [162]:
# Show every item row that belongs to this single example order.
orderitems.loc[orderitems['order_id'].eq('f60ce04ff8060152c83c7c97e246d6a8')]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
108262,f60ce04ff8060152c83c7c97e246d6a8,1,58efb9b638561ce132216a9a612513e2,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108263,f60ce04ff8060152c83c7c97e246d6a8,2,872db866d615db59612ac933f43d6b22,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108264,f60ce04ff8060152c83c7c97e246d6a8,3,872db866d615db59612ac933f43d6b22,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108265,f60ce04ff8060152c83c7c97e246d6a8,4,58efb9b638561ce132216a9a612513e2,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108266,f60ce04ff8060152c83c7c97e246d6a8,5,872db866d615db59612ac933f43d6b22,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108267,f60ce04ff8060152c83c7c97e246d6a8,6,872db866d615db59612ac933f43d6b22,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108268,f60ce04ff8060152c83c7c97e246d6a8,7,58efb9b638561ce132216a9a612513e2,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108269,f60ce04ff8060152c83c7c97e246d6a8,8,872db866d615db59612ac933f43d6b22,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108270,f60ce04ff8060152c83c7c97e246d6a8,9,58efb9b638561ce132216a9a612513e2,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04
108271,f60ce04ff8060152c83c7c97e246d6a8,10,58efb9b638561ce132216a9a612513e2,701938c450705b8ae65fc923b70f35c7,2017-12-04 22:31:28,109.97,34.04


On the other hand, a single order_id can also contain several different products, with order_item_id simply numbering every unit in the order. See order_id '005d9a5423d47281ac463a968b3936fb' for reference.

In [137]:
# Show every item row that belongs to this second example order.
orderitems.loc[orderitems['order_id'].eq('005d9a5423d47281ac463a968b3936fb')]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
151,005d9a5423d47281ac463a968b3936fb,1,fb7a100ec8c7b34f60cec22b1a9a10e0,d98eec89afa3380e14463da2aabaea72,2017-10-24 12:28:16,49.99,18.12
152,005d9a5423d47281ac463a968b3936fb,2,4c3ae5db49258df0784827bdacf3b396,d98eec89afa3380e14463da2aabaea72,2017-10-24 12:28:16,24.99,13.58
153,005d9a5423d47281ac463a968b3936fb,3,4c3ae5db49258df0784827bdacf3b396,d98eec89afa3380e14463da2aabaea72,2017-10-24 12:28:16,24.99,13.58


In [164]:
# Build the list of (order_id, product_id) pairs, one per purchased unit,
# sorted so that identical pairs sit next to each other for the counting cell.
orderqty = pd.DataFrame({})
tempdf = orderitems[['order_id', 'product_id']]
# `ascending` must contain real booleans. The previous ['False', 'False'] were
# non-empty strings: truthy on old pandas (so the sort was ascending anyway)
# and rejected with a ValueError by current pandas. The ascending order that
# the rest of the notebook was built on is kept explicitly.
tempdf = tempdf.sort_values(['order_id', 'product_id'], ascending=[True, True])
# Pair the two columns row by row, in sorted order.
orderqty['orderproduct'] = list(zip(tempdf['order_id'], tempdf['product_id']))

In [165]:
# Count how many units of each product were bought in each order.
# Every row of `orderitems` is one purchased unit, so the quantity of a product
# within an order is the number of rows sharing that (order_id, product_id) pair.
# groupby().size() replaces the hand-written run-length loop: it does not rely
# on identical pairs being adjacent, avoids per-element Series lookups, and its
# default key sort returns rows in ascending (order_id, product_id) order.
orderqty = (
    orderitems
    .groupby(['order_id', 'product_id'])
    .size()
    .reset_index(name='item_qty')
)

# The merge back onto `orderitems` needs exactly one row per key pair.
assert not orderqty.duplicated(['order_id', 'product_id']).any()
orderqty

Unnamed: 0,order_id,product_id,item_qty
0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,1
1,00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,1
2,000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,1
3,00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,1
...,...,...,...
102420,fffc94f6ce00a00581880bf54a75a037,4aa6014eceb682077f9dc4bffebc05b0,1
102421,fffcd46ef2263f404302a634eb57f7eb,32e07fd915822b0765e448c4dd74c828,1
102422,fffce4705a9662cd70adb13d4a31832d,72a30483855e2eafc67aee5dc2560482,1
102423,fffe18544ffabc95dfada21779c9644f,9c422a519119dcad7575db5af1ba540e,1


In [166]:
# Write the per-order product quantities to order_qty.csv in the working
# directory, without the index column.
orderqty.to_csv('order_qty.csv',index=False)
# NOTE(review): no drop_duplicates step is applied here; this presumes orderqty
# already holds one row per (order_id, product_id) - confirm.

In [172]:
# Collapse `orderitems` from one row per purchased unit to one row per distinct
# line, then attach the purchased quantity from `orderqty`.
# Dropping the per-unit counter `order_item_id` turns repeated units of the
# same product into exact duplicate rows, which drop_duplicates() removes.
# `ascending` must contain real booleans: the previous ['False', 'False'] were
# truthy strings (ascending on old pandas) and raise ValueError on current pandas.
# NOTE(review): this cell overwrites `orderitems`, so it cannot be re-run
# without reloading the raw CSV first.
# NOTE(review): de-duplication compares every remaining column, so a product
# listed with differing seller/price/freight inside one order would keep
# several rows, each receiving the full item_qty - confirm this cannot occur.
orderitems = (
    orderitems
    .drop(columns='order_item_id')
    .sort_values(['order_id', 'product_id'], ascending=[True, True])
    .drop_duplicates()
)
# validate= makes pandas raise if `orderqty` ever holds more than one row per
# key pair, which would otherwise silently multiply rows in the result.
orderitems = pd.merge(
    orderitems,
    orderqty,
    on=['order_id', 'product_id'],
    validate='many_to_one',
)
orderitems

Unnamed: 0,order_id,product_id,seller_id,shipping_limit_date,price,freight_value,item_qty
0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29,1
1,00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93,1
2,000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87,1
3,00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,1
4,00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14,1
...,...,...,...,...,...,...,...
102420,fffc94f6ce00a00581880bf54a75a037,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41,1
102421,fffcd46ef2263f404302a634eb57f7eb,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53,1
102422,fffce4705a9662cd70adb13d4a31832d,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95,1
102423,fffe18544ffabc95dfada21779c9644f,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72,1
