In [4]:
import pandas as pd
import numpy as np
from scipy import stats 
import os
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import datetime
import random

In [5]:
# Load each CSV under ecommerce_data/ into its own DataFrame. The target names
# and the paths are listed in the same order, so every name receives the frame
# read from the path at the matching position.
(
    file_cust,
    file_geo,
    file_items,
    file_payments,
    file_reviews,
    file_orders,
    file_products,
    file_sellers,
    file_category,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ecommerce_data/olist_customers_dataset.csv',
        'ecommerce_data/olist_geolocation_dataset.csv',
        'ecommerce_data/olist_order_items_dataset.csv',
        'ecommerce_data/olist_order_payments_dataset.csv',
        'ecommerce_data/olist_order_reviews_dataset.csv',
        'ecommerce_data/olist_orders_dataset.csv',
        'ecommerce_data/olist_products_dataset.csv',
        'ecommerce_data/olist_sellers_dataset.csv',
        'ecommerce_data/product_category_name_translation.csv',
    )
)

In [16]:
# Disabled: per-table NaN removal on the raw frames. Every line in this cell is
# commented out, so nothing here runs; the merged frame is instead filtered
# with dropna() in a later cell.
# file_cust = file_cust.dropna()
# file_geo = file_geo.dropna()
# file_items = file_items.dropna()
# file_payments = file_payments.dropna()
# file_reviews = file_reviews.dropna()
# file_orders = file_orders.dropna()
# file_products = file_products.dropna()
# file_sellers = file_sellers.dropna()
# file_category = file_category.dropna()

In [6]:
# Build one wide table: start from the orders and attach, in turn, the line
# items, payments, reviews, product attributes, customers, sellers and the
# translated category names. Only the first join is a left join; every later
# join is an outer join.
# validate='m:m' is accepted by pandas but does not result in any check.
# NOTE(review): items and payments are both joined on order_id alone, so an
# order with several of each would presumably produce one row per combination
# -- confirm before summing or counting over `data`.
data = (
    file_orders
    .merge(file_items, on='order_id', how='left')
    .merge(file_payments, on='order_id', how='outer', validate='m:m')
    .merge(file_reviews, on='order_id', how='outer')
    .merge(file_products, on='product_id', how='outer')
    .merge(file_cust, on='customer_id', how='outer')
    .merge(file_sellers, on='seller_id', how='outer')
    .merge(file_category, on='product_category_name', how='outer')
)

In [18]:
# Keep only rows that have a value in every column (axis=0 / how='any' are the
# dropna defaults, spelled out here for the reader).
# NOTE(review): in the recorded run the frame is left with 11743 rows while its
# index labels still reach 121999, i.e. most rows are discarded. Presumably
# sparsely filled optional columns (e.g. the review comment fields) drive this
# -- confirm, and consider dropna(subset=[...]) on the required columns only.
data = data.dropna(axis=0, how='any')

In [19]:
# Show the column labels of the merged frame.
data.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'review_id', 'review_score',
       'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'customer_unique_id', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'product_category_name_english'],
      dtype='object')

In [20]:
# Desired columns for the analysis. Every line below is a bare expression, and
# a notebook cell renders only the value of its last expression: data['price']
# is displayed, the other six lookups are evaluated and then discarded.
data['order_id']
data['customer_id']
data['order_purchase_timestamp']
data['order_item_id']
data['product_id']
data['seller_id']
data['price']

11         64.99
12         29.99
40         43.98
77         49.90
80         49.90
           ...  
121896     15.90
121988     19.99
121989     18.99
121990     18.99
121999    749.00
Name: price, Length: 11743, dtype: float64

In [21]:
# Parse every timestamp-like column with pd.to_datetime in a single pass.
# assign() replaces the existing columns in place within the column order and
# returns a new frame, which is bound back to `data`.
data = data.assign(**{
    column: pd.to_datetime(data[column])
    for column in (
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
        'shipping_limit_date',
        'review_creation_date',
        'review_answer_timestamp',
    )
})

In [22]:
# Split the purchase timestamp into a calendar-date column (order_date) and a
# time-of-day column (order_time).
# The .dt accessor does the split without a per-row Python loop and, unlike
# unpacking zip(*rows) into two names, does not raise ValueError when the
# frame has no rows.
_purchase_ts = data['order_purchase_timestamp']
# Kept as tuples so order_dates / order_times have the same type as before.
order_dates = tuple(_purchase_ts.dt.date)
order_times = tuple(_purchase_ts.dt.time)
data = data.assign(order_date=order_dates, order_time=order_times)

In [23]:
# Collapse to one row per order: for each order_id the first row encountered
# is kept and any further rows with the same id are dropped.
unique_orders = data.drop_duplicates(subset=['order_id'], keep='first')
unique_orders

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,...,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,product_category_name_english,order_date,order_time
11,4382f48f29370e70ec4ef6aa3578e1c9,e5cede01799d43206f0e40fcfe683457,delivered,2018-07-29 21:59:48,2018-07-30 18:31:39,2018-07-31 06:14:00,2018-08-03 22:09:34,2018-08-10,1.0,883cea107372a7f07b5830904f74952d,...,59f5e5c0ff8a1ca7026faa8c316693eb,17900.0,dracena,SP,13482.0,limeira,SP,housewares,2018-07-29,21:59:48
12,ff8f8f9123b73422b7a732a02483d9e2,6a029307b24c4e85eb7f8b54d28f531b,delivered,2018-08-18 16:29:06,2018-08-18 16:50:16,2018-08-21 11:27:00,2018-08-24 20:28:43,2018-09-10,1.0,22c28492de598515578d252acb18e21b,...,ef3619723058855152c4c48778a0e656,39900.0,almenara,MG,13482.0,limeira,SP,housewares,2018-08-18,16:29:06
40,e4e6269506ecfa4a94ac0188f2aa387c,9aad35717d9bc6ccafae1631ea8f0564,delivered,2018-06-15 20:18:12,2018-06-15 20:41:19,2018-06-16 08:03:00,2018-06-21 19:41:56,2018-07-13,1.0,a35c14eaa384acf7aabbbec3bd76fae6,...,45d1c56383e4e977087798ada78ac371,13425.0,piracicaba,SP,88301.0,itajai,SP,housewares,2018-06-15,20:18:12
77,874eef9a84653f570656bdb0f8519151,c59929869ce404450d73c9a018c3d3a8,delivered,2018-06-02 19:04:54,2018-06-02 19:15:17,2018-06-04 14:14:00,2018-06-12 19:36:30,2018-07-12,1.0,3d3c4219b64b1a968490e7bdfa43bf1f,...,4239137bc284d7ec558ad51001956b40,4205.0,sao paulo,SP,88301.0,itajai,SP,housewares,2018-06-02,19:04:54
80,296752c5e35a99dd06b9100ab400c8eb,40fa5cb283fa23003378da712ee22419,delivered,2018-06-23 12:19:45,2018-06-25 13:17:33,2018-06-26 14:03:00,2018-07-04 00:48:49,2018-08-06,1.0,3d3c4219b64b1a968490e7bdfa43bf1f,...,d8b877fb7950380c8d5d3093e334fcb7,35164.0,ipatinga,MG,88301.0,itajai,SP,housewares,2018-06-23,12:19:45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121896,cd38eeff6b0ebffe1200329846d83987,209cd46e6fbad8005f5a20c86a4db93c,delivered,2018-08-20 08:02:20,2018-08-20 14:11:06,2018-08-20 16:11:00,2018-08-23 19:12:52,2018-08-29,1.0,c3798d484fb730f0b5c23af0d5361595,...,d42b37e7396133bc59162e1cfea39f2f,14775.0,jaborandi,SP,7152.0,guarulhos,SP,cine_photo,2018-08-20,08:02:20
121988,493ba21a937e956c5e0e4e26c9f1b7f9,408d74d06be5d5140bb933f248de1342,delivered,2018-07-15 09:56:32,2018-07-16 13:30:36,2018-07-27 07:26:00,2018-08-02 18:06:41,2018-07-26,1.0,82d7b276f49e72ffce78d10b20518808,...,8fc8e0f5ea35f5b26e2e54f23aa2b272,4851.0,sao paulo,SP,5201.0,sao paulo,SP,books_imported,2018-07-15,09:56:32
121989,4a7cf245701068d38d441791b735e4bd,1d3b24ba06f2e3fa4bfa74fd70d2310f,delivered,2018-05-07 20:36:47,2018-05-07 20:51:50,2018-05-08 15:04:00,2018-05-09 22:38:53,2018-05-17,1.0,82d7b276f49e72ffce78d10b20518808,...,8f257a3fa12c0873312984301ef06d05,18047.0,sorocaba,SP,5201.0,sao paulo,SP,books_imported,2018-05-07,20:36:47
121990,33f8dfc5a51063c31d7b12e9d43a45fc,8f7f4871646eb2fa410cc2d50b861317,delivered,2018-04-28 14:32:54,2018-04-28 14:53:18,2018-05-02 15:25:00,2018-05-03 23:51:56,2018-05-14,1.0,82d7b276f49e72ffce78d10b20518808,...,7da3054afe371affc14d26cc2cc04358,3574.0,sao paulo,SP,5201.0,sao paulo,SP,books_imported,2018-04-28,14:32:54


In [24]:
# Number of rows in unique_orders (one row per order_id) for each
# customer_unique_id, most frequent first.
unique_orders['customer_unique_id'].value_counts()

9c08d6f85c7fcec4b08ead25590c0af7    3
6a9e15d6fa8ce1cabf193c21aa577f64    3
cfa69922f9968e0e6271647abda09b09    3
08e5b38d7948d37fbb2a59fc5e175ab1    3
d132b863416f85f2abb1a988ca05dd12    2
                                   ..
2c87693b95677a97ac5a6e396e717b69    1
c1a96eec1e40c53e2b91741abc83ddb9    1
62e32e0b239b36c93427f7910eb6a173    1
af58c69736e0aa643594632082d00870    1
d370a413e1f40d4798d3b33f16896a04    1
Name: customer_unique_id, Length: 9446, dtype: int64

In [25]:
# Turn the per-customer order counts into a two-column frame:
# 'unique_values' holds the customer_unique_id, 'counts' the number of orders.
df = (
    unique_orders['customer_unique_id']
    .value_counts()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

In [26]:
# Display the per-customer order-count table built in the previous cell.
df

Unnamed: 0,unique_values,counts
0,9c08d6f85c7fcec4b08ead25590c0af7,3
1,6a9e15d6fa8ce1cabf193c21aa577f64,3
2,cfa69922f9968e0e6271647abda09b09,3
3,08e5b38d7948d37fbb2a59fc5e175ab1,3
4,d132b863416f85f2abb1a988ca05dd12,2
...,...,...
9441,2c87693b95677a97ac5a6e396e717b69,1
9442,c1a96eec1e40c53e2b91741abc83ddb9,1
9443,62e32e0b239b36c93427f7910eb6a173,1
9444,af58c69736e0aa643594632082d00870,1


In [35]:
# Customers with more than one order, i.e. repeat customers.
df1 = df[df['counts'] > 1]

In [36]:
# Last rows of the repeat-customer table; df is sorted by count descending, so
# these are the repeat customers with the fewest orders.
df1.tail()

Unnamed: 0,unique_values,counts
96,3a4acf4c4c97cc25075d29f8d29f405b,2
97,78dc91ac9a6fece2e327f74e96cac0b0,2
98,b03155da610163a1626f1b99d4787774,2
99,683459cab0129375a071efb00da7c191,2
100,03ef7f03905227e15c7b6c860554b3cb,2


In [37]:
# Total number of orders (rows of unique_orders) that belong to customers with
# more than one order.
df1['counts'].sum()

206

In [40]:
# Plain Python list of the customer_unique_id values that have more than one
# order; printed in full for inspection.
repeat_list = list(df1['unique_values'])
print(repeat_list)

['9c08d6f85c7fcec4b08ead25590c0af7', '6a9e15d6fa8ce1cabf193c21aa577f64', 'cfa69922f9968e0e6271647abda09b09', '08e5b38d7948d37fbb2a59fc5e175ab1', 'd132b863416f85f2abb1a988ca05dd12', '65571628698194dbda526c22f5ed2317', '1d39a7704cfb79b7cd820bc83e3de161', '02e9109b7e0a985108b43e573b6afb23', '08d8ad84b0088cbeae77da4ff3817479', '7305430719d715992b00be82af4a6aa8', 'a1874c5550d2f0bc14cc122164603713', '013ef03e0f3f408dd9bf555e4edcdc0a', 'c50794dfc62b62a84f72475abf38b4e3', 'dd9b235dbe7d48c82caca4da097a2240', '1772c0fbee58b154da028487de51b74c', '83915016a26b0ca42e3caf1070664d1f', 'a900ff99a192cbaf6ae8e867a95117a3', 'fc24db02becd484accefaa5af59c18b1', '4f4943d1a8ac2c4a65d4fba25ee8c0bb', 'e6f7ddad5d0159e391e27db652c17246', '415b073d153528fa1ed35a42e39945da', '6f5c52ea47e32b73958b0ac0f3c34e88', '31d332d20895c5c0baefe55207a0fead', '84f30d2fd3937449dc7e0c835c281961', '7ad04c71bfca958e6f2ec44bce34e2da', '4badd97df3e4ad6d64508b5c25d4dca9', 'f6dc4789e4669873c35f8f92fdbed89f', '28aea3e12549a3611bbd8d4e33

In [41]:
# Number of repeat customers collected in repeat_list.
len(repeat_list)

101

In [31]:
# Number of distinct customer_unique_id values in `data`. dropna=False makes a
# missing id count as one value, which matches what len(...unique()) reports.
data['customer_unique_id'].nunique(dropna=False)

9446

In [42]:
# Flag every row whose customer is in repeat_list: 1 for a repeat customer,
# 0 otherwise. The flag is written into `data` as the column "repeat?".
_is_repeat_customer = data["customer_unique_id"].isin(repeat_list)
data["repeat?"] = np.where(_is_repeat_customer, 1, 0)


In [44]:
# Spot-check the last rows of `data`, including the new "repeat?" column.
data.tail()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,...,customer_zip_code_prefix,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,product_category_name_english,order_date,order_time,repeat?
121896,cd38eeff6b0ebffe1200329846d83987,209cd46e6fbad8005f5a20c86a4db93c,delivered,2018-08-20 08:02:20,2018-08-20 14:11:06,2018-08-20 16:11:00,2018-08-23 19:12:52,2018-08-29,1.0,c3798d484fb730f0b5c23af0d5361595,...,14775.0,jaborandi,SP,7152.0,guarulhos,SP,cine_photo,2018-08-20,08:02:20,0
121988,493ba21a937e956c5e0e4e26c9f1b7f9,408d74d06be5d5140bb933f248de1342,delivered,2018-07-15 09:56:32,2018-07-16 13:30:36,2018-07-27 07:26:00,2018-08-02 18:06:41,2018-07-26,1.0,82d7b276f49e72ffce78d10b20518808,...,4851.0,sao paulo,SP,5201.0,sao paulo,SP,books_imported,2018-07-15,09:56:32,0
121989,4a7cf245701068d38d441791b735e4bd,1d3b24ba06f2e3fa4bfa74fd70d2310f,delivered,2018-05-07 20:36:47,2018-05-07 20:51:50,2018-05-08 15:04:00,2018-05-09 22:38:53,2018-05-17,1.0,82d7b276f49e72ffce78d10b20518808,...,18047.0,sorocaba,SP,5201.0,sao paulo,SP,books_imported,2018-05-07,20:36:47,0
121990,33f8dfc5a51063c31d7b12e9d43a45fc,8f7f4871646eb2fa410cc2d50b861317,delivered,2018-04-28 14:32:54,2018-04-28 14:53:18,2018-05-02 15:25:00,2018-05-03 23:51:56,2018-05-14,1.0,82d7b276f49e72ffce78d10b20518808,...,3574.0,sao paulo,SP,5201.0,sao paulo,SP,books_imported,2018-04-28,14:32:54,0
121999,73d60420cd1a179b2d8887d538efe4c2,65d9213dfa004c17dc126503e106e4a3,delivered,2018-08-07 10:52:58,2018-08-07 11:05:18,2018-08-08 08:31:00,2018-08-22 17:02:23,2018-09-04,1.0,078b7149a32b479d3cbf1649fea0172c,...,65700.0,bacabal,MA,22745.0,rio de janeiro,RJ,books_imported,2018-08-07,10:52:58,0


In [33]:
# Row counts per flag value (0 / 1). These count rows of `data`, not distinct
# customers or orders.
data['repeat?'].value_counts()

0    11475
1      268
Name: repeat?, dtype: int64

In [34]:
# Show the column labels again, now including order_date, order_time and
# "repeat?" added by the earlier cells.
data.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'review_id', 'review_score',
       'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'customer_unique_id', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'product_category_name_english', 'order_date',
       'order_time', 'repeat?'],
      