In [1]:
import warnings
# Ignore every warning raised from here on (this also hides deprecation notices).
warnings.simplefilter("ignore")

In [2]:
import pandas as pd
import numpy as np
import duckdb
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load data

In [4]:
# Customers table (path is relative to the working directory). Later cells read
# customer_id, customer_unique_id, customer_state and customer_zip_code_prefix from it.
df_customer = pd.read_csv('./dataset/olist_customers_dataset.csv')

In [5]:
# Geolocation table, loaded from the CSV of the same name.
df_geo = pd.read_csv('./dataset/olist_geolocation_dataset.csv')

In [6]:
# Order items table. Later cells join it on order_id and sum its price column.
df_order_item = pd.read_csv('./dataset/olist_order_items_dataset.csv')

In [7]:
# Order payments table, loaded from the CSV of the same name.
df_order_payment = pd.read_csv('./dataset/olist_order_payments_dataset.csv')

In [8]:
# Order reviews table, loaded from the CSV of the same name.
df_order_review = pd.read_csv('./dataset/olist_order_reviews_dataset.csv')

In [9]:
# Orders table. Later cells use order_id, customer_id, order_status and
# order_purchase_timestamp from it.
df_order = pd.read_csv('./dataset/olist_orders_dataset.csv')

In [10]:
# Products table, loaded from the CSV of the same name.
df_product = pd.read_csv('./dataset/olist_products_dataset.csv')

In [11]:
# Sellers table, loaded from the CSV of the same name.
df_seller = pd.read_csv('./dataset/olist_sellers_dataset.csv')

In [12]:
# Product category name translation table.
# NOTE(review): presumably maps original category names to translated ones, judging by the file name - confirm.
df_cat_name = pd.read_csv('./dataset/product_category_name_translation.csv')

# 2. Feature engineering

1. customer_unique_id

In [69]:
# Feature 1: the distinct customer_unique_id values found in df_customer.
# The SQL refers to the pandas DataFrame `df_customer` by its variable name.
# The join cell further down uses this frame as its left-most table.
df_customer_unique_id = duckdb.query("""
select distinct customer_unique_id
from df_customer
""").to_df()

In [71]:
df_customer_unique_id

Unnamed: 0,customer_unique_id
0,345ecd01c38d18a9036ed96c73b8d066
1,9afe194fb833f79e300e37e580171f22
2,7f3a72e8f988c6e735ba118d54f47458
3,a8654e2af5da6bb72f52c22b164855e1
4,07d190f123147d9e89d4b922543d7948
...,...
96091,a662a415dbbb5a154b3d38aa4fc86e57
96092,592134387e7f13de6a9e38eb7d412b3d
96093,3c8d75c7b0acfda04e92d3dca43ccee7
96094,fb354969e06f2093c0083cbfbb91864e


2. Number of sub-accounts

In [73]:
# Feature 2: no_account = number of non-null customer_id rows recorded for each
# customer_unique_id, sorted so the customers with the most accounts come first.
df_customer_no_acc = duckdb.query("""
select customer_unique_id, count(customer_id) as no_account
from df_customer
group by customer_unique_id
order by no_account desc
""").to_df()

In [77]:
df_customer_no_acc

Unnamed: 0,customer_unique_id,no_account
0,8d50f5eadf50201ccdcedfb9e2ac8455,17
1,3e43e6105506432c953e165fb2acf44c,9
2,1b6c7548a2a1f9037c1fd3ddfed95f33,7
3,6469f99c1f9dfae7733b25662e7f1782,7
4,ca77025e7201e3b30c44b472ff346268,7
...,...,...
96091,b80c1a7b76e8104245be223374df5fd9,1
96092,3011394a1b8d1b04e425dd5dc77cbaaf,1
96093,046470763123d3d6364f89095b4e47ab,1
96094,15637b62dfa4c5a9df846b22beef0994,1


3. Customer address (region flags)

In [102]:
# Feature 3: one row per customer_unique_id with five 0/1 region flags
# (north, northeast, central_west, southeast, south) derived from customer_state.
# A customer_unique_id can appear on several df_customer rows; the inner query
# numbers them by ascending customer_zip_code_prefix and only rn = 1 is kept, so
# the address with the smallest zip prefix decides the region.
# A customer_state outside all five lists yields 0 in every flag.
df_customer_address = duckdb.query("""
select customer_unique_id
, case when customer_state in ('AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO') then 1 else 0 end north
, case when customer_state in ('AL', 'BA', 'CE', 'MA', 'PB', 'PE', 'PI', 'RN', 'SE') then 1 else 0 end northeast
, case when customer_state in ('DF', 'GO', 'MT', 'MS') then 1 else 0 end central_west
, case when customer_state in ('ES', 'MG', 'RJ', 'SP') then 1 else 0 end southeast
, case when customer_state in ('PR', 'RS', 'SC') then 1 else 0 end south
from
    (
    select customer_unique_id, customer_state, customer_city, customer_zip_code_prefix
    , row_number() over(partition by customer_unique_id order by customer_zip_code_prefix) rn
    from df_customer
    ) a
where rn = 1
""").to_df()

In [104]:
df_customer_address

Unnamed: 0,customer_unique_id,north,northeast,central_west,southeast,south
0,0004bd2a26a76fe21f786e4fbd80607f,0,0,0,1,0
1,00090324bbad0e9342388303bb71ba0a,0,0,0,1,0
2,000d460961d6dbfa3ec6c9f5805769e1,0,0,0,1,0
3,0028a7d8db7b0247652509358ad8d755,0,0,0,1,0
4,002d71b244beb91ca7030b15ab526446,0,0,0,1,0
...,...,...,...,...,...,...
96091,ffddf4e5baa1623f69d3c5e0d775e1af,0,0,0,1,0
96092,ffdffea8a4b04f14987bfef2a2287fa9,0,0,0,1,0
96093,ffe9be10b9a58c5464d833e8b1b2c632,0,0,0,1,0
96094,fff5eb4918b2bf4b2da476788d42051c,0,1,0,0,0


4. Frequency

In [166]:
# Feature 4: purchase frequency per customer_unique_id.
# Orders with status 'unavailable' or 'canceled', or without a purchase
# timestamp, are excluded.
#   no_order            - number of distinct orders
#   year_start_purchase - REFERENCE_YEAR minus the year of the first purchase
#   no_order_per_year   - no_order divided by year_start_purchase
# REFERENCE_YEAR replaces the literal 2019 that was repeated inside the SQL.
# It must be later than every purchase year, otherwise the divisor is <= 0.
REFERENCE_YEAR = 2019
# The result is stored (like features 1-3) instead of only being displayed, so
# that it can be joined to the other customer features later.
# REFERENCE_YEAR is a trusted integer constant, so interpolating it is safe.
df_customer_frequency = duckdb.query(f"""
select b.customer_unique_id
, count(distinct a.order_id) no_order
, {REFERENCE_YEAR} - min(year(cast(a.order_purchase_timestamp as datetime))) year_start_purchase
, count(distinct a.order_id)/
({REFERENCE_YEAR} - min(year(cast(a.order_purchase_timestamp as datetime)))) no_order_per_year
from df_order a
left join df_customer b
on a.customer_id = b.customer_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
group by b.customer_unique_id
""").to_df()
df_customer_frequency

Unnamed: 0,customer_unique_id,no_order,year_start_purchase,no_order_per_year
0,345ecd01c38d18a9036ed96c73b8d066,1,1,1.0
1,9afe194fb833f79e300e37e580171f22,1,1,1.0
2,7f3a72e8f988c6e735ba118d54f47458,1,2,0.5
3,07d190f123147d9e89d4b922543d7948,1,1,1.0
4,c9158d089637ab443c78984d20da7fc0,2,1,2.0
...,...,...,...,...
94985,d385d86ff15a721b213924f5f6b8316f,1,1,1.0
94986,919decef1f13a9b93ed6cca4af6a8c2f,1,2,0.5
94987,591373b45f940c1da643522fa4300753,1,2,0.5
94988,6ff6f2913b97c58e8c0a4c96d8de6158,1,1,1.0


In [106]:
# (intentionally empty) This cell previously held a stray "**", which is not valid Python and raised a SyntaxError on every run.

# 3. Join data

In [108]:
# Assemble the customer-level feature table: one row per customer_unique_id with
# the account count (no_account) and the five region flags.
# The result is kept in a variable (previously it was only displayed) so that
# downstream cells can reuse it.
df_customer_features = duckdb.query("""
select a.customer_unique_id
, b.no_account
, c.north, c.northeast, c.central_west, c.southeast, c.south
from df_customer_unique_id a
left join df_customer_no_acc b
on a.customer_unique_id = b.customer_unique_id
left join df_customer_address c
on a.customer_unique_id = c.customer_unique_id
""").to_df()
# Both joined tables hold one row per customer_unique_id (a GROUP BY and an
# rn = 1 filter respectively), so the left joins must not change the row count.
assert len(df_customer_features) == len(df_customer_unique_id)
df_customer_features

Unnamed: 0,customer_unique_id,no_account,north,northeast,central_west,southeast,south
0,a8654e2af5da6bb72f52c22b164855e1,1,0,0,0,1,0
1,07d190f123147d9e89d4b922543d7948,1,0,1,0,0,0
2,c9158d089637ab443c78984d20da7fc0,2,0,0,0,1,0
3,8d46223c91cbeb93e0930ca8bd8ffca2,1,0,0,0,0,1
4,d33eeadf54cb883e79be640f38c32cdc,1,0,0,0,1,0
...,...,...,...,...,...,...,...
96091,71d5b68149301ba41f0a5445f990e742,1,0,0,0,1,0
96092,bc54474f8433c845bc7146b9b14e584e,1,0,0,1,0,0
96093,c9f9c202fad4743c180a0c0f6a5446c5,1,0,0,0,1,0
96094,9fe919ec86e46a294e9deb124a2be310,1,0,0,0,0,1


In [195]:
# Frequency and spend per customer_unique_id.
# Orders with status 'unavailable' or 'canceled', or without a purchase
# timestamp, are excluded. df_order_item has one row per order line, so
# sum(price) adds up all lines while count(distinct order_id) still counts
# each order once.
#   no_order            - number of distinct orders
#   year_start_purchase - REFERENCE_YEAR minus the year of the first purchase
#   no_order_per_year   - no_order / year_start_purchase
#   amt                 - total item price (NULL when an order has no item rows)
#   ticket_size         - amt / no_order
#   amt_per_year        - amt / year_start_purchase
# REFERENCE_YEAR replaces the literal 2019 that appeared three times in the SQL.
# It must be later than every purchase year, otherwise the divisor is <= 0.
REFERENCE_YEAR = 2019
# The result is stored instead of only displayed so it can be joined to the
# other customer features later.
# REFERENCE_YEAR is a trusted integer constant, so interpolating it is safe.
df_customer_spend = duckdb.query(f"""
select b.customer_unique_id
, count(distinct a.order_id) no_order
, {REFERENCE_YEAR} - min(year(cast(a.order_purchase_timestamp as datetime))) year_start_purchase
, count(distinct a.order_id)/
({REFERENCE_YEAR} - min(year(cast(a.order_purchase_timestamp as datetime)))) no_order_per_year
, sum(c.price) amt
, sum(c.price)/count(distinct a.order_id) ticket_size
, sum(c.price)/
({REFERENCE_YEAR} - min(year(cast(a.order_purchase_timestamp as datetime)))) amt_per_year
from df_order a
left join df_customer b
on a.customer_id = b.customer_id
left join df_order_item c
on a.order_id = c.order_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
group by b.customer_unique_id
""").to_df()
df_customer_spend

Unnamed: 0,customer_unique_id,no_order,year_start_purchase,no_order_per_year,amt,ticket_size,amt_per_year
0,2a7745e1ed516b289ed9b29c7d0539a5,1,2,0.5,25.30,25.30,12.650
1,918dc87cd72cd9f6ed4bd442ed785235,1,2,0.5,99.00,99.00,49.500
2,bf4862777db128507e9efcc789215e9b,1,2,0.5,49.00,49.00,24.500
3,7556f182460418cf30957e6ce377c674,1,1,1.0,42.99,42.99,42.990
4,fd2d5fdb84e65fa6b54b98b0e2df5645,1,2,0.5,44.90,44.90,22.450
...,...,...,...,...,...,...,...
94985,6bffc0c27ac43787c33fd71822c6c58a,1,1,1.0,550.00,550.00,550.000
94986,93b298f93c123a14108dd43838ccbc8c,1,2,0.5,139.99,139.99,69.995
94987,2302114645331312215fffbe07aab78a,1,1,1.0,62.00,62.00,62.000
94988,bf8c6803e3c50a18790e976401dd45a0,1,2,0.5,127.90,127.90,63.950


In [None]:
# Inspect the order rows (plus the matching customer_unique_id) that feed the
# per-customer aggregates.
# The draft of this cell was raw SQL pasted as Python with the typo "seet".
# It also combined `select *` with `group by`, which is rejected because the
# selected columns are neither grouped nor aggregated, so the grouping is
# dropped here.
# Only b.customer_unique_id is taken from df_customer, which avoids a duplicate
# customer_id column in the result.
duckdb.query("""
select a.*, b.customer_unique_id
from df_order a
left join df_customer b
on a.customer_id = b.customer_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
""").to_df().head()

In [None]:
# Show df_order_payment as-is: every column and row, with no filtering or
# aggregation, round-tripped through DuckDB into a new DataFrame.
duckdb.query("""
select *
from df_order_payment
""").to_df()