In [None]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas chained-assignment / SettingWithCopy warnings raised by later cells.
# Consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")

In [None]:
import pandas as pd
import numpy as np
import duckdb
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load data

olist_customers_dataset

In [None]:
# Customer table. Later cells read customer_id, customer_unique_id, customer_state,
# customer_city and customer_zip_code_prefix from it.
df_customer = pd.read_csv('./dataset/olist_customers_dataset.csv')

In [None]:
df_customer.head()

In [None]:
df_customer.info()

olist_geolocation_dataset

In [None]:
# Geolocation table. Only previewed below; no later cell in the visible part of this
# notebook reads it.
df_geo = pd.read_csv('./dataset/olist_geolocation_dataset.csv')

In [None]:
df_geo.head()

In [None]:
df_geo.info()

olist_order_items_dataset

In [None]:
# Order line items. The monetary feature query joins on order_id and sums price.
df_order_item = pd.read_csv('./dataset/olist_order_items_dataset.csv')

In [None]:
df_order_item.head()

In [None]:
df_order_item.info()

In [None]:
# Convert shipping_limit_date to pandas datetimes (overwrites the column in place).
df_order_item['shipping_limit_date'] = pd.to_datetime(df_order_item['shipping_limit_date'])

olist_order_payments_dataset

In [None]:
# Order payments table. Only previewed below; no later cell in the visible part of this
# notebook reads it.
df_order_payment = pd.read_csv('./dataset/olist_order_payments_dataset.csv')

In [None]:
df_order_payment.head()

In [None]:
df_order_payment.info()

olist_order_reviews_dataset

In [None]:
# Order reviews table. Its two date columns are parsed below; none of the feature
# queries in the visible part of this notebook read it.
df_order_review = pd.read_csv('./dataset/olist_order_reviews_dataset.csv')

In [None]:
df_order_review.head()

In [None]:
df_order_review.info()

In [None]:
# Convert both review date columns to pandas datetimes, overwriting each column.
for review_date_col in ('review_creation_date', 'review_answer_timestamp'):
    df_order_review[review_date_col] = pd.to_datetime(df_order_review[review_date_col])

olist_orders_dataset

In [None]:
# Orders table. The recency / frequency / monetary queries read order_id, customer_id,
# order_status and order_purchase_timestamp from it.
df_order = pd.read_csv('./dataset/olist_orders_dataset.csv')

In [None]:
df_order.head()

In [None]:
df_order.info()

In [None]:
# Convert every order lifecycle timestamp to pandas datetimes, overwriting each column.
order_date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
for order_date_col in order_date_cols:
    df_order[order_date_col] = pd.to_datetime(df_order[order_date_col])

olist_products_dataset

In [None]:
# Products table. Only previewed below; no later cell in the visible part of this
# notebook reads it.
df_product = pd.read_csv('./dataset/olist_products_dataset.csv')

In [None]:
df_product.head()

In [None]:
df_product.info()

olist_sellers_dataset

In [None]:
# Sellers table. Only previewed below; no later cell in the visible part of this
# notebook reads it.
df_seller = pd.read_csv('./dataset/olist_sellers_dataset.csv')

In [None]:
df_seller.head()

In [None]:
df_seller.info()

product_category_name_translation

In [None]:
# Product category name translation table. Only previewed below; no later cell in the
# visible part of this notebook reads it.
df_cat_name = pd.read_csv('./dataset/product_category_name_translation.csv')

In [None]:
df_cat_name.head()

In [None]:
df_cat_name.info()

olist_age

In [None]:
# Customer age table. The age feature query reads customer_id and age from it.
df_age = pd.read_csv('./dataset/olist_age.csv')

In [None]:
df_age.head()

In [None]:
df_age.info()

# 2. Feature engineering

1. customer_id

In [None]:
# One row per distinct customer_unique_id. This frame is the left-hand base table of
# the final feature join.
# The SQL refers to the pandas DataFrame df_customer by its Python variable name.
df_customer_unique_id = duckdb.query("""
select distinct customer_unique_id
from df_customer
""").to_df()

In [None]:
df_customer_unique_id

2. number of sub-accounts

In [None]:
# Number of customer_id rows per customer_unique_id (no_account), sorted with the
# largest counts first.
df_customer_no_acc = duckdb.query("""
select customer_unique_id, count(customer_id) as no_account
from df_customer
group by customer_unique_id
order by no_account desc
""").to_df()

In [None]:
df_customer_no_acc

3. age

In [None]:
# Age per customer_unique_id: each df_age row is mapped to its unique id through
# customer_id, and the minimum age is kept when several rows map to the same unique id.
# df_age is the left side of the join, so df_age rows without a matching customer_id
# end up in a group whose customer_unique_id is NULL.
df_customer_age = duckdb.query("""
select b.customer_unique_id
, min(a.age) age
from df_age a
left join df_customer b
on a.customer_id = b.customer_id
group by b.customer_unique_id
""").to_df()

In [None]:
df_customer_age

4. customer address

In [None]:
# One-hot region flags (north, northeast, central_west, southeast, south) derived from
# customer_state, one row per customer_unique_id.
# When a customer_unique_id has several rows in df_customer, the row with the smallest
# customer_zip_code_prefix is the one kept (rn = 1).
# A state that appears in none of the CASE lists gets 0 in all five columns.
df_customer_address = duckdb.query("""
select customer_unique_id
, case when customer_state in ('AC', 'AP', 'AM', 'PA', 'RO', 'RR', 'TO') then 1 else 0 end north
, case when customer_state in ('AL', 'BA', 'CE', 'MA', 'PB', 'PE', 'PI', 'RN', 'SE') then 1 else 0 end northeast
, case when customer_state in ('DF', 'GO', 'MT', 'MS') then 1 else 0 end central_west
, case when customer_state in ('ES', 'MG', 'RJ', 'SP') then 1 else 0 end southeast
, case when customer_state in ('PR', 'RS', 'SC') then 1 else 0 end south
from
    (
    select customer_unique_id, customer_state, customer_city, customer_zip_code_prefix
    , row_number() over(partition by customer_unique_id order by customer_zip_code_prefix) rn
    from df_customer
    ) a
where rn = 1
""").to_df()

In [None]:
df_customer_address

5. recency

In [None]:
# Recency per customer_unique_id:
#   lastest_txn        - most recent purchase timestamp
#   day_after_last_pur - days from the most recent purchase to the fixed reference
#                        date 2018-11-01 (the minimum day difference over the
#                        customer's orders)
# Orders with status 'unavailable' or 'canceled', and orders without a purchase
# timestamp, are excluded.
df_customer_recency = duckdb.query("""
select b.customer_unique_id
, max(cast(order_purchase_timestamp as datetime)) lastest_txn
, min(datediff('day', cast(order_purchase_timestamp as datetime), cast('2018-11-01' as datetime))) as day_after_last_pur
from df_order a
left join df_customer b on a.customer_id = b.customer_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
group by b.customer_unique_id
""").to_df()

In [None]:
df_customer_recency

6. frequency

In [None]:
# Latest purchase timestamp across all orders, shown as the cell output.
# NOTE(review): presumably the basis for the 2018-11-01 reference date hard-coded in the
# neighbouring queries — confirm.
duckdb.query("""
select max(cast(order_purchase_timestamp as datetime)) max_date from df_order
""").to_df()

In [None]:
# Frequency per customer_unique_id:
#   no_order          - distinct orders (excluding 'unavailable' / 'canceled' orders and
#                       orders without a purchase timestamp)
#   account_age_month - datediff in months from the customer's FIRST purchase to the
#                       fixed reference date 2018-11-01
#   order_per_month   - no_order / account_age_month
# The rate divides by the account age (max datediff), consistent with account_age_month
# here and with amount_per_month in the monetary query. The previous min() divided by
# the months since the LAST purchase, which inflated the rate for long-standing customers.
# The numerator is cast to double so the ratio cannot be evaluated as integer division.
df_customer_frequency = duckdb.query("""
select b.customer_unique_id
, count(distinct order_id) no_order
, max(datediff('month',cast(order_purchase_timestamp as datetime),cast('2018-11-01' as datetime))) account_age_month
, cast(count(distinct order_id) as double)/
max(datediff('month',cast(order_purchase_timestamp as datetime),cast('2018-11-01' as datetime))) order_per_month
from df_order a
left join df_customer b
on a.customer_id = b.customer_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
group by b.customer_unique_id
""").to_df()

In [None]:
df_customer_frequency

7. monetary

In [None]:
# Monetary value per customer_unique_id:
#   no_order          - distinct orders
#   amount            - sum of item prices over the customer's orders
#   account_age_month - datediff in months from the first purchase to 2018-11-01
#   amount_per_month  - amount / account_age_month
# Orders with status 'unavailable' or 'canceled', and orders without a purchase
# timestamp, are excluded. The join to df_order_item yields one row per item, which is
# why orders are counted with count(distinct ...).
df_customer_monetary = duckdb.query("""
select b.customer_unique_id
, count(distinct a.order_id) no_order
, sum(c.price) amount             
, max(datediff('month',cast(order_purchase_timestamp as datetime),cast('2018-11-01' as datetime))) account_age_month         
-- , sum(c.price)/count(distinct a.order_id) amount_per_order
, sum(c.price)/
max(datediff('month',cast(order_purchase_timestamp as datetime),cast('2018-11-01' as datetime))) amount_per_month
from df_order a
left join df_customer b
on a.customer_id = b.customer_id
left join df_order_item c
on a.order_id = c.order_id
where a.order_status not in ('unavailable','canceled')
and a.order_purchase_timestamp is not null
group by b.customer_unique_id
""").to_df()

In [None]:
df_customer_monetary

# 3. Join data

In [None]:
# Assemble the modelling table: start from the distinct customer_unique_id list and
# left-join each feature frame on customer_unique_id. A customer missing from a feature
# frame keeps its row and gets NULL in that frame's columns.
df = duckdb.query("""
select a.customer_unique_id
, b.no_account
, c.age
, d.north, d.northeast, d.central_west, d.southeast, d.south
, e.day_after_last_pur
, f.order_per_month
-- , g.amount_per_order
, g.amount_per_month
from df_customer_unique_id a
left join df_customer_no_acc b
on a.customer_unique_id = b.customer_unique_id
left join df_customer_age c
on a.customer_unique_id = c.customer_unique_id
left join df_customer_address d
on a.customer_unique_id = d.customer_unique_id
left join df_customer_recency e
on a.customer_unique_id = e.customer_unique_id
left join df_customer_frequency f
on a.customer_unique_id = f.customer_unique_id
left join df_customer_monetary g
on a.customer_unique_id = g.customer_unique_id
""").to_df()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# Customers absent from the recency / frequency / monetary frames come out of the left
# joins with missing values; replace those with 0.
# A single DataFrame-level fillna with a per-column mapping replaces the original
# df[col].fillna(0, inplace=True) calls. That chained in-place form acts on an
# intermediate Series, is deprecated in pandas 2.x, and no longer updates df under
# Copy-on-Write.
# NOTE(review): 0 in day_after_last_pur reads as "purchased on the reference date" —
# confirm this is the intended encoding for customers with no qualifying order.
df = df.fillna({
    'day_after_last_pur': 0,
    'order_per_month': 0,
    'amount_per_month': 0,
})

In [None]:
df.info()

In [None]:
# Drop every row that still has a missing value in any column. After the zero-fill of
# the three RFM columns, that can only come from no_account, age or the region flags.
df.dropna(inplace=True)

In [None]:
df

# 4. Exploratory data analysis (EDA)

list of columns by type

In [None]:
df.columns.tolist()

In [None]:
id_col = ['customer_unique_id']

In [None]:
cat_col = ['north','northeast','central_west','southeast','south']

In [None]:
num_col = ['no_account','age','day_after_last_pur','order_per_month','amount_per_month']
# num_col = ['no_account','age','day_after_last_pur','order_per_month','amount_per_order','amount_per_month']

categorical features

In [None]:
df[cat_col].describe()

numeric features

In [None]:
# For every numeric feature, draw a histogram (left) and a boxplot (right) side by side.
for column_name in num_col:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: 10-bin histogram of the raw values.
    ax1.hist(df[column_name], bins=10, edgecolor='black', alpha=0.7)
    ax1.set_title(f'Histogram of {column_name}')
    ax1.set_xlabel(column_name)
    ax1.set_ylabel('Frequency')
    ax1.grid(axis='y', alpha=0.75)

    # Right panel: boxplot of the same column. The previous if/else on CategoricalDtype
    # ran this identical call in both branches, so the check was removed.
    sns.boxplot(x=df[column_name], ax=ax2)
    ax2.set_title(f'Boxplot of {column_name} (No Hue)')
    ax2.set_xlabel(column_name)

    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)

drop unnecessary columns

In [None]:
# The identifier is not a feature; remove it before computing correlations.
# Rebinding with errors='ignore' keeps the cell re-runnable: the original in-place drop
# raised KeyError when executed a second time because the column was already gone.
df = df.drop(columns='customer_unique_id', errors='ignore')

correlation

In [None]:
# Pairwise correlation matrix of all remaining columns (pandas default method).
corr = df.corr()

In [None]:
# Heatmap of the correlation matrix with each coefficient printed in its cell.
fig, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler


normalization

In [None]:
# Min-max scaler with default settings; fitted and applied in the cells below.
scaler = MinMaxScaler()

In [None]:
# Keep only the frequency, monetary and recency features used for clustering.
# .copy() makes the result an independent frame, so the later `cluster` column is not
# written into a slice of the wider table (which can trigger SettingWithCopyWarning).
df = df[['order_per_month','amount_per_month','day_after_last_pur']].copy()

In [None]:
# Learn the per-column scaling parameters from the three selected features.
scaler.fit(df)

In [None]:
# Apply the fitted scaling; df_scaled is the input to DBSCAN below.
df_scaled = scaler.transform(df)

In [None]:
from sklearn.cluster import DBSCAN


In [None]:
# DBSCAN hyper-parameters; both should be tuned for the dataset.
#   eps         - maximum distance between two samples for them to count as neighbours
#   min_samples - neighbourhood size required for a point to seed a cluster
# NOTE(review): the input was min-max scaled, so eps = 0.5 is large relative to the
# feature range — verify it does not merge everything into a single cluster.
eps = 0.5
min_samples = 5

# Build the estimator and fit it on the scaled features (fit returns the estimator).
dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)

# One label per input row; -1 marks noise points.
labels = dbscan.labels_

# Count the distinct labels, leaving out the noise label when it occurs.
n_clusters_ = len(set(labels) - {-1})
print(f'Estimated number of clusters: {n_clusters_:d}')

In [None]:
print('Estimated number of clusters: %d' % n_clusters_)

In [None]:
labels

In [None]:
# Attach the DBSCAN label of each row as a new `cluster` column.
# assign() builds a new frame rather than setting a column on df, which at this point is
# a column subset of the wider table and can trigger SettingWithCopyWarning.
df = df.assign(cluster=labels)

In [None]:
df

In [None]:
# Mean of each feature within every cluster label (label -1 is DBSCAN noise).
df.groupby('cluster').mean()