In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
import pandas as pd
import numpy as np
import duckdb
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load data

In [3]:
df_customer = pd.read_csv('./dataset/olist_customers_dataset.csv')

In [4]:
df_geo = pd.read_csv('./dataset/olist_geolocation_dataset.csv')

In [5]:
df_order_item = pd.read_csv('./dataset/olist_order_items_dataset.csv')

In [6]:
df_order_payment = pd.read_csv('./dataset/olist_order_payments_dataset.csv')

In [7]:
df_order_review = pd.read_csv('./dataset/olist_order_reviews_dataset.csv')

In [8]:
df_order = pd.read_csv('./dataset/olist_orders_dataset.csv')

In [9]:
df_product = pd.read_csv('./dataset/olist_products_dataset.csv')

In [10]:
df_seller = pd.read_csv('./dataset/olist_sellers_dataset.csv')

In [11]:
df_cat_name = pd.read_csv('./dataset/product_category_name_translation.csv')

In [12]:
df_age = pd.read_csv('./dataset/olist_age.csv')

# 2. Check data quality

In [13]:
def empty_or_whitespace_strings(df):
    string_columns = df.select_dtypes(include=['object']).columns
    empty_whitespace_check = df[string_columns].applymap(lambda x: isinstance(x, str) and x.isspace())

    count_empty_whitespace = empty_whitespace_check.sum()
    rows_with_whitespace_indices = empty_whitespace_check.any(axis=1)
    list_row_with_whilespace_indices = df.index[rows_with_whitespace_indices].tolist()

    return count_empty_whitespace, list_row_with_whilespace_indices

olist_customers_dataset

In [14]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [15]:
df_customer.isna().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [16]:
for i in df_customer.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_customer[i].unique())
    print('Count unique value: ', df_customer[i].nunique())
    print('-'*10)

Columns name:  customer_id
Unique value:  ['06b8999e2fba1a1fbc88172c00ba8bc7' '18955e83d337fd6b2def6b18a428ac77'
 '4e7b3e00288586ebd08712fdd0374a03' ... '5e28dfe12db7fb50a4b2f691faecea5e'
 '56b18e2166679b8a959d72dd06da27f9' '274fa6071e5e17fe303b9748641082c8']
Count unique value:  99441
----------
Columns name:  customer_unique_id
Unique value:  ['861eff4711a542e4b93843c6dd7febb0' '290c77bc529b7ac935b93aa66c333dc3'
 '060e732b5b29e8181a18229c7b0b2b5e' ... 'e9f50caf99f032f0bf3c55141f019d99'
 '73c2643a0a458b49f58cea58833b192e' '84732c5050c01db9b23e19ba39899398']
Count unique value:  96096
----------
Columns name:  customer_zip_code_prefix
Unique value:  [14409  9790  1151 ...  5538 74980 99043]
Count unique value:  14994
----------
Columns name:  customer_city
Unique value:  ['franca' 'sao bernardo do campo' 'sao paulo' ... 'monte bonito'
 'sao rafael' 'eugenio de castro']
Count unique value:  4119
----------
Columns name:  customer_state
Unique value:  ['SP' 'SC' 'MG' 'PR' 'RJ' 'RS' 'PA' 

In [17]:
empty_or_whitespace_strings(df_customer)[0]

customer_id           0
customer_unique_id    0
customer_city         0
customer_state        0
dtype: int64

olist_geolocation_dataset

In [18]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [19]:
df_geo.isna().sum()

geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

In [20]:
for i in df_geo.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_geo[i].unique())
    print('Count unique value: ', df_geo[i].nunique())
    print('-'*10)

Columns name:  geolocation_zip_code_prefix
Unique value:  [ 1037  1046  1041 ... 99910 99920 99952]
Count unique value:  19015
----------
Columns name:  geolocation_lat
Unique value:  [-23.54562128 -23.54608113 -23.54612897 ... -28.06886363 -28.06863888
 -28.38893188]
Count unique value:  717360
----------
Columns name:  geolocation_lng
Unique value:  [-46.63929205 -46.6448203  -46.64295148 ... -52.01296438 -52.01070525
 -51.84687132]
Count unique value:  717613
----------
Columns name:  geolocation_city
Unique value:  ['sao paulo' 'são paulo' 'sao bernardo do campo' ... 'ciríaco' 'estação'
 'vila lângaro']
Count unique value:  8011
----------
Columns name:  geolocation_state
Unique value:  ['SP' 'RN' 'AC' 'RJ' 'ES' 'MG' 'BA' 'SE' 'PE' 'AL' 'PB' 'CE' 'PI' 'MA'
 'PA' 'AP' 'AM' 'RR' 'DF' 'GO' 'RO' 'TO' 'MT' 'MS' 'RS' 'PR' 'SC']
Count unique value:  27
----------


In [21]:
empty_or_whitespace_strings(df_geo)[0]

geolocation_city     0
geolocation_state    0
dtype: int64

olist_order_items_dataset

In [22]:
df_order_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB


In [23]:
df_order_item.isna().sum()

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

In [24]:
for i in df_order_item.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_order_item[i].unique())
    print('Count unique value: ', df_order_item[i].nunique())
    print('-'*10)

Columns name:  order_id
Unique value:  ['00010242fe8c5a6d1ba2dd792cb16214' '00018f77f2f0320c557190d7a144bdd3'
 '000229ec398224ef6ca0657da4fc703e' ... 'fffce4705a9662cd70adb13d4a31832d'
 'fffe18544ffabc95dfada21779c9644f' 'fffe41c64501cc87c801fd61db3f6244']
Count unique value:  98666
----------
Columns name:  order_item_id
Unique value:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
Count unique value:  21
----------
Columns name:  product_id
Unique value:  ['4244733e06e7ecb4970a6e2683c13e61' 'e5f2d52b802189ee658865ca93d83a8f'
 'c777355d18b72b67abbeef9df44fd0fd' ... 'dd469c03ad67e201bc2179ef077dcd48'
 'bbe7651fef80287a816ead73f065fc4b' '350688d9dc1e75ff97be326363655e01']
Count unique value:  32951
----------
Columns name:  seller_id
Unique value:  ['48436dade18ac8b2bce089ec2a041202' 'dd7ddc04e1b6c2c614352b383efe2d36'
 '5b51032eddd242adc84c38acab88f23d' ... '3fefda3299e6dfaea3466ef346a3571a'
 '80ceebb4ee9b31afb6c6a916a574a1e2' '3e35a8bb43569389d3cebef0ce820f69']
Count 

In [25]:
empty_or_whitespace_strings(df_order_item)[0]

order_id               0
product_id             0
seller_id              0
shipping_limit_date    0
dtype: int64

olist_order_payments_dataset

In [26]:
df_order_payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [27]:
df_order_payment.isna().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [28]:
for i in df_order_payment.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_order_payment[i].unique())
    print('Count unique value: ', df_order_payment[i].nunique())
    print('-'*10)

Columns name:  order_id
Unique value:  ['b81ef226f3fe1789b1e8b2acac839d17' 'a9810da82917af2d9aefd1278f1dcfa0'
 '25e8ea4e93396b6fa0d3dd708e76c1bd' ... '32609bbb3dd69b3c066a6860554a77bf'
 'b8b61059626efa996a60be9bb9320e10' '28bbae6599b09d39ca406b747b6632b1']
Count unique value:  99440
----------
Columns name:  payment_sequential
Unique value:  [ 1  2  4  5  3  8  6  7 10 11 17 19 27 12  9 15 13 14 16 25 22 26 29 28
 18 21 24 23 20]
Count unique value:  29
----------
Columns name:  payment_type
Unique value:  ['credit_card' 'boleto' 'voucher' 'debit_card' 'not_defined']
Count unique value:  5
----------
Columns name:  payment_installments
Unique value:  [ 8  1  2  3  6  5  4 10  7 12  9 13 15 24 11 18 14 20 21 17 22  0 16 23]
Count unique value:  24
----------
Columns name:  payment_value
Unique value:  [ 99.33  24.39  65.71 ... 205.71 100.55 363.31]
Count unique value:  29077
----------


In [29]:
empty_or_whitespace_strings(df_order_payment)[0]

order_id        0
payment_type    0
dtype: int64

olist_order_reviews_dataset

In [30]:
df_order_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [31]:
df_order_review.isna().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

In [32]:
for i in df_order_review.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_order_review[i].unique())
    print('Count unique value: ', df_order_review[i].nunique())
    print('-'*10)

Columns name:  review_id
Unique value:  ['7bc2406110b926393aa56f80a40eba40' '80e641a11e56f04c1ad469d5645fdfde'
 '228ce5500dc1d8e020d8d1322874b6f0' ... 'b3de70c89b1510c4cd3d0649fd302472'
 '1adeb9d84d72fe4e337617733eb85149' 'efe49f1d6f951dd88b51e6ccd4cc548f']
Count unique value:  98410
----------
Columns name:  order_id
Unique value:  ['73fc7af87114b39712e6da79b0a377eb' 'a548910a1c6147796b98fdf73dbeba33'
 'f9e4b658b201a9f2ecdecbb34bed034b' ... '55d4004744368f5571d1f590031933e4'
 '7725825d039fc1f0ceb7635e3f7d9206' '90531360ecb1eec2a1fbb265a0db0508']
Count unique value:  98673
----------
Columns name:  review_score
Unique value:  [4 5 1 3 2]
Count unique value:  5
----------
Columns name:  review_comment_title
Unique value:  [nan 'recomendo' 'Super recomendo' ... 'Não foi entregue o pedido'
 'OTIMA EMBALAGEM' 'Foto enganosa ']
Count unique value:  4527
----------
Columns name:  review_comment_message
Unique value:  [nan 'Recebi bem antes do prazo estipulado.'
 'Parabéns lojas lannister ado

In [33]:
empty_or_whitespace_strings(df_order_review)[0]

review_id                   0
order_id                    0
review_comment_title        2
review_comment_message     27
review_creation_date        0
review_answer_timestamp     0
dtype: int64

olist_orders_dataset

In [34]:
df_order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB


In [35]:
df_order.isna().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [36]:
for i in df_order.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_order[i].unique())
    print('Count unique value: ', df_order[i].nunique())
    print('-'*10)

Columns name:  order_id
Unique value:  ['e481f51cbdc54678b7cc49136f2d6af7' '53cdb2fc8bc7dce0b6741e2150273451'
 '47770eb9100c2d0c44946d9cf07ec65d' ... '83c1379a015df1e13d02aae0204711ab'
 '11c177c8e97725db2631073c19f07b62' '66dea50a8b16d9b4dee7af250b4be1a5']
Count unique value:  99441
----------
Columns name:  customer_id
Unique value:  ['9ef432eb6251297304e76186b10a928d' 'b0830fb4747a6c6d20dea0b8c802d7ef'
 '41ce2a54c0b03bf3443c3d931a367089' ... '1aa71eb042121263aafbe80c1b562c9c'
 'b331b74b18dc79bcdf6532d51e1637c1' 'edb027a75a1449115f6b43211ae02a24']
Count unique value:  99441
----------
Columns name:  order_status
Unique value:  ['delivered' 'invoiced' 'shipped' 'processing' 'unavailable' 'canceled'
 'created' 'approved']
Count unique value:  8
----------
Columns name:  order_purchase_timestamp
Unique value:  ['2017-10-02 10:56:33' '2018-07-24 20:41:37' '2018-08-08 08:38:49' ...
 '2017-08-27 14:46:43' '2018-01-08 21:28:27' '2018-03-08 20:57:30']
Count unique value:  98875
----------
Col

In [37]:
empty_or_whitespace_strings(df_order)[0]

order_id                         0
customer_id                      0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
dtype: int64

olist_products_dataset

In [38]:
df_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [39]:
df_product.isna().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [40]:
for i in df_product.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_product[i].unique())
    print('Count unique value: ', df_product[i].nunique())
    print('-'*10)

Columns name:  product_id
Unique value:  ['1e9e8ef04dbcff4541ed26657ea517e5' '3aa071139cb16b67ca9e5dea641aaa2f'
 '96bd76ec8810374ed1b65e291975717f' ... '9a7c6041fa9592d9d9ef6cfe62a71f8c'
 '83808703fc0706a22e264b9d75f04a2e' '106392145fca363410d287a815be6de4']
Count unique value:  32951
----------
Columns name:  product_category_name
Unique value:  ['perfumaria' 'artes' 'esporte_lazer' 'bebes' 'utilidades_domesticas'
 'instrumentos_musicais' 'cool_stuff' 'moveis_decoracao'
 'eletrodomesticos' 'brinquedos' 'cama_mesa_banho'
 'construcao_ferramentas_seguranca' 'informatica_acessorios'
 'beleza_saude' 'malas_acessorios' 'ferramentas_jardim'
 'moveis_escritorio' 'automotivo' 'eletronicos' 'fashion_calcados'
 'telefonia' 'papelaria' 'fashion_bolsas_e_acessorios' 'pcs'
 'casa_construcao' 'relogios_presentes'
 'construcao_ferramentas_construcao' 'pet_shop' 'eletroportateis'
 'agro_industria_e_comercio' nan 'moveis_sala' 'sinalizacao_e_seguranca'
 'climatizacao' 'consoles_games' 'livros_interess

In [41]:
empty_or_whitespace_strings(df_product)[0]

product_id               0
product_category_name    0
dtype: int64

olist_sellers_dataset

In [42]:
df_seller.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


In [43]:
df_seller.isna().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [44]:
for i in df_seller.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_seller[i].unique())
    print('Count unique value: ', df_seller[i].nunique())
    print('-'*10)

Columns name:  seller_id
Unique value:  ['3442f8959a84dea7ee197c632cb2df15' 'd1b65fc7debc3361ea86b5f14c68d2e2'
 'ce3ad9de960102d0677a81f5d0bb7b2d' ... '74871d19219c7d518d0090283e03c137'
 'e603cf3fec55f8697c9059638d6c8eb5' '9e25199f6ef7e7c347120ff175652c3b']
Count unique value:  3095
----------
Columns name:  seller_zip_code_prefix
Unique value:  [13023 13844 20031 ...  4650 96080 12051]
Count unique value:  2246
----------
Columns name:  seller_city
Unique value:  ['campinas' 'mogi guacu' 'rio de janeiro' 'sao paulo' 'braganca paulista'
 'brejao' 'penapolis' 'curitiba' 'anapolis' 'itirapina' 'tubarao'
 'lauro de freitas' 'imbituba' 'brasilia' 'porto seguro' 'guaruja'
 'tabatinga' 'salto' 'tres de maio' 'belo horizonte' 'arapongas'
 'sao bernardo do campo' 'tatui' 'garopaba' 'camanducaia' 'tupa'
 'guarulhos' 'sao jose dos pinhais' 'sao ludgero' 'sao jose' 'piracicaba'
 'porto alegre' 'congonhal' 'santo andre' 'osasco' 'valinhos' 'joinville'
 'saquarema' 'barra velha' 'petropolis' 'santo

In [45]:
empty_or_whitespace_strings(df_seller)[0]

seller_id       0
seller_city     0
seller_state    0
dtype: int64

product_category_name_translation

In [46]:
df_cat_name.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   product_category_name          71 non-null     object
 1   product_category_name_english  71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


In [47]:
df_cat_name.isna().sum()

product_category_name            0
product_category_name_english    0
dtype: int64

In [48]:
for i in df_cat_name.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_cat_name[i].unique())
    print('Count unique value: ', df_cat_name[i].nunique())
    print('-'*10)

Columns name:  product_category_name
Unique value:  ['beleza_saude' 'informatica_acessorios' 'automotivo' 'cama_mesa_banho'
 'moveis_decoracao' 'esporte_lazer' 'perfumaria' 'utilidades_domesticas'
 'telefonia' 'relogios_presentes' 'alimentos_bebidas' 'bebes' 'papelaria'
 'tablets_impressao_imagem' 'brinquedos' 'telefonia_fixa'
 'ferramentas_jardim' 'fashion_bolsas_e_acessorios' 'eletroportateis'
 'consoles_games' 'audio' 'fashion_calcados' 'cool_stuff'
 'malas_acessorios' 'climatizacao' 'construcao_ferramentas_construcao'
 'moveis_cozinha_area_de_servico_jantar_e_jardim'
 'construcao_ferramentas_jardim' 'fashion_roupa_masculina' 'pet_shop'
 'moveis_escritorio' 'market_place' 'eletronicos' 'eletrodomesticos'
 'artigos_de_festas' 'casa_conforto' 'construcao_ferramentas_ferramentas'
 'agro_industria_e_comercio' 'moveis_colchao_e_estofado' 'livros_tecnicos'
 'casa_construcao' 'instrumentos_musicais' 'moveis_sala'
 'construcao_ferramentas_iluminacao' 'industria_comercio_e_negocios'
 'alimen

In [49]:
empty_or_whitespace_strings(df_cat_name)[0]

product_category_name            0
product_category_name_english    0
dtype: int64

olist_age

In [50]:
df_age.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98666 entries, 0 to 98665
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  98666 non-null  object
 1   age          98666 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [51]:
df_age.isna().sum()

customer_id    0
age            0
dtype: int64

In [52]:
for i in df_age.columns:
    print('Columns name: ', i)
    print('Unique value: ', df_age[i].unique())
    print('Count unique value: ', df_age[i].nunique())
    print('-'*10)

Columns name:  customer_id
Unique value:  ['00012a2ce6f8dcda20d059ce98491703' '000161a058600d5901f007fab4c27140'
 '0001fd6190edaaf884bcaf3d49edf079' ... 'fffeda5b6d849fbd39689bb92087f431'
 'ffff42319e9b2d713724ae527742af25' 'ffffa3172527f765de70084a7e53aae8']
Count unique value:  98666
----------
Columns name:  age
Unique value:  [61 49 71 68 42 63 40 69 18 39 51 50 21 59 62 22 43 27 28 75 64 67 72 20
 54 66 73 36 65 41 25 46 53 33 47 23 57 38 56 60 70 35 24 30 52 37 31 29
 44 55 58 74 34 19 26 45 32 48]
Count unique value:  58
----------


In [53]:
empty_or_whitespace_strings(df_age)[0]

customer_id    0
dtype: int64

# 3. Exploratory data analysis (EDA)

olist_customers_dataset

In [54]:
duckdb.query("""
select *
from df_customer
""").to_df()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS


In [55]:
duckdb.query("""
select count(distinct customer_unique_id) no_customer
from df_customer
""").to_df()

Unnamed: 0,no_customer
0,96096


In [56]:
duckdb.query("""
select customer_unique_id, count(customer_id) no_sub
from df_customer
group by customer_unique_id
order by no_sub desc
""").to_df()

Unnamed: 0,customer_unique_id,no_sub
0,8d50f5eadf50201ccdcedfb9e2ac8455,17
1,3e43e6105506432c953e165fb2acf44c,9
2,ca77025e7201e3b30c44b472ff346268,7
3,1b6c7548a2a1f9037c1fd3ddfed95f33,7
4,6469f99c1f9dfae7733b25662e7f1782,7
...,...,...
96091,206e64e8af2633a2ebe158a7fcb860db,1
96092,4452b8ef472646c4cc042cb31a291f3b,1
96093,57c2cfb4a80b13ed19b5fb258d29c19d,1
96094,ca186065de6e2d01cfc99763e6d62048,1


In [57]:
duckdb.query("""
select count(distinct customer_city) no_city
from df_customer
""").to_df()

Unnamed: 0,no_city
0,4119


In [58]:
duckdb.query("""
select count(distinct customer_zip_code_prefix) no_zip
from df_customer
""").to_df()

Unnamed: 0,no_zip
0,14994


olist_geolocation_dataset

In [59]:
duckdb.query("""
select *
from df_geo
""").to_df()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


In [60]:
duckdb.query("""
select count(distinct geolocation_state) no_state
from df_geo
""").to_df()

Unnamed: 0,no_state
0,27


In [61]:
duckdb.query("""
select geolocation_state
, count(distinct geolocation_city) no_city
, count(distinct geolocation_zip_code_prefix) no_zip
from df_geo
group by geolocation_state
order by no_city desc, no_zip desc
""").to_df()

Unnamed: 0,geolocation_state,no_city,no_zip
0,MG,1426,1868
1,SP,1048,6349
2,RS,691,1132
3,BA,652,992
4,PR,651,1046
5,SC,420,620
6,GO,384,773
7,MA,299,313
8,PI,278,307
9,PE,267,596


olist_order_items_dataset

In [62]:
duckdb.query("""
select *
from df_order_item
""").to_df()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14
...,...,...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72


In [63]:
duckdb.query("""
select count(distinct order_id) no_order
from df_order_item
""").to_df()

Unnamed: 0,no_order
0,98666


In [64]:
duckdb.query("""
select order_id, count(distinct product_id) no_product
from df_order_item
group by order_id
order by no_product desc
""").to_df()

Unnamed: 0,order_id,no_product
0,ca3625898fbd48669d50701aba51cd5f,8
1,7d8f5bfd5aff648220374a2df62e84d5,7
2,ad850e69fce9a512ada84086651a2e7d,7
3,77df84f9195be22a4e9cb72ca9e8b4c2,7
4,200f4d883fcc701355e46b8c6035743f,6
...,...,...
98661,e82323311535cbac6219860ae75532d3,1
98662,e8250a89f967d49af320330c8660a9a2,1
98663,e82f641c078d554da5b76c9ebc37265d,1
98664,e85ade63f3d697aa6c45970baeff0d96,1


In [65]:
duckdb.query("""
select *
from df_order_item
where order_id = 'ca3625898fbd48669d50701aba51cd5f'
""").to_df()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,ca3625898fbd48669d50701aba51cd5f,1,1065e0ebef073787a7bf691924c60eeb,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,309.0,1.84
1,ca3625898fbd48669d50701aba51cd5f,2,0cf2faf9749f53924cea652a09d8e327,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,33.9,1.84
2,ca3625898fbd48669d50701aba51cd5f,3,0de59eddc63167215c972b0d785ffa7b,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,159.0,3.67
3,ca3625898fbd48669d50701aba51cd5f,4,0de59eddc63167215c972b0d785ffa7b,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,159.0,3.67
4,ca3625898fbd48669d50701aba51cd5f,5,5dae498eff2d80057f56122235a36aff,888faa8bfb0b159c37de6d898b961c31,2018-08-17 02:25:07,95.9,0.15
5,ca3625898fbd48669d50701aba51cd5f,6,4a5c3967bfd3629fe07ef4d0cc8c3818,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,109.9,0.15
6,ca3625898fbd48669d50701aba51cd5f,7,678c229b41c0e497d35a25a8be1cc631,888faa8bfb0b159c37de6d898b961c31,2018-08-17 02:25:07,95.9,0.15
7,ca3625898fbd48669d50701aba51cd5f,8,21b524c4c060169fa75ccf08c7da4627,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,63.7,0.15
8,ca3625898fbd48669d50701aba51cd5f,9,309dd69eb83cea38c51709d62befe1a4,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,56.0,3.68
9,ca3625898fbd48669d50701aba51cd5f,10,309dd69eb83cea38c51709d62befe1a4,0b35c634521043bf4b47e21547b99ab5,2018-08-16 02:25:07,56.0,3.68


In [66]:
duckdb.query("""
select count(distinct product_id) no_product_in_order
from df_order_item
""").to_df()

Unnamed: 0,no_product_in_order
0,32951


In [67]:
duckdb.query("""
select product_id, sum(order_item_id) no_item
from df_order_item
group by product_id
order by no_item desc
""").to_df()

Unnamed: 0,product_id,no_item
0,422879e10f46682990de24d770e7f83d,793.0
1,aca2eb7d00ea1a7b8ebd4e68314663af,640.0
2,368c6c730842d78016ad823897a372db,551.0
3,53759a2ecddad2bb87a079a1f1519f73,545.0
4,99a4788cb24856965c36a24e339b6058,542.0
...,...,...
32946,27e90baf9d996277a5299e5d828be218,1.0
32947,a2abc65fbaa1761f0c6a88512db4719f,1.0
32948,beefd66ae7d857b9c50868bb14473a86,1.0
32949,071930404657f7cd416f75bd063a9d4b,1.0


In [68]:
duckdb.query("""
select product_id
, min(price) min_price, max(price) max_price, avg(price) avg_price
from df_order_item
group by product_id
order by avg_price desc
""").to_df()

Unnamed: 0,product_id,min_price,max_price,avg_price
0,489ae2aa008f021502940f251d4cce7f,6735.00,6735.00,6735.00
1,69c590f7ffc7bf8db97190b6cb6ed62e,6729.00,6729.00,6729.00
2,1bdf5e6731585cf01aa8169c7028d6ad,6499.00,6499.00,6499.00
3,a6492cc69376c469ab6f61d8f44de961,4799.00,4799.00,4799.00
4,c3ed642d592594bb648ff4a04cee2747,4690.00,4690.00,4690.00
...,...,...,...,...
32946,310dc32058903b6416c71faff132df9e,2.29,2.29,2.29
32947,46fce52cef5caa7cc225a5531c946c8b,2.20,2.20,2.20
32948,270516a3f41dc035aa87d220228f844c,1.20,1.20,1.20
32949,05b515fdc76e888aada3c6d66c201dff,1.20,1.20,1.20


olist_order_payments_dataset

In [69]:
duckdb.query("""
select *
from df_order_payment
""").to_df()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31
103882,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80
103883,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77
103884,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54


In [70]:
duckdb.query("""
select payment_type
, count(distinct order_id) as no_order
from df_order_payment
group by payment_type
""").to_df()

Unnamed: 0,payment_type,no_order
0,not_defined,3
1,debit_card,1528
2,boleto,19784
3,voucher,3866
4,credit_card,76505


In [71]:
duckdb.query("""
select payment_type, sum(payment_sequential) as no_seq
from df_order_payment
group by payment_type
""").to_df()

Unnamed: 0,payment_type,no_seq
0,not_defined,3.0
1,credit_card,77115.0
2,boleto,19785.0
3,debit_card,1582.0
4,voucher,15029.0


In [72]:
duckdb.query("""
select payment_type, avg(payment_installments) as avg_installments_month
from df_order_payment
group by payment_type
""").to_df()

Unnamed: 0,payment_type,avg_installments_month
0,credit_card,3.507155
1,boleto,1.0
2,debit_card,1.0
3,voucher,1.0
4,not_defined,1.0


In [73]:
duckdb.query("""
select payment_type, sum(payment_value/1000) as sum_payment_value_k
from df_order_payment
group by payment_type
""").to_df()

Unnamed: 0,payment_type,sum_payment_value_k
0,not_defined,0.0
1,boleto,2869.36127
2,debit_card,217.98979
3,voucher,379.43687
4,credit_card,12542.08419


olist_order_reviews_dataset

In [74]:
duckdb.query("""
select *
from df_order_review
""").to_df()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
...,...,...,...,...,...,...,...
99219,574ed12dd733e5fa530cfd4bbf39d7c9,2a8c23fee101d4d5662fa670396eb8da,5,,,2018-07-07 00:00:00,2018-07-14 17:18:30
99220,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09 00:00:00,2017-12-11 20:06:42
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43
99222,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01 00:00:00,2018-07-02 12:59:13


In [75]:
duckdb.query("""
select distinct review_score
from df_order_review
""").to_df()

Unnamed: 0,review_score
0,5
1,2
2,4
3,1
4,3


In [76]:
duckdb.query("""
select avg(review_score) avg_score
from df_order_review
""").to_df()

Unnamed: 0,avg_score
0,4.086421


olist_orders_dataset

In [77]:
duckdb.query("""
select *
from df_order
""").to_df()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28 00:00:00
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02 00:00:00
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27 00:00:00
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00


In [78]:
duckdb.query("""
select order_status, count(distinct order_id) no_order
from df_order
group by order_status
""").to_df()

Unnamed: 0,order_status,no_order
0,shipped,1107
1,invoiced,314
2,approved,2
3,canceled,625
4,delivered,96478
5,created,5
6,unavailable,609
7,processing,301


olist_products_dataset

In [79]:
duckdb.query("""
select *
from df_product
""").to_df()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


In [80]:
duckdb.query("""
select count(product_id), count(distinct product_id)
from df_product
""").to_df()

Unnamed: 0,count(product_id),count(DISTINCT product_id)
0,32951,32951


olist_sellers_dataset

In [81]:
duckdb.query("""
select *
from df_seller
""").to_df()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
...,...,...,...,...
3090,98dddbc4601dd4443ca174359b237166,87111,sarandi,PR
3091,f8201cab383e484733266d1906e2fdfa,88137,palhoca,SC
3092,74871d19219c7d518d0090283e03c137,4650,sao paulo,SP
3093,e603cf3fec55f8697c9059638d6c8eb5,96080,pelotas,RS


In [82]:
duckdb.query("select count(seller_id), count(distinct seller_id) from df_seller").to_df()

Unnamed: 0,count(seller_id),count(DISTINCT seller_id)
0,3095,3095


product_category_name_translation

In [83]:
duckdb.query("""
select *
from df_cat_name
""").to_df()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor
...,...,...
66,flores,flowers
67,artes_e_artesanato,arts_and_craftmanship
68,fraldas_higiene,diapers_and_hygiene
69,fashion_roupa_infanto_juvenil,fashion_childrens_clothes


In [84]:
duckdb.query("""
select distinct product_category_name_english
from df_cat_name
""").to_df()

Unnamed: 0,product_category_name_english
0,fixed_telephony
1,cool_stuff
2,air_conditioning
3,furniture_living_room
4,fashion_sport
...,...
66,construction_tools_construction
67,market_place
68,food
69,computers


olist_age

In [85]:
duckdb.query("""
select *
from df_age
""").to_df()

Unnamed: 0,customer_id,age
0,00012a2ce6f8dcda20d059ce98491703,61
1,000161a058600d5901f007fab4c27140,49
2,0001fd6190edaaf884bcaf3d49edf079,71
3,0002414f95344307404f0ace7a26f1d5,68
4,000379cdec625522490c315e70c7a9fb,61
...,...,...
98661,fffcb937e9dd47a13f05ecb8290f4d3e,33
98662,fffecc9f79fd8c764f843e9951b11341,49
98663,fffeda5b6d849fbd39689bb92087f431,27
98664,ffff42319e9b2d713724ae527742af25,67


In [86]:
duckdb.query("""
select min(age) min_age, max(age) max_age, avg(age) avg_age
from df_age
""").to_df()

Unnamed: 0,min_age,max_age,avg_age
0,18,75,46.591835
