In [18]:
import pandas as pd
import duckdb
import os

In [19]:
# Switch the kernel's working directory to the dataset folder on the mounted Drive.
# NOTE(review): the later cells build absolute paths for the CSVs and the database,
# so this step looks optional — confirm before removing it.
%cd /content/drive/MyDrive/github_projects/data_modeling/dataset

/content/drive/MyDrive/github_projects/data_modeling/dataset


In [20]:
# Base folder that holds every CSV export used by this notebook
data_path = '/content/drive/MyDrive/github_projects/data_modeling/dataset'


def read_olist_csv(filename):
    """Read one CSV file located directly under ``data_path``.

    Parameters
    ----------
    filename : str
        File name (not a full path) of the CSV inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The file contents parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(data_path, filename))


# Load the main CSVs
customers = read_olist_csv('olist_customers_dataset.csv')
orders = read_olist_csv('olist_orders_dataset.csv')
order_items = read_olist_csv('olist_order_items_dataset.csv')
products = read_olist_csv('olist_products_dataset.csv')
sellers = read_olist_csv('olist_sellers_dataset.csv')
payments = read_olist_csv('olist_order_payments_dataset.csv')
reviews = read_olist_csv('olist_order_reviews_dataset.csv')
categories = read_olist_csv('product_category_name_translation.csv')

# The geolocation data comes in two files; read both and stack them.
geolocation1 = read_olist_csv('olist_geolocation_dataset_part1.csv')
geolocation2 = read_olist_csv('olist_geolocation_dataset_part2.csv')
# ignore_index=True renumbers the combined rows 0..n-1. Without it the row
# labels of part2 repeat the labels of part1, which makes label-based lookups
# (.loc) on the combined frame ambiguous.
geolocalizacao = pd.concat([geolocation1, geolocation2], ignore_index=True)


In [21]:
# Create the DuckDB connection backed by the database file for the normalized tables
con = duckdb.connect('/content/drive/MyDrive/github_projects/data_modeling/tabelas_sql/normalizada/tabelas_normalizadas.db')

# Register each DataFrame on the connection under the name the SQL below uses.
# Insertion order of the dict matches the original registration order.
frames_by_sql_name = {
    'customers': customers,
    'orders': orders,
    'order_items': order_items,
    'products': products,
    'sellers': sellers,
    'payments': payments,
    'reviews': reviews,
    'geolocation': geolocalizacao,
    'product_categories': categories,
}
for sql_name, frame in frames_by_sql_name.items():
    con.register(sql_name, frame)

# Leave the connection as the cell's final expression so the cell still echoes it
con


<duckdb.duckdb.DuckDBPyConnection at 0x7f366063e370>

In [22]:
# Table: clientes (customers), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable: the connection points at a
# database file, so a plain CREATE TABLE raises once the table exists from a
# previous run.
con.execute("""
CREATE OR REPLACE TABLE clientes AS
SELECT DISTINCT
    customer_id,
    customer_unique_id,
    customer_city,
    customer_state
FROM customers;
""")

# Table: pedidos (orders), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE pedidos AS
SELECT DISTINCT
    order_id,
    customer_id,
    order_status,
    order_purchase_timestamp,
    order_approved_at,
    order_delivered_carrier_date,
    order_delivered_customer_date,
    order_estimated_delivery_date
FROM orders;
""")

# Table: itens_pedido (order items), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE itens_pedido AS
SELECT DISTINCT
    order_id,
    order_item_id,
    product_id,
    seller_id,
    shipping_limit_date,
    price,
    freight_value
FROM order_items;
""")

# Table: produtos (products), de-duplicated.
# Fix: the column list previously selected product_length_cm twice and never
# selected the height, so the table had a duplicated length column and no
# height. The second entry is now product_height_cm.
# NOTE(review): product_height_cm is the column name in the public Olist
# products CSV — confirm it matches the file actually loaded.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
# (The misspelled *_lenght column names come from the source data; keep them.)
con.execute("""
CREATE OR REPLACE TABLE produtos AS
SELECT DISTINCT
    product_id,
    product_category_name,
    product_name_lenght,
    product_description_lenght,
    product_photos_qty,
    product_weight_g,
    product_length_cm,
    product_height_cm,
    product_width_cm
FROM products;
""")

# Table: vendedores (sellers), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE vendedores AS
SELECT DISTINCT
    seller_id,
    seller_zip_code_prefix,
    seller_city,
    seller_state
FROM sellers;
""")

# Table: pagamentos (payments), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE pagamentos AS
SELECT DISTINCT
    order_id,
    payment_sequential,
    payment_type,
    payment_installments,
    payment_value
FROM payments;
""")

# Table: reviews_table (order reviews), de-duplicated.
# Named reviews_table because `reviews` is already the name of the registered
# raw view at this point in the notebook.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE reviews_table AS
SELECT DISTINCT
    review_id,
    order_id,
    review_score,
    review_comment_title,
    review_comment_message,
    review_creation_date,
    review_answer_timestamp
FROM reviews;
""")

# Table: geolocalizacao (geolocation), de-duplicated.
# Reads from the `geolocation` view, which was registered over the combined
# two-part geolocation DataFrame.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE geolocalizacao AS
SELECT DISTINCT
    geolocation_zip_code_prefix,
    geolocation_lat,
    geolocation_lng,
    geolocation_city,
    geolocation_state
FROM geolocation;
""")

# Table: categorias_produto (product category name -> English name), de-duplicated.
# CREATE OR REPLACE keeps this cell re-runnable against the on-disk database;
# a plain CREATE TABLE raises once the table exists from a previous run.
con.execute("""
CREATE OR REPLACE TABLE categorias_produto AS
SELECT DISTINCT
    product_category_name,
    product_category_name_english
FROM product_categories;
""")




<duckdb.duckdb.DuckDBPyConnection at 0x7f366063e370>

In [23]:
# List everything the connection currently exposes (new tables plus the raw views)
listing_before_cleanup = con.execute("show tables;")
listing_before_cleanup.fetchall()

[('categorias_produto',),
 ('clientes',),
 ('customers',),
 ('geolocalizacao',),
 ('geolocation',),
 ('itens_pedido',),
 ('order_items',),
 ('orders',),
 ('pagamentos',),
 ('payments',),
 ('pedidos',),
 ('product_categories',),
 ('products',),
 ('produtos',),
 ('reviews',),
 ('reviews_table',),
 ('sellers',),
 ('vendedores',)]

In [24]:
# Drop the old, non-normalized raw views so only the normalized tables remain
raw_view_names = [
    'customers',
    'orders',
    'order_items',
    'products',
    'sellers',
    'payments',
    'reviews',
    'geolocation',
    'product_categories',
]
for raw_view_name in raw_view_names:
    con.execute(f"DROP VIEW IF EXISTS {raw_view_name};")

# Leave the connection as the cell's final expression so the cell still echoes it
con


<duckdb.duckdb.DuckDBPyConnection at 0x7f366063e370>

In [25]:
# List the remaining catalog entries to confirm the raw views are gone
listing_after_cleanup = con.execute("SHOW tables")
listing_after_cleanup.fetchall()

[('categorias_produto',),
 ('clientes',),
 ('geolocalizacao',),
 ('itens_pedido',),
 ('pagamentos',),
 ('pedidos',),
 ('produtos',),
 ('reviews_table',),
 ('vendedores',)]

In [26]:
# Query: customer rows per state, ten largest states first.
# NOTE(review): this counts customer_id values. The table also carries
# customer_unique_id, so if one person can own several customer_id values this
# is not a count of distinct customers — confirm which is intended.
clientes_por_estado_sql = """
SELECT customer_state,
       COUNT(customer_id) AS total_clientes
FROM clientes
GROUP BY customer_state
ORDER BY total_clientes DESC LIMIT 10;"""
clientes_por_estado = con.execute(clientes_por_estado_sql).df()
display(clientes_por_estado)


Unnamed: 0,customer_state,total_clientes
0,SP,41746
1,RJ,12852
2,MG,11635
3,RS,5466
4,PR,5045
5,SC,3637
6,BA,3380
7,DF,2140
8,ES,2033
9,GO,2020


In [27]:
# Query: number of orders in each order status, most frequent first
pedidos_por_status_sql = """
SELECT order_status,
       COUNT(order_id) AS total_pedidos
FROM pedidos
GROUP BY order_status
ORDER BY total_pedidos DESC"""
pedidos_por_status = con.execute(pedidos_por_status_sql).df()
display(pedidos_por_status)


Unnamed: 0,order_status,total_pedidos
0,delivered,96478
1,shipped,1107
2,canceled,625
3,unavailable,609
4,invoiced,314
5,processing,301
6,created,5
7,approved,2


In [28]:
# Query: number of order-item rows per product, ten best sellers first
vendas_por_produto_sql = """SELECT product_id,
       COUNT(order_item_id) AS total_vendas
FROM itens_pedido
GROUP BY product_id
ORDER BY total_vendas DESC
LIMIT 10;"""
vendas_por_produto = con.execute(vendas_por_produto_sql).df()
display(vendas_por_produto)

Unnamed: 0,product_id,total_vendas
0,aca2eb7d00ea1a7b8ebd4e68314663af,527
1,99a4788cb24856965c36a24e339b6058,488
2,422879e10f46682990de24d770e7f83d,484
3,389d119b48cf3043d311335e499d9c6b,392
4,368c6c730842d78016ad823897a372db,388
5,53759a2ecddad2bb87a079a1f1519f73,373
6,d1c427060a0f73f6b889a5c7c61f2ac4,343
7,53b36df67ebb7c41585e8d54d6772e08,323
8,154e7e31ebfa092203795c972e5804a6,281
9,3dd2a17168ec895c781a9191c1e95ad7,274


In [29]:
# Query: the ten heaviest products by product_weight_g
produtos_mais_pesados_sql = """SELECT product_id,
       product_name_lenght,
       product_weight_g
FROM produtos
ORDER BY product_weight_g DESC
LIMIT 10;"""
produtos_mais_pesados = con.execute(produtos_mais_pesados_sql).df()
display(produtos_mais_pesados)

Unnamed: 0,product_id,product_name_lenght,product_weight_g
0,26644690fde745fc4654719c3904e1db,59.0,40425.0
1,f97ad9066c718a6cef93dfcf253d3e0d,63.0,30000.0
2,dcfeedf441c38e5e7e58ffce194af2bb,50.0,30000.0
3,0a859d8dc68f6a746b4709217110c439,50.0,30000.0
4,8250ed49c0929b233a405e3ece4ce328,22.0,30000.0
5,343c15a347e523f2b6cf38a5db81e179,48.0,30000.0
6,0ed9a84687fad2d921b09b0ebcc4cded,53.0,30000.0
7,4abee1df902ca6e48fbe864fce3859bc,48.0,30000.0
8,d239ed6ed6dabbcbfd8a3b776e1ca50c,59.0,30000.0
9,c6fdec160d0f8f488d9041316c85051d,57.0,30000.0


In [30]:
# Query: total amount paid per payment type, largest total first
pagamentos_por_tipo_sql = """SELECT payment_type,
       SUM(payment_value) AS total_pago
FROM pagamentos
GROUP BY payment_type
ORDER BY total_pago DESC;"""
pagamentos_por_tipo = con.execute(pagamentos_por_tipo_sql).df()
display(pagamentos_por_tipo)


Unnamed: 0,payment_type,total_pago
0,credit_card,12542080.0
1,boleto,2869361.0
2,voucher,379436.9
3,debit_card,217989.8
4,not_defined,0.0


In [31]:
# Query: average review score per product, best-rated first.
# Fix: read from the normalized tables (reviews_table, itens_pedido) like the
# other queries do. The `reviews` and `order_items` views were dropped earlier
# in the notebook, so those names no longer refer to anything stored in the
# database. The old query presumably only ran because DuckDB fell back to the
# Python DataFrames of the same name, which breaks as soon as the database is
# opened without them.
# A review is attached to an order, so every product on that order receives
# the order's score through the join on order_id.
display(con.execute("""SELECT oi.product_id,
       AVG(r.review_score) AS media_avaliacao
FROM reviews_table r
JOIN itens_pedido oi ON r.order_id = oi.order_id
GROUP BY oi.product_id
ORDER BY media_avaliacao DESC;""").df())


Unnamed: 0,product_id,media_avaliacao
0,f832d1b20241274bc51c2d691b0f4b94,5.0
1,5b685cb0e36ab67e60afdaf2d784e2ed,5.0
2,a67862541af347668d69290d8951beeb,5.0
3,86da2c247cca57fe5928cc175e2a65e3,5.0
4,6d034a9fd26095b72dd2c2060af17d5b,5.0
...,...,...
32784,b85f305eec7f31aa9c5f04a3e937571a,1.0
32785,6eefea74d72822573892405c674918e6,1.0
32786,07883aa96515a898e1b9ddf0537a1b56,1.0
32787,bfa8fb32cecbf90899f6f517b77a766f,1.0


In [32]:
# Close the DuckDB connection now that all queries have run.
con.close()