<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Análise-Exploratória-de-Dados" data-toc-modified-id="Análise-Exploratória-de-Dados-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Análise Exploratória de Dados</a></span><ul class="toc-item"><li><span><a href="#Séries-temporais" data-toc-modified-id="Séries-temporais-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Séries temporais</a></span></li><li><span><a href="#Relação-entre-duas-variáveis-continuas" data-toc-modified-id="Relação-entre-duas-variáveis-continuas-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Relação entre duas variáveis continuas</a></span></li><li><span><a href="#Relação-entre-uma-variável-continua-e-uma-categórica" data-toc-modified-id="Relação-entre-uma-variável-continua-e-uma-categórica-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Relação entre uma variável continua e uma categórica</a></span></li><li><span><a href="#Relação-entre-múltiplas-variáveis-continuas" data-toc-modified-id="Relação-entre-múltiplas-variáveis-continuas-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Relação entre múltiplas variáveis continuas</a></span></li></ul></li></ul></div>

In [None]:
import os

import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from matplotlib import pyplot as plt
from sqlalchemy import create_engine


# Connection URL for the PostgreSQL database. It can be overridden through the
# IRONHACK_DB_URL environment variable so real credentials never have to be
# committed with the notebook; the fallback keeps the original local URL so
# existing setups keep working unchanged.
# NOTE(review): the fallback still embeds a password - drop it once every
# environment defines IRONHACK_DB_URL.
DB_URL = os.environ.get(
    'IRONHACK_DB_URL',
    'postgresql+psycopg2://ironhack:123456@localhost:5432/ironhack',
)
engine = create_engine(DB_URL)

# Análise Exploratória de Dados

## Séries temporais

In [None]:
# Daily number of orders with status 'delivered' whose purchase timestamp is
# after 2017-01-01: one row per purchase date, sorted chronologically.
query_diaria = '''
SELECT
	date(ood.order_purchase_timestamp) AS data_pedido,
	count(*) AS num_pedidos
FROM
	ironhack.olist_db.olist_orders_dataset ood
WHERE
	ood.order_status = 'delivered' AND
	ood.order_purchase_timestamp > '2017-01-01'
GROUP BY
	data_pedido
ORDER BY 
	data_pedido;
'''

In [None]:
# Run the daily-orders query and load its result set into a DataFrame.
tb_numped_diario = pd.read_sql_query(sql=query_diaria, con=engine)

In [None]:
# Display the daily order counts loaded from `query_diaria`.
tb_numped_diario

In [None]:
# Daily order volume drawn as a line chart on a wide (12x4) figure.
plt.figure(figsize=(12, 4))
sns.lineplot(
    x='data_pedido',
    y='num_pedidos',
    data=tb_numped_diario,
)

In [None]:
# Daily number of 'delivered' orders (purchased after 2017-01-01) split by
# delivery-delay status. The `tb_atraso` subquery classifies every order by the
# day component of (delivered date - estimated date): > 0 is 'ATRASADO',
# <= 0 is 'ONTIME', anything else (e.g. the difference is NULL because a date
# is missing) is 'NAO ENTREGUE'.
# NOTE(review): the subquery reads the same orders table it is joined back to;
# presumably order_id is unique there, otherwise the join multiplies rows - confirm.
query_diaria_atraso = '''
SELECT
	date(ood.order_purchase_timestamp) AS data_pedido,
	tb_atraso.status_atraso,
	count(*) AS num_pedidos
FROM
	ironhack.olist_db.olist_orders_dataset ood JOIN
	(
	SELECT
		ood.order_id,
		ood.order_estimated_delivery_date AS data_estimada,
		ood.order_delivered_customer_date AS data_entregue,
		date_part('DAYS', ood.order_delivered_customer_date - ood.order_estimated_delivery_date) AS dias_atraso,
		CASE
			WHEN date_part('DAYS', ood.order_delivered_customer_date - ood.order_estimated_delivery_date) > 0 THEN 'ATRASADO'
			WHEN date_part('DAYS', ood.order_delivered_customer_date - ood.order_estimated_delivery_date) <= 0 THEN 'ONTIME'
			ELSE 'NAO ENTREGUE' END AS status_atraso
	FROM
		ironhack.olist_db.olist_orders_dataset ood
	) AS tb_atraso ON (tb_atraso.order_id = ood.order_id)
WHERE
	ood.order_status = 'delivered' AND
	ood.order_purchase_timestamp > '2017-01-01'
GROUP BY
	data_pedido,
	status_atraso
ORDER BY
	status_atraso,
	data_pedido;
'''

In [None]:
# Load the daily order counts per delay status into a DataFrame.
tb_numped_diario_atraso = pd.read_sql_query(sql=query_diaria_atraso, con=engine)

In [None]:
# One line per delay status showing the daily number of orders.
plt.figure(figsize=(12, 4))
sns.lineplot(
    x='data_pedido',
    y='num_pedidos',
    hue='status_atraso',
    data=tb_numped_diario_atraso,
)

In [None]:
# Reshape to wide format: one row per purchase date, one column per delay
# status, with 0 where a status has no orders on a given day.
tb_atraso_diario = tb_numped_diario_atraso.pivot_table(
    index='data_pedido',
    columns='status_atraso',
    values='num_pedidos',
    fill_value=0,
)

In [None]:
# Daily total (on-time + late orders only; any other status column is left out)
# and the fraction of that total that was delivered late.
pedidos_no_prazo = tb_atraso_diario['ONTIME']
pedidos_atrasados = tb_atraso_diario['ATRASADO']
tb_atraso_diario['num_pedidos'] = pedidos_no_prazo + pedidos_atrasados
tb_atraso_diario['per_atraso'] = pedidos_atrasados / tb_atraso_diario['num_pedidos']

In [None]:
# Display the per-day table with the status counts, total and late share.
tb_atraso_diario

In [None]:
# Two stacked panels sharing the same layout: daily order volume on top
# (ax[0]) and daily share of late orders below (ax[1]).
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
sns.lineplot(
    x='data_pedido',
    y='per_atraso',
    data=tb_atraso_diario,
    ax=ax[1],
)
sns.lineplot(
    x='data_pedido',
    y='num_pedidos',
    data=tb_numped_diario,
    ax=ax[0],
)

## Relação entre duas variáveis continuas

In [None]:
# Each point is one day: number of orders versus share of late orders.
sns.scatterplot(
    x='num_pedidos',
    y='per_atraso',
    data=tb_atraso_diario,
)

In [None]:
# Daily row counts from `case_1` (purchases after 2017-01-01) grouped by delay
# status and by delivery type: 'MESMO_ESTADO' when seller and customer states
# are equal, 'OUTRO_ESTADO' otherwise (which also covers a NULL state, since
# the equality test is then not true).
# NOTE(review): the `num_itens` alias suggests one row per order item in
# `case_1` - confirm against that table's definition.
query_atraso_itens = '''
SELECT
	date(c1.order_purchase_timestamp) AS data_pedido,
	c1.status_atraso,
	CASE WHEN c1.seller_state = c1.customer_state THEN 'MESMO_ESTADO'
		 ELSE 'OUTRO_ESTADO' END AS tipo_entrega,
	count(*) AS num_itens
FROM
	ironhack.olist_db.case_1 c1
WHERE
	c1.order_purchase_timestamp > '2017-01-01'
GROUP BY
	data_pedido,
	status_atraso,
	tipo_entrega
ORDER BY
	status_atraso,
	data_pedido,
	tipo_entrega;
'''

In [None]:
# Load the daily counts per delay status and delivery type into a DataFrame.
tb_itens_atraso = pd.read_sql_query(sql=query_atraso_itens, con=engine)

In [None]:
# Display the long-format counts returned by `query_atraso_itens`.
tb_itens_atraso

In [None]:
# Wide format: one row per (purchase date, delivery type) and one column per
# delay status; missing combinations become 0. The index is flattened back
# into ordinary columns so they can be used directly in plots.
itens_por_status = tb_itens_atraso.pivot_table(
    index=['data_pedido', 'tipo_entrega'],
    columns='status_atraso',
    values='num_itens',
    fill_value=0,
)
tb_itens_atraso_pt = itens_por_status.reset_index()

In [None]:
# Display the pivoted table (one column per delay status).
tb_itens_atraso_pt

In [None]:
# On-time versus late counts per (day, delivery type), coloured by delivery
# type; points are semi-transparent so overlapping ones stay visible.
sns.scatterplot(
    x='ONTIME',
    y='ATRASADO',
    hue='tipo_entrega',
    alpha=0.5,
    data=tb_itens_atraso_pt,
)

In [None]:
# Keep only rows with 50 < ONTIME < 350 and at least one late item (the log of
# zero is not finite), then add log-transformed counts for the regression plot.
# NOTE(review): the 50 / 350 bounds look hand-picked from the scatter plot of
# ONTIME vs ATRASADO - confirm and consider naming them as constants.
# The three conditions are combined into one mask and the selection is copied
# explicitly, so the column assignments below act on an independent frame and
# do not trigger pandas' SettingWithCopyWarning.
filtro_itens = (
    (tb_itens_atraso_pt['ONTIME'] < 350)
    & (tb_itens_atraso_pt['ONTIME'] > 50)
    & (tb_itens_atraso_pt['ATRASADO'] > 0)
)
tb_itens_atraso_pt_f = tb_itens_atraso_pt[filtro_itens].copy()
tb_itens_atraso_pt_f['log_atraso'] = np.log(tb_itens_atraso_pt_f['ATRASADO'])
tb_itens_atraso_pt_f['log_ontime'] = np.log(tb_itens_atraso_pt_f['ONTIME'])

In [None]:
# Linear fit of log(late) on log(on-time), one regression per delivery type;
# scatter markers are kept small and faint so the fitted lines stand out.
sns.lmplot(
    x='log_ontime',
    y='log_atraso',
    hue='tipo_entrega',
    data=tb_itens_atraso_pt_f,
    scatter_kws={"s": 5, "alpha": 0.3},
)

## Relação entre uma variável continua e uma categórica

In [None]:
# Row-level extract from `case_1` (purchases after 2017-01-01): order id,
# product id, delay status, review score and delivery type ('MESMO_ESTADO'
# when seller and customer states are equal, 'OUTRO_ESTADO' otherwise).
query_review_atraso = '''
SELECT
	c1.order_id,
	c1.product_id,
	c1.status_atraso,
	c1.review_score,
	CASE WHEN c1.seller_state = c1.customer_state THEN 'MESMO_ESTADO'
		 ELSE 'OUTRO_ESTADO' END AS tipo_entrega
FROM
	ironhack.olist_db.case_1 c1
WHERE
	c1.order_purchase_timestamp > '2017-01-01'
'''

In [None]:
# Load the review scores with their delay status into a DataFrame.
tb_review_atraso = pd.read_sql_query(sql=query_review_atraso, con=engine)

In [None]:
# Display the rows returned by `query_review_atraso`.
tb_review_atraso

In [None]:
# Distribution of review scores for each delay status.
sns.boxplot(
    x='status_atraso',
    y='review_score',
    data=tb_review_atraso,
)

## Relação entre múltiplas variáveis continuas

In [None]:
# Daily number of 'delivered' orders (purchased after 2017-01-01) per product
# category, restricted to the five categories with the highest summed item
# price (ranked in the `tc` subquery over all items with a non-NULL category).
# The join with the order-items table produces one row per item, so orders are
# counted with count(DISTINCT order_id): a plain count(*) would count items,
# contradicting the `num_pedidos` alias and making the value not comparable
# with the per-day order counts produced by `query_diaria`.
query_diaria_categoria = '''
SELECT
	date(ood.order_purchase_timestamp) AS data_pedido,
	opd.product_category_name,
	count(DISTINCT ood.order_id) AS num_pedidos
FROM
	ironhack.olist_db.olist_orders_dataset ood JOIN
	ironhack.olist_db.olist_order_items_dataset ooid ON (ood.order_id = ooid.order_id) JOIN
	ironhack.olist_db.olist_products_dataset opd ON (ooid.product_id = opd.product_id) JOIN
	(
		SELECT
			opd.product_category_name,
			ROW_NUMBER() OVER (ORDER BY SUM(ooid.price) DESC) as rank_categoria
		FROM
			ironhack.olist_db.olist_order_items_dataset ooid JOIN
			ironhack.olist_db.olist_products_dataset opd ON (ooid.product_id = opd.product_id)
		WHERE
			opd.product_category_name IS NOT NULL
		GROUP BY
			opd.product_category_name
	) AS tc ON (tc.product_category_name = opd.product_category_name)
WHERE
	ood.order_status = 'delivered' AND
	ood.order_purchase_timestamp > '2017-01-01' AND
	tc.rank_categoria <= 5
GROUP BY
	data_pedido,
	opd.product_category_name
ORDER BY
	data_pedido,
	opd.product_category_name;
'''

In [None]:
# Load the daily counts for the top categories into a DataFrame.
tb_diaria_categoria = pd.read_sql_query(sql=query_diaria_categoria, con=engine)

In [None]:
# Wide format: one row per purchase date, one column per product category,
# with 0 on days where a category has no rows.
tb_diaria_categoria_pt = tb_diaria_categoria.pivot_table(
    values='num_pedidos',
    index='data_pedido',
    columns='product_category_name',
    fill_value=0,
)

In [None]:
# Pairwise scatter plots (and per-column distributions) of the daily counts
# of every category against every other one.
sns.pairplot(data=tb_diaria_categoria_pt)

In [None]:
# Correlation between the daily counts of the categories. The boolean mask is
# True on and above the diagonal, so only the lower triangle is drawn.
corr_categoria = tb_diaria_categoria_pt.corr()
mask = np.triu(np.ones_like(corr_categoria, dtype=bool))
plt.figure(figsize=(12, 9))
heatmap = sns.heatmap(corr_categoria, mask=mask)

In [None]:
# Attach the overall daily order count to every (date, category) row. The
# per-category count is renamed first so both counts can coexist after the
# join on the purchase date.
categoria_renomeada = tb_diaria_categoria.rename(
    columns={'num_pedidos': 'num_pedidos_categoria'}
)
tb_diaria_categoria_total = pd.merge(
    left=tb_numped_diario,
    right=categoria_renomeada,
    on='data_pedido',
)

In [None]:
# Ratio between the per-category daily count and the overall daily count.
tb_diaria_categoria_total['per_categoria'] = (
    tb_diaria_categoria_total['num_pedidos_categoria']
    / tb_diaria_categoria_total['num_pedidos']
)

In [None]:
# Wide table of the daily category ratios (one column per category, 0 where a
# category is absent on a day), followed by its pairwise scatter plots.
tb_diaria_mix_pt = tb_diaria_categoria_total.pivot_table(
    values='per_categoria',
    index='data_pedido',
    columns='product_category_name',
    fill_value=0,
)
sns.pairplot(data=tb_diaria_mix_pt)

In [None]:
# Correlation between the daily category ratios; the mask hides the diagonal
# and the upper triangle so each pair appears only once.
corr_mix = tb_diaria_mix_pt.corr()
mask = np.triu(np.ones_like(corr_mix, dtype=bool))
plt.figure(figsize=(12, 9))
heatmap = sns.heatmap(corr_mix, mask=mask)

In [None]:
# Summed item price (`rb_categoria`) per seller and product category, limited
# to the 20 categories with the highest summed item price. The `top_categories`
# CTE ranks the categories (ignoring NULL category names) and keeps ranks 1-20;
# the main query joins sellers, order items and products against that list.
query_seller_tc = '''
WITH top_categories AS (
	SELECT 
		tc.product_category_name
	FROM
	(
		SELECT
			opd.product_category_name,
			ROW_NUMBER() OVER (ORDER BY SUM(ooid.price) DESC) as rank_categoria
		FROM 
			ironhack.olist_db.olist_order_items_dataset ooid JOIN
			ironhack.olist_db.olist_products_dataset opd ON (ooid.product_id = opd.product_id)
		WHERE 
			opd.product_category_name IS NOT NULL 
		GROUP BY 
			opd.product_category_name
	) AS tc
	WHERE rank_categoria <= 20
)
SELECT
	osd.seller_id,
	osd.seller_state,
	osd.seller_city,
	opd.product_category_name,
	SUM(ooid.price) AS rb_categoria
FROM 
	ironhack.olist_db.olist_sellers_dataset osd JOIN
	ironhack.olist_db.olist_order_items_dataset ooid ON (osd.seller_id = ooid.seller_id) JOIN
	ironhack.olist_db.olist_products_dataset opd ON (ooid.product_id = opd.product_id) JOIN
	top_categories tc ON (tc.product_category_name = opd.product_category_name)
GROUP BY
	osd.seller_id,
	osd.seller_state,
	osd.seller_city,
	opd.product_category_name
'''

In [None]:
# Load the per-seller, per-category summed prices into a DataFrame.
tb_seller_categoria = pd.read_sql_query(sql=query_seller_tc, con=engine)

In [None]:
# Display the rows returned by `query_seller_tc`.
tb_seller_categoria

In [None]:
# Seller x category matrix of summed prices; a seller with no rows in a
# category gets 0 instead of NaN.
pivot_seller_cat = (
    tb_seller_categoria
    .pivot_table(
        index='seller_id',
        columns='product_category_name',
        values='rb_categoria',
    )
    .fillna(0)
)

In [None]:
# Correlation between categories across sellers, drawn as a lower-triangle
# heatmap (the mask hides the diagonal and everything above it). The colour
# scale starts at 0 and is centred on 0.5.
corr_seller_cat = pivot_seller_cat.corr()
mask = np.triu(np.ones_like(corr_seller_cat, dtype=bool))
plt.figure(figsize=(12, 9))
heatmap = sns.heatmap(corr_seller_cat, mask=mask, vmin=0, center=0.5)
heatmap.set_title('Mapa de Correlação', fontdict={'fontsize':18}, pad=16);

In [None]:
# Hierarchically clustered view of the same category correlation matrix,
# using the same colour scale (starting at 0, centred on 0.5).
sns.clustermap(
    pivot_seller_cat.corr(),
    vmin=0,
    center=0.5,
);