In [1]:
# Import libraries

import psycopg2
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Create database connection

password = quote_plus('XXXX@XXXXX')

conn = create_engine(
    f'postgresql+psycopg2://postgres:{password}@localhost:5432/olist_db'
)

In [3]:
# Save query as string

query = '''
WITH delivered_orders AS (
	SELECT
		cohort_year,
		order_purchase_timestamp,
		is_reviewed
	FROM vw_order_customer
	WHERE
		is_approved = 1
		AND is_delivered = 1
),
agg AS (
	SELECT
		cohort_year,
		DATE_TRUNC('quarter', order_purchase_timestamp) AS order_quarter,
		COUNT(*) AS total_delivered_cnt,
		SUM(CASE WHEN is_reviewed = 0 THEN 1 ELSE 0 END) AS not_reviewed_cnt
	FROM delivered_orders
	GROUP BY 
		cohort_year,
		order_quarter
)

SELECT
	cohort_year,
	order_quarter,
	total_delivered_cnt,
	not_reviewed_cnt,
	ROUND((100 * not_reviewed_cnt::NUMERIC / total_delivered_cnt), 2) AS pct_vs_delivered_orders
FROM agg
ORDER BY
	cohort_year,
	order_quarter;
'''

In [4]:
# Execute SQL query and load into DataFrame
df = pd.read_sql(query, conn)

# Make sure cohort_year is treated as an integer for sorting and plotting
df['cohort_year'] = df['cohort_year'].astype('int')
# Convert order_quarter to a readable quarter label
df['quarter_label'] = df['order_quarter'].dt.to_period('Q').astype('str')

df

Unnamed: 0,cohort_year,order_quarter,total_delivered_cnt,not_reviewed_cnt,pct_vs_delivered_orders,quarter_label
0,2016,2016-07-01,1,0,0.0,2016Q3
1,2016,2016-10-01,266,3,1.13,2016Q4
2,2016,2017-01-01,1,0,0.0,2017Q1
3,2016,2017-04-01,1,0,0.0,2017Q2
4,2016,2017-07-01,2,0,0.0,2017Q3
5,2016,2017-10-01,1,0,0.0,2017Q4
6,2016,2018-01-01,2,0,0.0,2018Q1
7,2016,2018-04-01,4,0,0.0,2018Q2
8,2017,2017-01-01,4934,38,0.77,2017Q1
9,2017,2017-04-01,8983,65,0.72,2017Q2


In [5]:
# Pivot table to show % of not-reviewed delivered orders by cohort and quarter
table = df.pivot_table(
    index='cohort_year',
    columns='quarter_label',
    values='pct_vs_delivered_orders',
    fill_value=0
)

# Hide text when there is no data (0 = no data, not 0%)
def annot_format(x):
    return "" if x == 0 else f"{x:.2f}%"

# Style the table for better readability
styled_table = (
    table.style
    # Green gradient to highlight higher percentages
    .background_gradient(
        cmap='Greens',
        vmin=0,
        vmax=3
    )
    # Apply custom text formatting
    .format(annot_format)
    # Table title
    .set_caption(
        "Share of Delivered Orders Not Reviewed by Cohort and Quarter "
        "(blank = no data)"
    )
)

styled_table

quarter_label,2016Q3,2016Q4,2017Q1,2017Q2,2017Q3,2017Q4,2018Q1,2018Q2,2018Q3
cohort_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016,,1.13%,,,,,,,
2017,,,0.77%,0.72%,0.74%,0.78%,2.39%,1.11%,0.74%
2018,,,,,,,0.74%,0.49%,0.47%
