# Exploratory Data Analysis

In [None]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv
import polars as pl
import plotly.express as px

# Pull settings into the process environment via python-dotenv, then build the
# SQLAlchemy engine shared by every query cell in this notebook.
load_dotenv()
# Indexing os.environ (rather than .get) fails fast with a KeyError when
# DATABASE_URL is not set.
database_url = os.environ["DATABASE_URL"]
engine = create_engine(database_url)


## Order and Revenue Analysis

To get an overview of our data, we can count the total number of orders and the total number of items ordered in the online store. We can also inspect the schema of the polars dataframe to check that the data was loaded correctly.

We might also be interested in some summary statistics, like the time range of our records, and total revenue in Brazilian reais (consisting of item prices + freight values, i.e. shipping costs).

In [None]:
# Dataset overview: row counts for orders and order items, a preview of the
# expanded order-items view, the covered purchase-time range, and revenue totals.
with engine.connect() as conn:
    n_orders = conn.execute(text("SELECT COUNT(*) FROM olist.orders")).scalar()
    print("Orders in DB:", n_orders)

    n_ordered_items = conn.execute(text("SELECT COUNT(*) FROM olist.order_items")).scalar()
    print("Ordered items in DB:", n_ordered_items)

    # Reuse the connection that is already open (as the monthly-revenue cell
    # does) instead of passing the engine, which would require a second
    # connection just for this one query.
    df = pl.read_database("SELECT * FROM olist.order_items_expanded", conn)

# Everything below works on the in-memory dataframe only, so the connection
# has already been released at this point.
display(df.head())

# Time span covered by the purchase timestamps.
first = df.select(pl.col("order_purchase_timestamp").min()).item()
last = df.select(pl.col("order_purchase_timestamp").max()).item()
duration = last - first
print(f"Order purchase timestamps range from {first} to {last} ({duration.days} days)")

# Revenue is the sum of item prices plus freight (shipping) values.
price_sum = df.select(pl.col("price").sum()).item()
freight_sum = df.select(pl.col("freight_value").sum()).item()
print(f"Total price of all order items: R$ {price_sum:,.0f}")
print(f"Total freight value of all order items: R$ {freight_sum:,.0f}")
print(f"Total revenue (price + freight): R$ {price_sum + freight_sum:,.0f}")

Orders in DB: 99441
Ordered items in DB: 112650


order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_id
str,i64,str,str,datetime[μs],"decimal[38,2]","decimal[38,2]",str,datetime[μs],datetime[μs],str
"""00010242fe8c5a6d1ba2dd792cb162…",1,"""4244733e06e7ecb4970a6e2683c13e…","""48436dade18ac8b2bce089ec2a0412…",2017-09-19 09:45:35,58.9,13.29,"""delivered""",2017-09-13 08:59:02,2017-09-20 23:43:48,"""3ce436f183e68e07877b285a838db1…"
"""00018f77f2f0320c557190d7a144bd…",1,"""e5f2d52b802189ee658865ca93d83a…","""dd7ddc04e1b6c2c614352b383efe2d…",2017-05-03 11:05:13,239.9,19.93,"""delivered""",2017-04-26 10:53:06,2017-05-12 16:04:24,"""f6dd3ec061db4e3987629fe6b26e5c…"
"""000229ec398224ef6ca0657da4fc70…",1,"""c777355d18b72b67abbeef9df44fd0…","""5b51032eddd242adc84c38acab88f2…",2018-01-18 14:48:30,199.0,17.87,"""delivered""",2018-01-14 14:33:31,2018-01-22 13:19:16,"""6489ae5e4333f3693df5ad4372dab6…"
"""00024acbcdf0a6daa1e931b038114c…",1,"""7634da152a4610f1595efa32f14722…","""9d7a1d34a5052409006425275ba1c2…",2018-08-15 10:10:18,12.99,12.79,"""delivered""",2018-08-08 10:00:35,2018-08-14 13:32:39,"""d4eb9395c8c0431ee92fce09860c5a…"
"""00048cc3ae777c65dbb7d2a0634bc1…",1,"""ef92defde845ab8450f9d70c526ef7…","""6426d21aca402a131fc0a5d0960a3c…",2017-05-23 03:55:27,21.9,12.69,"""delivered""",2017-05-15 21:42:34,2017-05-22 13:44:35,"""816cbea969fe5b689b39cfc97a5067…"


Order purchase timestamps range from 2016-09-04 21:15:19 to 2018-09-03 09:06:57 (728 days)
Total price of all order items: R$13,591,644
Total freight value of all order items: R$2,251,910
Total revenue (price + freight): R$15,843,553


### Monthly revenue

Of interest to any online retailer is the monthly revenue. Here, we visualize it both as a heatmap and as a grouped line chart. Additionally, the plot of the cumulative revenue shows how sales performed over the years.

In [36]:
# Monthly revenue: load the aggregated view, derive month/year/cumulative
# columns, then draw a heatmap, a per-year line chart and a cumulative line chart.
with engine.connect() as conn:
    df_monthly_revenue = pl.read_database("SELECT * FROM olist.monthly_revenue", conn)

# The connection is only needed for the load above; the rest is in-memory.
# Sort chronologically before the running total: the row order returned by
# SELECT * on the view is not established here, and cum_sum() (as well as the
# line traces) follows row order.
df_monthly_revenue = df_monthly_revenue.sort("datetime_month").with_columns(
    pl.col("datetime_month").dt.month().alias("month"),
    pl.col("datetime_month").dt.year().alias("year"),
    pl.col("total_revenue").cum_sum().alias("cumulative_revenue"),
)

# As a heatmap: one row per year, one column per month.
df_pivot = df_monthly_revenue.to_pandas().pivot(values="total_revenue", index="year", columns="month")
px.imshow(
    df_pivot.to_numpy(),
    labels={"x": "Month", "y": "Year", "color": "Total Revenue"},
    x=df_pivot.columns.astype(str),
    y=df_pivot.index.astype(str),
    title="Monthly Revenue Heatmap",
).show()

# One line per year, with the calendar month on the x-axis.
px.line(
    df_monthly_revenue,
    x="month",
    y="total_revenue",
    color="year",
    title="Monthly Revenue Over Time",
    markers=True,
).show()

# Running total of revenue across the whole period.
px.line(
    df_monthly_revenue,
    x="datetime_month",
    y="cumulative_revenue",
    title="Cumulative Revenue Over Time",
    markers=True,
).show()