<a href="https://colab.research.google.com/github/mesters-work/olist-analytics/blob/main/olist_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Read the three raw Olist CSV exports into DataFrames, in the order
# orders -> order items -> customers.
orders, items, customers = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/olist_data/olist_orders_dataset.csv",
        "/content/olist_data/olist_order_items_dataset.csv",
        "/content/olist_data/olist_customers_dataset.csv",
    )
)
# Parse the raw timestamp columns into datetime columns under shorter names
# (new column -> raw source column).
parsed_from_raw = {
    'purchase_date': 'order_purchase_timestamp',
    'delivered_date': 'order_delivered_customer_date',
    'estimated_date': 'order_estimated_delivery_date',
}
for parsed_name, raw_name in parsed_from_raw.items():
    orders[parsed_name] = pd.to_datetime(orders[raw_name])

# Whole days elapsed between purchase and delivery to the customer.
time_to_deliver = orders['delivered_date'] - orders['purchase_date']
orders['delivery_days'] = time_to_deliver.dt.days

# An order is late when it arrived after the estimated delivery timestamp.
# NOTE: a missing (NaT) delivered date compares as False, so any order without
# a delivery date is flagged as not late.
orders['late_flag'] = orders['estimated_date'] < orders['delivered_date']

# Purchase month as a string (e.g. "2017-01"), used for monthly aggregation.
purchase_period = orders['purchase_date'].dt.to_period('M')
orders['order_month'] = purchase_period.astype(str)

# Export 1: the cleaned per-order table.
export_columns = [
    'order_id',
    'customer_id',
    'purchase_date',
    'delivered_date',
    'delivery_days',
    'late_flag',
    'order_month',
]
orders_clean = orders[export_columns]
orders_clean.to_csv("/content/olist_orders_clean.csv", index=False)

# --- Customer cohort retention -------------------------------------------
# Cohorts must be keyed on a per-person identifier. In the published Olist
# schema `customer_id` is generated anew for every order, while
# `customer_unique_id` (customers table) identifies the returning shopper.
# Keying on `customer_id` would give every "customer" exactly one order, so
# period_index would always be 0 and retention would be meaningless.
# NOTE(review): relies on the Olist customers schema -- confirm the column is
# present in this extract. If it is absent we fall back to `customer_id`.
if 'customer_unique_id' in customers.columns:
    id_lookup = (customers.drop_duplicates('customer_id')
                          .set_index('customer_id')['customer_unique_id'])
    # Orders with no matching customer row keep their own customer_id.
    shopper_id = orders['customer_id'].map(id_lookup).fillna(orders['customer_id'])
else:
    shopper_id = orders['customer_id']
shopper_id = shopper_id.rename('shopper_id')

# Find first purchase month for each customer
first_purchase = orders.groupby(shopper_id)['purchase_date'].transform('min')
orders['cohort_month'] = first_purchase.dt.to_period('M')

# Period index (whole calendar months between the cohort month and the
# month of this purchase), computed with vectorised year/month arithmetic.
orders['period_index'] = (
    (orders['purchase_date'].dt.year - first_purchase.dt.year) * 12
    + (orders['purchase_date'].dt.month - first_purchase.dt.month)
)

# Distinct shoppers active in each (cohort, months-since-cohort) cell.
# The shopper id is attached to a temporary copy so `orders` gains no extra
# columns beyond cohort_month and period_index.
cohort_data = (orders.assign(shopper_id=shopper_id)
                     .groupby(['cohort_month', 'period_index'])['shopper_id']
                     .nunique()
                     .reset_index(name='active_customers'))

# Compute retention %: active shoppers divided by the cohort's size, where
# cohort size is the number of shoppers active in period 0.
cohort_sizes = (cohort_data.loc[cohort_data['period_index'] == 0,
                                ['cohort_month', 'active_customers']]
                           .rename(columns={'active_customers': 'cohort_size'}))
cohort_data = cohort_data.merge(cohort_sizes, on='cohort_month')
cohort_data['retention_rate'] = cohort_data['active_customers'] / cohort_data['cohort_size']

# Export 2
cohort_data.to_csv("/content/customer_cohorts.csv", index=False)

# Attach each order's customer state; the left join keeps every order even
# when no matching customer row exists.
state_lookup = customers[['customer_id', 'customer_state']]
geo = orders.merge(state_lookup, on='customer_id', how='left')

# Per-state totals: number of orders and number flagged late.
# NOTE: total_orders counts every order row with an order_id, including any
# without a delivery date (those carry late_flag == False), so pct_late is
# the late share of all orders rather than of delivered orders only.
orders_by_state = geo.groupby('customer_state')
state_totals = orders_by_state.agg(
    total_orders=pd.NamedAgg(column='order_id', aggfunc='count'),
    late_orders=pd.NamedAgg(column='late_flag', aggfunc='sum'),
)
geo_summary = state_totals.reset_index()

# Share of late orders within each state.
geo_summary['pct_late'] = geo_summary['late_orders'].div(geo_summary['total_orders'])

# Export 3: state-level delivery summary.
geo_summary.to_csv("/content/geo_deliveries.csv", index=False)

