# Setup and Load Data

These cells initializes our environment and loads the cleaned sales and products datasets that we prepared earlier. We're working with 46M+ sales transactions across 2M customers from Flipkart's e-commerce platform.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [2]:
sales = pd.read_csv('sales_cleaned.csv')
products = pd.read_csv('products_cleaned.csv')

In [3]:
print("Columns:", sales.columns.tolist())
pd.set_option('display.max_columns', None)
print(sales.head(20))

Columns: ['date', 'city_name', 'order_id', 'cart_id', 'dim_customer_key', 'procured_quantity', 'unit_selling_price', 'total_discount_amount', 'product_id', 'total_weighted_landing_price']
          date  city_name   order_id    cart_id  dim_customer_key  \
0   2022-04-01     Mumbai  112246974  173273802          17995199   
1   2022-04-01  Bengaluru  112246976  173273597          18259433   
2   2022-04-01  Bengaluru  112247019  173123717           5402601   
3   2022-04-01     HR-NCR  112247045  172547459          15649744   
4   2022-04-01     Mumbai  112247123  173081820          10127605   
5   2022-04-01     HR-NCR  112247149  173274225            479129   
6   2022-04-01  Bengaluru  112247170  173273989           8658748   
7   2022-04-01     Mumbai  112247182  173273662          16541671   
8   2022-04-01      Delhi  112247233  173269263             21466   
9   2022-04-01      Delhi  112247239  173274446          13522664   
10  2022-04-01      Delhi  112247317  173274278      

In [4]:
sales = sales.rename(columns={
    "city_name": "city",
    "dim_customer_key": "customer_id",
    "procured_quantity": "quantity",
    "unit_selling_price": "unit_price",
    "total_discount_amount": "discount",
    "total_weighted_landing_price": "landing_price"
})

In [5]:
print("Columns:", products.columns.tolist())
pd.set_option('display.max_columns', None)
print(products.head(20))

Columns: ['product_id', 'product_name', 'unit', 'product_type', 'brand_name', 'manufacturer_name', 'l0_category', 'l1_category', 'l2_category', 'l0_category_id', 'l1_category_id', 'l2_category_id']
    product_id                                       product_name  \
0       476763                                   Christmas - Card   
1       483436  Plum BodyLovin' Hawaiian Rumba Shower Gel - Sa...   
2       476825                     Diwali Gift Card Free - Sample   
3       483438  Plum BodyLovin' Trippin' Mimosas Shower Gel - ...   
4       480473             Flipkart Valentine Day Greeting - Card   
5       483694         Dabur Vita Chocolate Health Drink - Sample   
6       486016   Plum Green Tea Pore Cleansing Face Wash - Sample   
7       486017       Plum Green Tea Oil-Free Moisturizer - Sample   
8       486124               Kari kari Salt & Pepper Snack Sample   
9       486125                   Maggi Liquid Coconut Milk Sample   
10      486376                       Orion 

In [66]:
products = products.rename(columns={
    "product_name": "name",
    "product_type": "type",
    "brand_name": "brand",
    "manufacturer_name": "manufacturer",
    "l0_category": "category_l0",
    "l1_category": "category_l1",
    "l2_category": "category_l2",
    "l0_category_id": "cat0_id",
    "l1_category_id": "cat1_id",
    "l2_category_id": "cat2_id"
})

In [68]:
print(f"Sales shape: {sales.shape}")
print(f"Products shape: {products.shape}")
print(f"Unique customers: {sales['customer_id'].nunique():,}")
print(f"Unique products sold: {sales['product_id'].nunique():,}")
print(f"Date range: {sales['date'].min()} to {sales['date'].max()}")

Sales shape: (46448124, 10)
Products shape: (33704, 12)
Unique customers: 1,986,574
Unique products sold: 17,243
Date range: 2022-04-01 to 2022-07-10


In [None]:
# Compute revenue, profit, and price range
sales["revenue"] = (sales["unit_price"] * sales["quantity"]) - sales["discount"]
sales["profit"] = sales["revenue"] - sales["landing_price"]

# Define price bins and labels
price_ranges = [0, 50, 100, 250, 500, 1000, 5000, float("inf")]
labels = ["$0-50", "$50-100", "$100-250", "$250-500", "$500-1000", "$1000-5000", "$5000+"]

# Assign price range based on unit price
sales["price_range"] = pd.cut(sales["unit_price"], bins=price_ranges, labels=labels)

# Core Customer Aggregation

Transform transaction-level data (46M rows) into customer-level features (2M rows). This is the foundation - creating one row per customer with their basic purchasing metrics like total orders, revenue, and purchase dates.

In [70]:
customer_agg = sales.groupby('customer_id').agg({
    # Order metrics
    'order_id': 'nunique',
    'product_id': 'nunique',
    
    # Quantity metrics
    'quantity': ['sum', 'mean', 'std'],
    
    # Financial metrics
    'revenue': ['sum', 'mean', 'std', 'min', 'max'],
    'profit': ['sum', 'mean'],
    'discount': ['sum', 'mean', 'max'],
    'unit_price': ['mean', 'std', 'min', 'max'],
    
    # Temporal
    'date': ['min', 'max', 'nunique'],
    'city': 'nunique'
}).reset_index()

customer_agg.columns = ['_'.join(col).strip('_') if col[1] else col[0] 
                        for col in customer_agg.columns]

In [71]:
# Rename for clarity
rename_dict = {
    'order_id_nunique': 'total_orders',
    'product_id_nunique': 'unique_products',
    'quantity_sum': 'total_quantity',
    'quantity_mean': 'avg_quantity_per_order',
    'quantity_std': 'quantity_std',
    'revenue_sum': 'total_revenue',
    'revenue_mean': 'avg_revenue_per_order',
    'revenue_std': 'revenue_std',
    'revenue_min': 'min_order_revenue',
    'revenue_max': 'max_order_revenue',
    'profit_sum': 'total_profit',
    'profit_mean': 'avg_profit_per_order',
    'discount_sum': 'total_discount',
    'discount_mean': 'avg_discount',
    'discount_max': 'max_discount_used',
    'unit_price_mean': 'avg_price_point',
    'unit_price_std': 'price_std',
    'unit_price_min': 'min_price_paid',
    'unit_price_max': 'max_price_paid',
    'date_min': 'first_purchase',
    'date_max': 'last_purchase',
    'date_nunique': 'active_days',
    'city_nunique': 'unique_cities'
}

customer_agg = customer_agg.rename(columns=rename_dict)

In [72]:
# Fill NaN values for customers with single purchases (no std deviation)
for col in ['quantity_std', 'revenue_std', 'price_std']:
    customer_agg[col].fillna(0, inplace=True)

In [73]:
print(f"Aggregation complete: {len(sales):,} transactions → {len(customer_agg):,} customers")
print(f"Compression ratio: {len(sales)/len(customer_agg):.1f}x")
print(f"Features created: {len(customer_agg.columns) - 1}")  # -1 for customer_id

Aggregation complete: 46,448,124 transactions → 1,986,574 customers
Compression ratio: 23.4x
Features created: 23


These features form the foundation for RFM analysis and customer lifetime value calculations

- total_orders, total_revenue: Core metrics for customer value and engagement
- avg_price_point: Indicates price sensitivity and purchasing power
- price_std: Shows if customer is consistent or experimental with price ranges
- unique_products: Measures exploration behavior vs. habitual buying

In [74]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 24)


# Temporal Features and Recency Metrics

Calculate time-based features including customer lifetime, recency (key churn indicator), and recent activity windows. Recency is the #1 predictor of churn - customers who haven't purchased recently are likely to leave.

In [75]:
sales['date'] = pd.to_datetime(sales['date'])
customer_agg['first_purchase'] = pd.to_datetime(customer_agg['first_purchase'], errors='coerce')
customer_agg['last_purchase'] = pd.to_datetime(customer_agg['last_purchase'], errors='coerce')

In [76]:
# Set reference date (last date in dataset)
reference_date = pd.to_datetime('2022-07-10')

# Customer lifetime (how long they've been with us)
customer_agg['customer_lifetime_days'] = (
    customer_agg['last_purchase'] - customer_agg['first_purchase']
).dt.days

# Recency (days since last purchase - critical for churn)
customer_agg['recency_days'] = (
    reference_date - customer_agg['last_purchase']
).dt.days

# Core RFM values
customer_agg['recency'] = customer_agg['recency_days']
customer_agg['frequency'] = customer_agg['total_orders']
customer_agg['monetary'] = customer_agg['total_revenue']

In [77]:
# Recent activity in different time windows
print("Calculating recent activity (last 30/60/90 days)...")
for window in [30, 60, 90]:
    mask = sales['date'] >= (reference_date - pd.Timedelta(days=window))
    recent = sales[mask].groupby('customer_id').agg({
        'order_id': 'nunique',
        'revenue': 'sum',
        'quantity': 'sum'
    }).rename(columns={
        'order_id': f'orders_last_{window}d',
        'revenue': f'revenue_last_{window}d',
        'quantity': f'items_last_{window}d'
    })
    customer_agg = customer_agg.merge(recent, on='customer_id', how='left')

# Fill NaN with 0 (customers with no recent activity)
for col in customer_agg.columns:
    if 'last_' in col:
        customer_agg[col].fillna(0, inplace=True)

Calculating recent activity (last 30/60/90 days)...


In [78]:
print(f"Average customer lifetime: {customer_agg['customer_lifetime_days'].mean():.1f} days")
print(f"Average recency: {customer_agg['recency_days'].mean():.1f} days")
print(f"Active in last 30 days: {(customer_agg['orders_last_30d'] > 0).mean():.1%}")

Average customer lifetime: 30.8 days
Average recency: 34.2 days
Active in last 30 days: 54.0%


These features are essential for time-series based predictions and understanding customer lifecycle stages

- recency_days: Single best predictor of churn - customers not buying for 60+ days are likely churned
- orders_last_30/60/90d: Captures engagement trajectory - declining recent orders signals churn risk
- customer_lifetime_days: Differentiates new vs. established customers for better segmentation

In [79]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 38)


# Purchase Pattern Analysis

Calculate the regularity and predictability of customer purchases. Some customers buy weekly like clockwork, others sporadically. This helps identify habitual vs. occasional buyers.

In [80]:
# Get order dates for each customer
order_dates = sales.groupby(['customer_id', 'order_id'])['date'].first().reset_index()
order_dates = order_dates.sort_values(['customer_id', 'date'])

# Calculate days between consecutive orders (vectorized for speed)
order_dates['days_since_prev_order'] = (
    order_dates.groupby('customer_id')['date'].diff().dt.days
)

# Aggregate gap statistics
gap_stats = order_dates.groupby('customer_id')['days_since_prev_order'].agg([
    ('avg_days_between_orders', 'mean'),
    ('std_days_between_orders', 'std'),
    ('min_days_between_orders', 'min'),
    ('max_days_between_orders', 'max')
]).reset_index()

customer_agg = customer_agg.merge(gap_stats, on='customer_id', how='left')

# Fill NaN for single-order customers (no gaps to calculate)
customer_agg['avg_days_between_orders'].fillna(999, inplace=True)  # 999 indicates single purchase
customer_agg['std_days_between_orders'].fillna(0, inplace=True)
customer_agg['min_days_between_orders'].fillna(0, inplace=True)
customer_agg['max_days_between_orders'].fillna(0, inplace=True)

In [81]:
print(f"Median days between orders: {customer_agg[customer_agg['avg_days_between_orders'] < 999]['avg_days_between_orders'].median():.1f}")
print(f"Regular buyers (std < 10 days): {(customer_agg['std_days_between_orders'] < 10).mean():.1%}")

Median days between orders: 9.0
Regular buyers (std < 10 days): 83.3%


These patterns are crucial for recommendation timing and churn prediction models

- avg_days_between_orders: Identifies purchase cycle - useful for timing retention campaigns
- std_days_between_orders: Low std = regular buyer (predictable), high std = sporadic buyer
- max_days_between_orders: Helps set personalized churn thresholds per customer

In [82]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 42)


# Product and Category Intelligence

Merge sales with product information to understand WHAT customers buy - their category preferences, brand loyalties, and product diversity. This is essential for personalized recommendations.

In [83]:
# Merge sales with product details
sales_prod = sales.merge(
    products[['product_id', 'brand', 'category_l0', 'category_l1', 'category_l2']], 
    on='product_id', 
    how='left'
)

# Handle missing values
sales_prod['brand'].fillna('Unknown', inplace=True)
for col in ['category_l0', 'category_l1', 'category_l2']:
    sales_prod[col].fillna('Unknown', inplace=True)

In [84]:
# Calculate category and brand diversity
print("Calculating category and brand diversity...")
diversity_stats = sales_prod.groupby('customer_id').agg({
    'brand': 'nunique',
    'category_l0': 'nunique',
    'category_l1': 'nunique',
    'category_l2': 'nunique'
}).rename(columns={
    'brand': 'unique_brands',
    'category_l0': 'unique_l0_categories',
    'category_l1': 'unique_l1_categories',
    'category_l2': 'unique_l2_categories'
})

customer_agg = customer_agg.merge(diversity_stats, on='customer_id', how='left')

Calculating category and brand diversity...


In [85]:
print(f"Avg unique brands per customer: {customer_agg['unique_brands'].mean():.1f}")
print(f"Customers buying from single category: {(customer_agg['unique_l1_categories'] == 1).mean():.1%}")

Avg unique brands per customer: 10.8
Customers buying from single category: 15.8%


Essential for cross-sell strategies and understanding customer shopping behavior. Variety seekers and focused buyers have different churn patterns and require different retention strategies
- unique_brands: High value = variety seeker, low = brand loyal
- unique_categories: Indicates if customer has diverse needs or focused purchases

In [86]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 46)


# Top Brand and Category Preferences

Identify the top 3 brands and categories for each customer. Instead of just counting diversity, we identify specific preferences. This is crucial for personalized product recommendations.

In [87]:
# TOP 3 BRANDS
print("Calculating top 3 brand preferences...")
brand_counts = (sales_prod.groupby(['customer_id', 'brand'])
                .size()
                .reset_index(name='count'))

brand_counts['rank'] = (brand_counts.groupby('customer_id')['count']
                        .rank(method='dense', ascending=False))

top_brands = brand_counts[brand_counts['rank'] <= 3].pivot_table(
    index='customer_id',
    columns='rank',
    values='brand',
    aggfunc='first'
).reset_index()

top_brands.columns = ['customer_id'] + [f'top_brand_{int(i)}' for i in top_brands.columns[1:]]

Calculating top 3 brand preferences...


In [88]:
# TOP 3 CATEGORIES (L1 level - most useful)
print("Calculating top 3 category preferences...")
cat_counts = (sales_prod.groupby(['customer_id', 'category_l1'])
             .size()
             .reset_index(name='count'))

cat_counts['rank'] = (cat_counts.groupby('customer_id')['count']
                      .rank(method='dense', ascending=False))

top_cats = cat_counts[cat_counts['rank'] <= 3].pivot_table(
    index='customer_id',
    columns='rank',
    values='category_l1',
    aggfunc='first'
).reset_index()

top_cats.columns = ['customer_id'] + [f'top_category_{int(i)}' for i in top_cats.columns[1:]]

Calculating top 3 category preferences...


In [89]:
# TOP 3 PRODUCTS
print("Calculating top 3 product preferences...")
product_counts = (sales_prod.groupby(['customer_id', 'product_id'])
                 .agg({'quantity': 'sum'})
                 .reset_index())

product_counts['rank'] = (product_counts.groupby('customer_id')['quantity']
                          .rank(method='dense', ascending=False))

top_products = product_counts[product_counts['rank'] <= 3].pivot_table(
    index='customer_id',
    columns='rank',
    values='product_id',
    aggfunc='first'
).reset_index()

top_products.columns = ['customer_id'] + [f'top_product_{int(i)}' for i in top_products.columns[1:]]

Calculating top 3 product preferences...


In [90]:
# Merge all preferences
customer_agg = customer_agg.merge(top_brands, on='customer_id', how='left')
customer_agg = customer_agg.merge(top_cats, on='customer_id', how='left')
customer_agg = customer_agg.merge(top_products, on='customer_id', how='left')

# Fill missing values
for col in customer_agg.columns:
    if 'top_brand' in col or 'top_category' in col:
        customer_agg[col].fillna('Unknown', inplace=True)
    elif 'top_product' in col:
        customer_agg[col].fillna(-1, inplace=True)

In [91]:
print(f"Most popular brand: {customer_agg['top_brand_1'].value_counts().head(1).index[0]}")
print(f"Most popular category: {customer_agg['top_category_1'].value_counts().head(1).index[0]}")

Most popular brand: Unknown
Most popular category: Fresh Vegetables


Having top 3 instead of just top 1 captures backup preferences when primary choice is unavailable
- top_brand_1/2/3: Enables brand-based recommendations and identifies brand loyalty
- top_category_1/2/3: Critical for cross-category recommendations
- top_product_1/2/3: Identifies repeat purchase products for replenishment reminders

In [92]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 55)


# Loyalty and Concentration Scores

Calculate how loyal or focused customers are to their favorite brands/categories. A customer buying 80% from one brand is very different from one spreading purchases across 10 brands.

In [93]:
# Brand loyalty score (% of purchases from top brand)
total_brand_purchases = brand_counts.groupby('customer_id')['count'].sum()

brand_loyalty = (
    brand_counts.sort_values(['customer_id', 'count'], ascending=[True, False])
    .drop_duplicates(subset=['customer_id'], keep='first')  # ensure 1 row per customer
    .set_index('customer_id')
)

brand_loyalty['brand_loyalty_score'] = (
    brand_loyalty['count'] / total_brand_purchases * 100
)

brand_loyalty = brand_loyalty[['brand_loyalty_score']].reset_index()

In [94]:
# Category focus score (% from top category)
total_cat_purchases = cat_counts.groupby('customer_id')['count'].sum()

cat_focus = (
    cat_counts.sort_values(['customer_id', 'count'], ascending=[True, False])
    .drop_duplicates(subset=['customer_id'], keep='first')  # ensure 1 row per customer
    .set_index('customer_id')
)

cat_focus['category_focus_score'] = (
    cat_focus['count'] / total_cat_purchases * 100
)

cat_focus = cat_focus[['category_focus_score']].reset_index()


In [95]:
# Product loyalty (how often they repeat products)
product_repeat = (
    sales_prod.groupby('customer_id')['product_id']
    .apply(lambda x: (x.value_counts() > 1).sum() / x.nunique() * 100 if x.nunique() > 0 else 0)
    .reset_index(name='product_repeat_rate')
)


In [96]:
# Merge loyalty scores
customer_agg = customer_agg.merge(brand_loyalty, on='customer_id', how='left')
customer_agg = customer_agg.merge(cat_focus, on='customer_id', how='left')
customer_agg = customer_agg.merge(product_repeat, on='customer_id', how='left')

# Fill NaN with 0
for col in ['brand_loyalty_score', 'category_focus_score', 'product_repeat_rate']:
    customer_agg[col].fillna(0, inplace=True)


In [97]:
print(f"Avg brand loyalty: {customer_agg['brand_loyalty_score'].mean():.1f}%")
print(f"Highly loyal customers (>80%): {(customer_agg['brand_loyalty_score'] > 80).mean():.1%}")
print(f"Variety seekers (<30%): {(customer_agg['brand_loyalty_score'] < 30).mean():.1%}")

Avg brand loyalty: 45.2%
Highly loyal customers (>80%): 17.2%
Variety seekers (<30%): 38.5%


These scores are crucial for segmentation and personalization strategies
- brand_loyalty_score: High loyalty customers need different retention strategies than variety seekers
- category_focus_score: Focused buyers are easier to predict and recommend to
- product_repeat_rate: High repeat rate indicates habitual buying - good for subscription models

In [98]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 58)


# Financial Ratios and Metrics

Calculate profitability ratios and discount usage patterns. Understanding price sensitivity and profit contribution helps prioritize which customers to retain.

In [99]:
# Profit margin per customer
customer_agg['profit_margin'] = (
    customer_agg['total_profit'] / customer_agg['total_revenue'].replace(0, np.nan)
) * 100

# Discount usage ratio
customer_agg['discount_ratio'] = (
    customer_agg['total_discount'] / customer_agg['total_revenue'].replace(0, np.nan)
) * 100

In [100]:
# Fill NaN with 0
customer_agg['profit_margin'].fillna(0, inplace=True)
customer_agg['discount_ratio'].fillna(0, inplace=True)

In [101]:
# Price segment classification
def assign_price_segment(price):
    if price < 50: return 'budget'
    elif price < 150: return 'mid_range'
    elif price < 500: return 'premium'
    else: return 'luxury'

customer_agg['price_segment'] = customer_agg['avg_price_point'].apply(assign_price_segment)

In [102]:
print(f"Average profit margin: {customer_agg['profit_margin'].mean():.1f}%")
print(f"Customers never using discounts: {(customer_agg['discount_ratio'] == 0).mean():.1%}")
print("\nPrice segment distribution:")
print(customer_agg['price_segment'].value_counts())

Average profit margin: 6.0%
Customers never using discounts: 85.0%

Price segment distribution:
price_segment
mid_range    1197899
budget        401799
premium       371896
luxury         14980
Name: count, dtype: int64


Critical for ROI-based retention strategies and pricing optimization
- profit_margin: Identifies most valuable customers to retain
- discount_ratio: High ratio = price-sensitive, needs discounts to purchase
- price_segment: Helps target right products at right price points

In [103]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 61)


#  Discount Behavior Analysis

Analyze how customers respond to discounts. Some customers only buy on sale, others buy regardless. This helps design effective promotional strategies.

In [104]:
# Calculate discount percentage for each transaction
sales_prod['discount_pct'] = (
    sales_prod['discount'] / sales_prod['revenue'].replace(0, np.nan)
) * 100

# Analyze purchase patterns at different discount levels
discount_patterns = sales_prod.groupby('customer_id').apply(
    lambda x: pd.Series({
        'no_discount_purchase_pct': (x['discount_pct'].fillna(0) == 0).mean() * 100,
        'small_discount_pct': ((x['discount_pct'] > 0) & (x['discount_pct'] <= 10)).mean() * 100,
        'medium_discount_pct': ((x['discount_pct'] > 10) & (x['discount_pct'] <= 20)).mean() * 100,
        'high_discount_pct': (x['discount_pct'] > 20).mean() * 100,
        'avg_discount_needed': x[x['discount_pct'] > 0]['discount_pct'].mean() if (x['discount_pct'] > 0).any() else 0,
        'discount_sensitivity': x['discount_pct'].std() if len(x) > 1 else 0
    })
).reset_index()

In [105]:
customer_agg = customer_agg.merge(discount_patterns, on='customer_id', how='left')

# Fill NaN
for col in discount_patterns.columns:
    if col != 'customer_id':
        customer_agg[col].fillna(0, inplace=True)

In [106]:
print(f"Pure full-price buyers (never use discount): {(customer_agg['no_discount_purchase_pct'] == 100).mean():.1%}")
print(f"Discount dependent (>50% purchases with discount): {(customer_agg['no_discount_purchase_pct'] < 50).mean():.1%}")
print(f"Average discount needed to trigger purchase: {customer_agg['avg_discount_needed'].mean():.1f}%")

Pure full-price buyers (never use discount): 85.0%
Discount dependent (>50% purchases with discount): 10.2%
Average discount needed to trigger purchase: 13.5%


Essential for designing personalized promotions and maximizing profit margins
- no_discount_purchase_pct: Identifies price-insensitive customers (most profitable)
- high_discount_pct: Flags bargain hunters who wait for sales
- discount_sensitivity: High variability suggests strategic buying behavior

In [107]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 67)


# Cross-Category Shopping Patterns

Analyze how customers shop across categories. Do they buy from multiple categories in one order (one-stop shoppers) or focused purchases? This reveals shopping mission types.

In [108]:
# Calculate categories per order
order_complexity = sales_prod.groupby(['customer_id', 'order_id']).agg({
    'category_l1': 'nunique',
    'brand': 'nunique',
    'product_id': 'nunique',
    'quantity': 'sum'
}).reset_index()

In [109]:
# Aggregate to customer level
cross_category = order_complexity.groupby('customer_id').agg({
    'category_l1': ['mean', 'max'],
    'brand': ['mean', 'max'],
    'product_id': ['mean', 'max'],
    'quantity': ['mean', 'max']
}).reset_index()

cross_category.columns = ['customer_id', 
                          'avg_categories_per_order', 'max_categories_per_order',
                          'avg_brands_per_order', 'max_brands_per_order',
                          'avg_products_per_order', 'max_products_per_order',
                          'avg_items_per_order', 'max_items_per_order']

In [110]:
# Calculate percentage of multi-category orders
multi_cat = order_complexity.groupby('customer_id').apply(
    lambda x: pd.Series({
        'single_category_orders_pct': (x['category_l1'] == 1).mean() * 100,
        'multi_category_orders_pct': (x['category_l1'] > 1).mean() * 100,
        'bulk_orders_pct': (x['quantity'] > 10).mean() * 100
    })
).reset_index()

In [111]:
# Merge
customer_agg = customer_agg.merge(cross_category, on='customer_id', how='left')
customer_agg = customer_agg.merge(multi_cat, on='customer_id', how='left')

In [112]:
print(f"Avg categories per order: {customer_agg['avg_categories_per_order'].mean():.2f}")
print(f"One-stop shoppers (>50% multi-category): {(customer_agg['multi_category_orders_pct'] > 50).mean():.1%}")
print(f"Bulk buyers (>50% bulk orders): {(customer_agg['bulk_orders_pct'] > 50).mean():.1%}")

Avg categories per order: 3.22
One-stop shoppers (>50% multi-category): 69.0%
Bulk buyers (>50% bulk orders): 7.8%


Critical for basket optimization and cross-category recommendations
- avg_categories_per_order: High value = one-stop shopper, good for cross-sell
- multi_category_orders_pct: Indicates convenience shoppers vs. focused buyers
- bulk_orders_pct: Identifies wholesale/business customers vs. retail

In [113]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 78)


# Price Range Preferences

Identify which price ranges customers are comfortable with. Some customers consistently buy premium, others hunt for bargains. This helps price product recommendations appropriately.

In [114]:
# Create price range buckets
sales_prod['price_range'] = pd.cut(
    sales_prod['unit_price'],
    bins=[0, 50, 100, 250, 500, 1000, float('inf')],
    labels=['0-50', '50-100', '100-250', '250-500', '500-1000', '1000+']
)

In [115]:
# Calculate top 3 price ranges per customer
price_counts = (sales_prod.groupby(['customer_id', 'price_range'])
               .size()
               .reset_index(name='count'))

price_counts['rank'] = (price_counts.groupby('customer_id')['count']
                        .rank(method='dense', ascending=False))

top_price_ranges = price_counts[price_counts['rank'] <= 3].pivot_table(
    index='customer_id',
    columns='rank',
    values='price_range',
    aggfunc='first'
).reset_index()

top_price_ranges.columns = ['customer_id'] + [f'top_price_range_{int(i)}' for i in top_price_ranges.columns[1:]]

In [116]:
# Price volatility (how much price varies)
price_consistency = sales_prod.groupby('customer_id')['unit_price'].agg([
    ('price_variance', 'var'),
    ('price_cv', lambda x: x.std() / x.mean() if x.mean() > 0 else 0)  # Coefficient of variation
]).reset_index()

In [117]:
# Merge
customer_agg = customer_agg.merge(top_price_ranges, on='customer_id', how='left')
customer_agg = customer_agg.merge(price_consistency, on='customer_id', how='left')

# Fill missing
for col in customer_agg.columns:
    if 'top_price_range' in col:
        # Convert categorical to string type before filling
        customer_agg[col] = customer_agg[col].astype('object')
        customer_agg[col].fillna('Unknown', inplace=True)

customer_agg['price_variance'].fillna(0, inplace=True)
customer_agg['price_cv'].fillna(0, inplace=True)

In [118]:
print(f"Most common price range: {customer_agg['top_price_range_1'].value_counts().head(1).index[0]}")
print(f"Price consistent customers (CV < 0.5): {(customer_agg['price_cv'] < 0.5).mean():.1%}")

Most common price range: 0-50
Price consistent customers (CV < 0.5): 28.6%


Critical for price optimization in recommendations

- top_price_range_1/2/3: Ensures recommendations match customer's budget
- price_variance: Low variance = predictable spending, high = varies by need
- price_cv: Identifies price-experimental customers vs. consistent spenders


In [119]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 83)


# Purchase Velocity and Trends

Calculate how customer behavior changes over time. Are they buying more frequently or slowing down? This is crucial for early churn detection.

In [120]:
# Purchase velocity (orders per day)
customer_agg['purchase_velocity'] = (
    customer_agg['total_orders'] / (customer_agg['customer_lifetime_days'] + 1)
)

# Purchase acceleration (comparing first vs last period)
def calculate_purchase_trend(customer_data):
    """Compare purchase frequency in first half vs second half of customer lifetime"""
    customer_data = customer_data.sort_values('date')
    
    if len(customer_data) < 3:  # Need minimum data
        return 0
    
    mid_point = len(customer_data) // 2
    first_half = customer_data.iloc[:mid_point]
    second_half = customer_data.iloc[mid_point:]
    
    # Orders in each half
    first_orders = first_half['order_id'].nunique()
    second_orders = second_half['order_id'].nunique()
    
    # Days in each period
    first_days = (first_half['date'].max() - first_half['date'].min()).days + 1
    second_days = (second_half['date'].max() - second_half['date'].min()).days + 1
    
    if first_days > 0 and second_days > 0:
        first_velocity = first_orders / first_days
        second_velocity = second_orders / second_days
        
        # Calculate trend
        if second_velocity > first_velocity * 1.2:
            return 1  # Accelerating
        elif second_velocity < first_velocity * 0.8:
            return -1  # Declining
    
    return 0  # Stable

In [121]:
# Apply trend calculation
print("Computing trends (this may take a minute)...")
purchase_trends = sales.groupby('customer_id').apply(calculate_purchase_trend).reset_index()
purchase_trends.columns = ['customer_id', 'purchase_trend']

# Revenue trend (is spend increasing or decreasing)
revenue_trends = sales.groupby('customer_id').apply(
    lambda x: 1 if x.iloc[len(x)//2:]['revenue'].sum() > x.iloc[:len(x)//2]['revenue'].sum() else -1
).reset_index(name='revenue_trend')

Computing trends (this may take a minute)...


In [122]:
# Merge trends
customer_agg = customer_agg.merge(purchase_trends, on='customer_id', how='left')
customer_agg = customer_agg.merge(revenue_trends, on='customer_id', how='left')

customer_agg['purchase_trend'].fillna(0, inplace=True)
customer_agg['revenue_trend'].fillna(0, inplace=True)

In [123]:
print(f"Accelerating customers: {(customer_agg['purchase_trend'] == 1).mean():.1%}")
print(f"Declining customers: {(customer_agg['purchase_trend'] == -1).mean():.1%}")
print(f"Stable customers: {(customer_agg['purchase_trend'] == 0).mean():.1%}")

Accelerating customers: 17.6%
Declining customers: 31.4%
Stable customers: 51.1%


These are among the strongest predictors for churn models

- purchase_velocity: Fast velocity = engaged customer
- purchase_trend: Declining trend is early warning sign of churn
- revenue_trend: Even if frequency stable, declining spend indicates dissatisfaction

In [124]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 86)


# RFM Scoring System

Create standardized RFM (Recency, Frequency, Monetary) scores on a 1-5 scale. This classic framework enables customer segmentation and is widely used in retail analytics.

In [125]:
# Recency: Lower is better (5 = most recent)
customer_agg['recency_score'] = pd.qcut(
    customer_agg['recency_days'], 
    q=5, 
    labels=[5, 4, 3, 2, 1],
    duplicates='drop'
).astype(int)

In [126]:
# Frequency: Higher is better (5 = most frequent)
customer_agg['frequency_score'] = pd.qcut(
    customer_agg['frequency'].rank(method='first'), 
    q=5,
    labels=[1, 2, 3, 4, 5],
    duplicates='drop'
).astype(int)

In [127]:
# Monetary: Higher is better (5 = highest value)
customer_agg['monetary_score'] = pd.qcut(
    customer_agg['monetary'].rank(method='first'), 
    q=5,
    labels=[1, 2, 3, 4, 5],
    duplicates='drop'
).astype(int)


In [128]:
# Combined RFM score (3-15 range)
customer_agg['rfm_score'] = (
    customer_agg['recency_score'] + 
    customer_agg['frequency_score'] + 
    customer_agg['monetary_score']
)

# RFM segment code (e.g., "555" = best customers)
customer_agg['rfm_segment_code'] = (
    customer_agg['recency_score'].astype(str) +
    customer_agg['frequency_score'].astype(str) +
    customer_agg['monetary_score'].astype(str)
)

In [129]:
print(f"Champions (555): {(customer_agg['rfm_segment_code'] == '555').sum():,} customers")
print(f"At Risk (255): {(customer_agg['rfm_segment_code'] == '255').sum():,} customers")
print(f"Lost (111): {(customer_agg['rfm_segment_code'] == '111').sum():,} customers")
print(f"\nRFM Score distribution:")
print(customer_agg['rfm_score'].value_counts().sort_index())

Champions (555): 152,109 customers
At Risk (255): 9,097 customers
Lost (111): 83,243 customers

RFM Score distribution:
rfm_score
3      83243
4     130447
5     188488
6     195490
7     177404
8     173908
9     157416
10    149626
11    149617
12    146177
13    141132
14    141517
15    152109
Name: count, dtype: int64


Champions (555) need different treatment than At-Risk (255) customers. Foundation for customer segmentation strategies.

- RFM scores: Industry-standard framework for customer value assessment
- rfm_segment_code: Enables creation of 125 micro-segments for targeted marketing

In [130]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 91)


# Advanced Behavioral Scores

Create composite scores that combine multiple features into meaningful behavioral indicators. These help quickly identify customer types for business actions.

In [131]:
# Customer Lifetime Value (projected annual)
customer_agg['clv_predicted'] = (
    customer_agg['total_revenue'] / 
    (customer_agg['customer_lifetime_days'] + 1) * 365
)

# Product diversity score
customer_agg['product_diversity_score'] = (
    customer_agg['unique_products'] / customer_agg['total_orders']
)

In [132]:
# Bargain hunter score (0-100)
median_price = customer_agg['avg_price_point'].median()
customer_agg['bargain_hunter_score'] = (
    ((customer_agg['avg_price_point'] < median_price) * 30) +  # Below median price
    ((customer_agg['high_discount_pct'] > 30) * 40) +  # Often uses high discounts
    ((customer_agg['no_discount_purchase_pct'] < 20) * 30)  # Rarely buys without discount
)

In [133]:
# Premium buyer score (0-100)
customer_agg['premium_buyer_score'] = (
    ((customer_agg['avg_price_point'] > median_price * 2) * 40) +  # High price point
    ((customer_agg['price_segment'] == 'luxury') * 30) +
    ((customer_agg['no_discount_purchase_pct'] > 80) * 30)  # Rarely uses discounts
)

In [134]:
# Engagement score (0-100)
customer_agg['engagement_score'] = (
    ((customer_agg['orders_last_30d'] > 0) * 30) +  # Recent activity
    ((customer_agg['unique_l1_categories'] > customer_agg['unique_l1_categories'].median()) * 20) +  # Diverse interests
    ((customer_agg['purchase_trend'] == 1) * 30) +  # Accelerating
    ((customer_agg['multi_category_orders_pct'] > 50) * 20)  # Cross-category shopping
)

In [135]:
print(f"Average predicted CLV: ${customer_agg['clv_predicted'].mean():.2f}")
print(f"Bargain hunters (score > 70): {(customer_agg['bargain_hunter_score'] > 70).mean():.1%}")
print(f"Premium buyers (score > 70): {(customer_agg['premium_buyer_score'] > 70).mean():.1%}")
print(f"Highly engaged (score > 70): {(customer_agg['engagement_score'] > 70).mean():.1%}")

Average predicted CLV: $81680.65
Bargain hunters (score > 70): 2.2%
Premium buyers (score > 70): 0.6%
Highly engaged (score > 70): 10.4%


- clv_predicted: Prioritize retention efforts on high-CLV customers
- bargain_hunter_score: These customers need discounts to convert
- premium_buyer_score: Can market high-margin products to these customers
- engagement_score: Composite metric for overall customer health

In [136]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 96)


# Churn Risk Indicators

Create churn labels and risk scores. This is our target variable for supervised learning and helps prioritize retention efforts.

In [137]:
# Binary churn label (60-day threshold)
CHURN_THRESHOLD = 60
customer_agg['is_churned'] = (
    customer_agg['recency_days'] > CHURN_THRESHOLD
).astype(int)

# Churn risk score (0-1 continuous)
customer_agg['churn_risk_score'] = (
    customer_agg['recency_days'] / CHURN_THRESHOLD
).clip(0, 1)

In [138]:
# Churn risk category
def categorize_churn_risk(score):
    if score <= 0.3: return 'low_risk'
    elif score <= 0.7: return 'medium_risk'
    else: return 'high_risk'

customer_agg['churn_risk_category'] = customer_agg['churn_risk_score'].apply(categorize_churn_risk)

In [139]:
# Early warning indicators
customer_agg['showing_decline'] = (
    (customer_agg['purchase_trend'] == -1) & 
    (customer_agg['recency_days'] > 30)
).astype(int)

customer_agg['at_risk'] = (
    (customer_agg['churn_risk_score'] > 0.5) & 
    (customer_agg['is_churned'] == 0)  # Not yet churned but risky
).astype(int)

In [140]:
print(f"Churn rate: {customer_agg['is_churned'].mean():.1%}")
print(f"At risk (not churned but high risk): {customer_agg['at_risk'].mean():.1%}")
print("\nChurn risk distribution:")
print(customer_agg['churn_risk_category'].value_counts())
print(f"\nCustomers showing decline: {customer_agg['showing_decline'].mean():.1%}")

Churn rate: 24.0%
At risk (not churned but high risk): 22.0%

Churn risk distribution:
churn_risk_category
low_risk       728858
high_risk      715811
medium_risk    541905
Name: count, dtype: int64

Customers showing decline: 9.2%


These enable proactive retention before customers actually churn

- is_churned: Target variable for churn prediction models
- churn_risk_score: Continuous measure allows probability-based interventions
- at_risk: Identifies customers needing immediate attention

In [141]:
print(f"Shape: {customer_agg.shape}")

Shape: (1986574, 101)


# Customer Segmentation Labels

Create business-friendly customer segments based on multiple factors. These segments are actionable for marketing teams.

In [142]:
# Binary flags for segmentation
customer_agg['is_one_time_buyer'] = (customer_agg['total_orders'] == 1).astype(int)
customer_agg['is_frequent_buyer'] = (customer_agg['total_orders'] >= 10).astype(int)
customer_agg['is_high_value'] = (
    customer_agg['total_revenue'] >= customer_agg['total_revenue'].quantile(0.75)
).astype(int)
customer_agg['is_discount_dependent'] = (customer_agg['no_discount_purchase_pct'] < 30).astype(int)
customer_agg['is_active'] = (customer_agg['recency_days'] <= 30).astype(int)

In [143]:
# Create named segments
def assign_customer_segment(row):
    if row['is_churned']:
        return 'Lost'
    elif row['is_one_time_buyer'] and row['recency_days'] < 30:
        return 'New'
    elif row['is_one_time_buyer']:
        return 'One-Time'
    elif row['is_high_value'] and row['is_frequent_buyer'] and row['is_active']:
        return 'Champions'
    elif row['is_high_value'] and row['is_active']:
        return 'Big Spenders'
    elif row['is_frequent_buyer'] and row['is_active']:
        return 'Loyal'
    elif row['at_risk'] and row['is_high_value']:
        return 'At Risk - High Value'
    elif row['at_risk']:
        return 'At Risk'
    elif row['is_active']:
        return 'Promising'
    else:
        return 'Hibernating'

In [144]:
customer_agg['customer_segment'] = customer_agg.apply(assign_customer_segment, axis=1)

In [145]:
print("Segment distribution:")
print(customer_agg['customer_segment'].value_counts())
print(f"\nOne-time buyers: {customer_agg['is_one_time_buyer'].mean():.1%}")
print(f"High-value customers: {customer_agg['is_high_value'].mean():.1%}")
print(f"Discount dependent: {customer_agg['is_discount_dependent'].mean():.1%}")

Segment distribution:
customer_segment
Lost                    476870
Promising               387513
Champions               242352
New                     235705
One-Time                202942
At Risk                 188460
Big Spenders            172022
At Risk - High Value     53231
Loyal                    27479
Name: count, dtype: int64

One-time buyers: 38.3%
High-value customers: 25.0%
Discount dependent: 9.2%


Each segment needs different marketing strategies and business treatment

- customer_segment: Actionable labels for marketing campaigns
- is_one_time_buyer: 38% of customers - need activation strategies
- is_high_value: Top 25% by revenue - prioritize for retention

# Feature Summary and Save

Summarize all features created, check data quality, and save the final dataset for modeling.

In [146]:
# Data quality check
print(f"Final dataset statistics:")
print(f"Shape: {customer_agg.shape}")
print(f"Memory usage: {customer_agg.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Null values: {customer_agg.isnull().sum().sum()}")
print(f"Duplicate customers: {customer_agg['customer_id'].duplicated().sum()}")

Final dataset statistics:
Shape: (1986574, 107)
Memory usage: 2821.14 MB
Null values: 0
Duplicate customers: 0


In [147]:
# Key business metrics
print(f"Business metrics:")
print(f"Total customers: {len(customer_agg):,}")
print(f"Churn rate: {customer_agg['is_churned'].mean():.1%}")
print(f"One-time buyers: {customer_agg['is_one_time_buyer'].mean():.1%}")
print(f"Average CLV: ${customer_agg['clv_predicted'].mean():,.2f}")
print(f"High-risk customers: {(customer_agg['churn_risk_category'] == 'high_risk').mean():.1%}")


Business metrics:
Total customers: 1,986,574
Churn rate: 24.0%
One-time buyers: 38.3%
Average CLV: $81,680.65
High-risk customers: 36.0%


In [148]:
print("Saving enhanced customer features...")
customer_agg.to_csv('customer_features_complete.csv', index=False)

Saving enhanced customer features...


In [149]:
pd.set_option('display.max_columns', None)
print(customer_agg.head(20))

    customer_id  total_orders  unique_products  total_quantity  \
0           189            32               93             208   
1           224             1                6               7   
2           234            91              230             686   
3           249            13               36              83   
4           257            10               20              27   
5           259             3                3               3   
6           263            43              101             154   
7           272             6               15              18   
8           275             8               17              37   
9           276             1                3               5   
10          279             1                1               1   
11          281             9               36              48   
12          298            79              159             366   
13          313             3                4               6   
14        