In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from typing import Dict, Tuple, List

In [2]:
# Load the raw customer and transaction tables from the Kaggle input directory.
customers = pd.read_csv(
    '/kaggle/input/e-commerce-event-data/seph_customers.csv'
)
transactions = pd.read_csv(
    '/kaggle/input/e-commerce-event-data/seph_transactions.csv'
)

In [3]:
# Remove the leftover 'Unnamed: 0' column (presumably a saved index - confirm
# against the CSVs). Rebinding instead of mutating in place, together with
# errors='ignore', keeps this cell safe to re-run: the second execution is a
# no-op rather than a KeyError.
customers = customers.drop(columns=['Unnamed: 0'], errors='ignore')
transactions = transactions.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Product hierarchy: category -> subcategory (product level 1) -> list of
# product names (product level 2). The top-level keys are looked up with the
# values of transactions['category'] when product details are assigned below.
# Note: the leaf name 'Regular' appears under both 'Shampoo' and 'Conditioner',
# so product level 2 alone does not identify a subcategory.
product_hierarchy = {
    'Skincare': {
        'Moisturizers': ['Day Cream', 'Night Cream', 'Eye Cream'],
        'Cleansers': ['Foam Cleanser', 'Oil Cleanser', 'Micellar Water'],
        'Treatments': ['Vitamin C Serum', 'Retinol', 'Hyaluronic Acid']
    },
    'Makeup': {
        'Face': ['Foundation', 'Concealer', 'Blush', 'Bronzer'],
        'Eyes': ['Mascara', 'Eyeshadow', 'Eyeliner'],
        'Lips': ['Lipstick', 'Lip Gloss', 'Lip Liner']
    },
    'Fragrance': {
        'Perfume': ['Eau de Parfum', 'Eau de Toilette'],
        'Sets': ['Gift Set', 'Travel Set']
    },
    'Haircare': {
        'Shampoo': ['Regular', 'Dry', 'Oily'],
        'Conditioner': ['Regular', 'Deep', 'Leave-in'],
        'Styling': ['Hair Oil', 'Heat Protectant', 'Hair Spray']
    }
}

# Add product level 1 and product level 2 columns
def assign_product_details(category, hierarchy=None, rng=None):
    """Pick a random subcategory and product for a top-level category.

    Args:
        category: Top-level key of the hierarchy (e.g. a value from
            ``transactions['category']``).
        hierarchy: Optional mapping ``category -> subcategory -> [products]``.
            Defaults to the module-level ``product_hierarchy``.
        rng: Optional object exposing ``choice`` (e.g. ``random.Random(seed)``)
            used for both draws. Defaults to the global ``random`` module, so
            ``random.seed(...)`` controls the outcome.

    Returns:
        Tuple ``(product_level_1, product_level_2)``: the chosen subcategory
        and a product name from that subcategory.

    Raises:
        KeyError: If ``category`` is not a key of the hierarchy.
    """
    if hierarchy is None:
        hierarchy = product_hierarchy
    if rng is None:
        rng = random

    subcategories = hierarchy[category]
    # Select a random product level 1 (subcategory)
    product_level_1 = rng.choice(list(subcategories))
    # Select a random product level 2 (actual product) within that subcategory
    product_level_2 = rng.choice(subcategories[product_level_1])
    return product_level_1, product_level_2

# Apply the function to the transactions dataframe.
# Seed the global RNG first so the random product assignment is identical on
# every Restart & Run All (and on every re-run of this cell).
random.seed(42)

# Draw one (level 1, level 2) pair per transaction in a single pass, then
# attach both columns at once; this avoids building a pd.Series per row.
product_levels = [
    assign_product_details(category) for category in transactions['category']
]
transactions[['product_level_1', 'product_level_2']] = pd.DataFrame(
    product_levels,
    index=transactions.index,
    columns=['product_level_1', 'product_level_2'],
)

# Display the updated transactions dataframe
transactions

Unnamed: 0,transaction_id,customer_id,date,category,quantity,unit_price,total_amount,product_level_1,product_level_2
0,TXN_357806,CUST_000000,2024-12-10,Skincare,8,48.527557,388.220452,Cleansers,Micellar Water
1,TXN_890844,CUST_000000,2024-01-20,Fragrance,4,61.553292,246.213169,Sets,Gift Set
2,TXN_862276,CUST_000000,2024-08-16,Haircare,8,37.358774,298.870189,Shampoo,Regular
3,TXN_110665,CUST_000000,2024-09-20,Skincare,4,43.358752,173.435010,Cleansers,Micellar Water
4,TXN_379781,CUST_000000,2024-08-22,Haircare,4,23.248625,92.994500,Styling,Hair Spray
...,...,...,...,...,...,...,...,...,...
26381,TXN_778123,CUST_000999,2024-02-26,Skincare,4,52.116852,208.467409,Cleansers,Oil Cleanser
26382,TXN_542317,CUST_000999,2024-06-21,Haircare,6,49.535322,297.211932,Styling,Heat Protectant
26383,TXN_054181,CUST_000999,2024-07-06,Haircare,3,40.263499,120.790496,Shampoo,Regular
26384,TXN_717607,CUST_000999,2024-04-04,Makeup,4,34.409350,137.637400,Lips,Lipstick


In [5]:
class CustomerAnalyticsSuite:
    """Customer feature engineering, churn/LTV modelling and RFM segmentation.

    Typical flow: ``prepare_customer_features`` builds one row per customer;
    that frame is then passed to ``predict_churn_risk``,
    ``calculate_customer_ltv`` and ``generate_customer_segments``.
    """

    # Values of ``product_level_2`` that count as premium when computing
    # ``premium_product_ratio``. Override on an instance or subclass to match
    # the catalogue in use.
    # NOTE(review): the product hierarchy assigned earlier in this notebook has
    # no 'Luxury' leaf, so with this default the ratio is 0 for every customer.
    PREMIUM_PRODUCT_LABELS = ('Luxury',)

    # Quintile labels shared by the recency, frequency and monetary scores.
    _RFM_LABELS = ['1', '2', '3', '4', '5']

    def __init__(self):
        """Initialize the analytics suite with required models and transformers."""
        # The single scaler is re-fitted by each modelling method, so after a
        # call it reflects only the most recently trained model's features.
        self.scaler = StandardScaler()
        self.ltv_model = GradientBoostingRegressor(random_state=42)
        self.churn_model = RandomForestClassifier(random_state=42)

    def prepare_customer_features(self,
                                customers: pd.DataFrame,
                                transactions: pd.DataFrame) -> pd.DataFrame:
        """Create one feature row per customer from raw data.

        Args:
            customers: Customer table containing a ``customer_id`` column.
            transactions: Transaction table with ``customer_id``,
                ``transaction_id``, ``date``, ``category``, ``product_level_2``
                and ``total_amount`` columns.

        Returns:
            ``customers`` left-joined with the engagement and transaction
            features. For customers without transactions the five columns
            listed in the ``fillna`` call get defaults; the remaining
            transaction columns (e.g. ``total_spend``) stay NaN.
        """
        # Calculate customer engagement metrics
        engagement_features = self._calculate_engagement_metrics(customers, transactions)

        # Calculate transaction-based features
        transaction_features = self._calculate_transaction_features(transactions)

        # Merge all features; left joins keep customers without transactions
        features = customers.merge(
            engagement_features,
            on='customer_id',
            how='left'
        ).merge(
            transaction_features,
            on='customer_id',
            how='left'
        )

        # Fill missing values for customers with no transactions
        features = features.fillna({
            'avg_transaction_value': 0,
            'purchase_frequency': 0,
            'days_since_last_purchase': 365,  # Assume max days
            'category_diversity': 0,
            'premium_product_ratio': 0
        })

        return features

    def _calculate_engagement_metrics(self,
                                   customers: pd.DataFrame,
                                   transactions: pd.DataFrame) -> pd.DataFrame:
        """Calculate per-customer category diversity and premium product ratio.

        Args:
            customers: Unused; kept so the signature stays stable.
            transactions: Needs ``customer_id``, ``category`` and
                ``product_level_2`` columns.

        Returns:
            Frame with ``customer_id``, ``category_diversity`` (number of
            distinct non-null categories bought) and ``premium_product_ratio``
            (share of transactions whose ``product_level_2`` is one of
            ``PREMIUM_PRODUCT_LABELS``).
        """
        # Category diversity
        category_diversity = (
            transactions.groupby('customer_id')['category']
            .nunique()
            .reset_index()
            .rename(columns={'category': 'category_diversity'})
        )

        # Premium product ratio: mean of a boolean flag per customer
        is_premium = transactions['product_level_2'].isin(
            list(self.PREMIUM_PRODUCT_LABELS)
        )
        premium_products = (
            is_premium.groupby(transactions['customer_id'])
            .mean()
            .rename('premium_product_ratio')
            .reset_index()
        )

        # Merge engagement metrics
        engagement_metrics = category_diversity.merge(
            premium_products,
            on='customer_id',
            how='outer'
        )

        return engagement_metrics

    def _calculate_transaction_features(self, transactions: pd.DataFrame) -> pd.DataFrame:
        """Calculate customer-level transaction features.

        Args:
            transactions: Needs ``customer_id``, ``transaction_id``,
                ``total_amount`` and ``date`` columns; ``date`` must be
                parseable by ``pd.to_datetime``.

        Returns:
            One row per customer with ``transaction_count``,
            ``avg_transaction_value``, ``total_spend``, ``spend_std``,
            ``first_purchase_date``, ``last_purchase_date``,
            ``customer_age_days``, ``purchase_frequency`` and
            ``days_since_last_purchase``.
        """
        # Basic transaction metrics (named aggregation gives flat column names)
        transaction_metrics = transactions.groupby('customer_id').agg(
            transaction_count=('transaction_id', 'count'),
            avg_transaction_value=('total_amount', 'mean'),
            total_spend=('total_amount', 'sum'),
            spend_std=('total_amount', 'std'),
            first_purchase_date=('date', 'min'),
            last_purchase_date=('date', 'max'),
        ).reset_index()

        first_purchase = pd.to_datetime(transaction_metrics['first_purchase_date'])
        last_purchase = pd.to_datetime(transaction_metrics['last_purchase_date'])

        # Calculate days between first and last purchase
        transaction_metrics['customer_age_days'] = (last_purchase - first_purchase).dt.days

        # Calculate purchase frequency (transactions per month).
        # A zero-day span (single purchase, or all purchases on one day) makes
        # the division yield inf, which fillna alone would not catch; map it to
        # 0 like any other undefined frequency.
        transaction_metrics['purchase_frequency'] = (
            transaction_metrics['transaction_count'] /
            (transaction_metrics['customer_age_days'] / 30)
        ).replace([np.inf, -np.inf], np.nan).fillna(0)

        # Calculate days since last purchase, relative to the newest
        # transaction in the data set
        latest_date = pd.to_datetime(transactions['date']).max()
        transaction_metrics['days_since_last_purchase'] = (
            latest_date - last_purchase
        ).dt.days

        return transaction_metrics

    def predict_churn_risk(self,
                          features: pd.DataFrame,
                          churn_threshold_days: int = 90) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Train the churn classifier and score the held-out customers.

        A customer is labelled churned when ``days_since_last_purchase``
        exceeds ``churn_threshold_days``.

        Args:
            features: Output of ``prepare_customer_features``; its index
                labels are assumed to be unique.
            churn_threshold_days: Inactivity threshold that defines churn.

        Returns:
            ``(results, feature_importance)``. ``results`` holds the 20%
            test-split rows of ``features`` plus ``churn_risk`` (predicted
            probability of churn) and ``is_churned`` (the model's predicted
            label, not the observed one). ``feature_importance`` lists the
            model's importances, highest first.
        """
        # Define churn based on days since last purchase
        y = (features['days_since_last_purchase'] > churn_threshold_days).astype(int)

        # Select features for churn prediction
        churn_features = [
            'age', 'loyalty_score', 'purchase_frequency', 'avg_transaction_value',
            'category_diversity', 'premium_product_ratio'
        ]

        X = features[churn_features]

        # Split data and train model
        X_train, X_test, y_train, _ = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Train model
        self.churn_model.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = self.churn_model.predict(X_test_scaled)
        class_probabilities = self.churn_model.predict_proba(X_test_scaled)

        # Look the churn column up by class label: if the training split held
        # a single class, predict_proba has one column and [:, 1] would fail.
        known_classes = list(self.churn_model.classes_)
        if 1 in known_classes:
            y_pred_proba = class_probabilities[:, known_classes.index(1)]
        else:
            y_pred_proba = np.zeros(len(X_test))

        # Prepare results. X_test.index holds index *labels*, so select with
        # .loc; .iloc would pick the wrong rows for a non-default index.
        results = features.loc[X_test.index].copy()
        results['churn_risk'] = y_pred_proba
        results['is_churned'] = y_pred

        # Add feature importance
        feature_importance = pd.DataFrame({
            'feature': churn_features,
            'importance': self.churn_model.feature_importances_
        }).sort_values('importance', ascending=False)

        return results, feature_importance

    def calculate_customer_ltv(self,
                             features: pd.DataFrame,
                             months_ahead: int = 12) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Train the LTV regressor and project value for held-out customers.

        The regression target is historical monthly value:
        ``total_spend / (customer_age_days / 30)``. Customers for whom that
        value is undefined (no transactions, or a zero-day purchase span) are
        excluded from both training and scoring.

        Args:
            features: Output of ``prepare_customer_features``; its index
                labels are assumed to be unique.
            months_ahead: Number of months the predicted monthly value is
                projected over.

        Returns:
            ``(ltv_predictions, feature_importance)``. ``ltv_predictions``
            holds the 20% test-split rows of ``features`` plus
            ``predicted_monthly_value`` and ``predicted_ltv``
            (monthly value times ``months_ahead``). ``feature_importance``
            lists the model's importances, highest first.
        """
        # Calculate historical monthly value; a zero-day span gives inf, which
        # is treated as undefined just like the NaN of a transaction-less row
        monthly_value = (
            features['total_spend'] / (features['customer_age_days'] / 30)
        ).replace([np.inf, -np.inf], np.nan)

        # Select features for LTV prediction
        ltv_features = [
            'age', 'loyalty_score', 'purchase_frequency', 'avg_transaction_value',
            'category_diversity', 'premium_product_ratio', 'days_since_last_purchase'
        ]

        # The regressor cannot be fitted on NaN targets
        has_target = monthly_value.notna()
        X = features.loc[has_target, ltv_features]
        y = monthly_value[has_target]

        # Split data and train model
        X_train, X_test, y_train, _ = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Train model
        self.ltv_model.fit(X_train_scaled, y_train)

        # Make predictions
        monthly_value_pred = self.ltv_model.predict(X_test_scaled)

        # Calculate LTV. X_test.index holds index labels, hence .loc
        ltv_predictions = features.loc[X_test.index].copy()
        ltv_predictions['predicted_monthly_value'] = monthly_value_pred
        ltv_predictions['predicted_ltv'] = monthly_value_pred * months_ahead

        # Add feature importance
        feature_importance = pd.DataFrame({
            'feature': ltv_features,
            'importance': self.ltv_model.feature_importances_
        }).sort_values('importance', ascending=False)

        return ltv_predictions, feature_importance

    @staticmethod
    def _quintile_score(values: pd.Series) -> pd.Series:
        """Bin ``values`` into quintiles labelled '1' (lowest) to '5' (highest).

        Heavily tied data can make quantile edges coincide, which makes
        ``pd.qcut`` raise ``ValueError``. In that case the values are ranked
        first (ties broken by row order) so five bins can still be formed.
        """
        labels = CustomerAnalyticsSuite._RFM_LABELS
        try:
            return pd.qcut(values, q=5, labels=labels)
        except ValueError:
            return pd.qcut(values.rank(method='first'), q=5, labels=labels)

    def generate_customer_segments(self,
                                 features: pd.DataFrame,
                                 n_segments: int = 4) -> pd.DataFrame:
        """Generate value-based customer segments from RFM quintile scores.

        Args:
            features: Frame with ``days_since_last_purchase``,
                ``purchase_frequency`` and ``total_spend`` columns. It is not
                modified.
            n_segments: Currently unused; kept for interface compatibility.

        Returns:
            A copy of ``features`` with two added columns: ``rfm_score``
            (recency, frequency and monetary quintile labels concatenated,
            e.g. ``'545'``) and ``customer_segment`` (named segment; any score
            not in the mapping becomes ``'Regular Customers'``).
        """
        # Work on a copy so the caller's frame is left untouched
        features = features.copy()

        # Calculate RFM scores. Recency is negated so that the most recent
        # buyers land in the highest quintile.
        recency_score = self._quintile_score(-features['days_since_last_purchase'])
        frequency_score = self._quintile_score(
            features['purchase_frequency'].clip(lower=0)
        )
        monetary_score = self._quintile_score(
            features['total_spend'].clip(lower=0)
        )

        # Combine RFM scores
        features['rfm_score'] = (
            recency_score.astype(str) +
            frequency_score.astype(str) +
            monetary_score.astype(str)
        )

        # Define segment mapping
        segment_mapping = {
            '555': 'Champions',
            '554': 'Champions',
            '544': 'Loyal Customers',
            '535': 'Loyal Customers',
            '444': 'Regular Customers',
            '311': 'Lost Customers',
            '111': 'Lost Customers'
        }

        # Assign segments
        features['customer_segment'] = features['rfm_score'].map(
            lambda x: segment_mapping.get(x, 'Regular Customers')
        )

        return features

# Example usage
if __name__ == "__main__":
    # `customers` and `transactions` are the frames prepared in the cells
    # above; synthetic data from a generator could be substituted here.

    # Build the suite and derive one feature row per customer
    analytics = CustomerAnalyticsSuite()
    features = analytics.prepare_customer_features(customers, transactions)

    # Churn model: report which features drive the prediction
    churn_results, churn_importance = analytics.predict_churn_risk(features)
    print("\nChurn Risk Analysis:", churn_importance, sep="\n")

    # LTV model: report which features drive the prediction
    ltv_results, ltv_importance = analytics.calculate_customer_ltv(features)
    print("\nLTV Analysis:", ltv_importance, sep="\n")

    # RFM segmentation: report the size of each segment
    segmented_customers = analytics.generate_customer_segments(features)
    print(
        "\nCustomer Segments:",
        segmented_customers['customer_segment'].value_counts(),
        sep="\n",
    )


Churn Risk Analysis:
                 feature  importance
3  avg_transaction_value    0.348021
1          loyalty_score    0.306373
2     purchase_frequency    0.242280
0                    age    0.103179
4     category_diversity    0.000147
5  premium_product_ratio    0.000000

LTV Analysis:
                    feature  importance
2        purchase_frequency    0.834208
3     avg_transaction_value    0.165573
1             loyalty_score    0.000092
6  days_since_last_purchase    0.000070
0                       age    0.000057
4        category_diversity    0.000000
5     premium_product_ratio    0.000000

Customer Segments:
customer_segment
Regular Customers    848
Lost Customers        83
Champions             51
Loyal Customers       18
Name: count, dtype: int64


_____