# 04 - CLV Prediction Modeling

**Customer Lifetime Value Prediction**

**Team:** The Starks
- Othmane Zizi (261255341)
- Fares Joni (261254593)
- Tanmay Giri (261272443)

This notebook trains and evaluates multiple ML models for CLV prediction.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Make the project's helper modules importable: ../src (relative to this
# notebook) is resolved to an absolute path and appended to sys.path.
sys.path.append(str(Path('../src').resolve()))
from data_loader import load_customer_features
from models import get_models, train_and_evaluate, get_feature_importance, cross_validate_model

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides convergence/deprecation warnings from the
# libraries used below.
warnings.filterwarnings('ignore')

# Plot theme, and show all DataFrame columns when a frame is displayed.
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', None)

## 1. Load Feature Dataset

In [None]:
# Read the per-customer feature table (it also carries the CLV target column)
df = load_customer_features('customer_features.csv')

column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
df.head()

## 2. Prepare Features and Target

In [None]:
# Column to predict, and the predictors every model is trained on
target_col = 'CLV'
feature_cols = [
    'Recency',
    'Frequency',
    'Monetary',
    'Tenure',
    'AvgTimeBetweenPurchases',
    'NumUniqueProducts',
    'AvgBasketSize',
    'AvgOrderValue',
]

# Work on independent copies so later imputation never touches df itself
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_col].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nTarget statistics:")
print(y.describe())

In [None]:
# Total number of NaN cells across all feature columns
n_missing = X.isnull().sum().sum()
print("Missing values in features:")
print(n_missing)

# Replace whatever gaps remain, in features and target alike, with 0
X, y = X.fillna(0), y.fillna(0)

## 3. Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

n_train, n_test = len(X_train), len(X_test)
print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")

## 4. Scale Features

In [None]:
# Standardize features: statistics are learned from the training split only
# and then applied unchanged to the test split
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into DataFrames, keeping row labels and column names
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=feature_cols)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=feature_cols)

print("Scaled feature statistics (training):")
X_train_scaled.describe().loc[['mean', 'std']]

## 5. Train Multiple Models

In [None]:
# Candidate regressors: three linear baselines followed by two tree ensembles.
# Insertion order is the order in which they are trained and reported.
random_forest = RandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1
)
gradient_boosting = GradientBoostingRegressor(
    n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
)

models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0)),
    ('Lasso Regression', Lasso(alpha=1.0)),
    ('Random Forest', random_forest),
    ('Gradient Boosting', gradient_boosting),
])

In [None]:
# Fit each candidate on the scaled training data, then score it on the held-out test set
results = {}
trained_models = {}

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")

    estimator.fit(X_train_scaled, y_train)
    trained_models[model_name] = estimator

    # Floor predictions at zero before scoring: CLV can't be negative
    test_pred = np.clip(estimator.predict(X_test_scaled), 0, None)

    scores = {
        'RMSE': np.sqrt(mean_squared_error(y_test, test_pred)),
        'MAE': mean_absolute_error(y_test, test_pred),
        'R2': r2_score(y_test, test_pred),
    }
    results[model_name] = scores

    print(f"  RMSE: {scores['RMSE']:.2f}")
    print(f"  MAE: {scores['MAE']:.2f}")
    print(f"  R2: {scores['R2']:.4f}")

## 6. Model Comparison

In [None]:
# One row per model, one column per metric, ordered from lowest to highest RMSE
results_df = pd.DataFrame(results).transpose().sort_values(by='RMSE')

print("Model Performance Comparison:")
results_df

In [None]:
# One bar chart per metric, drawn side by side and saved to the reports folder
panel_specs = [('RMSE', 'coral'), ('MAE', 'steelblue'), ('R2', 'green')]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for panel, (metric, bar_color) in zip(axes, panel_specs):
    results_df[metric].plot(kind='bar', ax=panel, color=bar_color, edgecolor='black')
    panel.set_title(f'{metric} by Model')
    panel.set_xlabel('Model')
    panel.set_ylabel(metric)
    # Model names are long, so tilt the tick labels
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../reports/figures/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Best Model Analysis

In [None]:
# The winning model is the one with the smallest test-set RMSE
rmse_by_model = results_df['RMSE']
best_model_name = rmse_by_model.idxmin()
best_model = trained_models[best_model_name]

print(f"Best Model: {best_model_name}")
print("\nPerformance:")
print(results_df.loc[best_model_name])

In [None]:
# Test-set predictions of the best model, floored at zero as during evaluation
y_pred_best = np.clip(best_model.predict(X_test_scaled), 0, None)
residuals = y_test - y_pred_best

fig, (ax_scatter, ax_resid) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: predicted vs actual, with the y = x diagonal as the reference
diag_end = max(y_test.max(), y_pred_best.max())
ax_scatter.scatter(y_test, y_pred_best, alpha=0.5, s=10)
ax_scatter.plot([0, diag_end], [0, diag_end], 'r--', label='Perfect Prediction')
ax_scatter.set_xlabel('Actual CLV')
ax_scatter.set_ylabel('Predicted CLV')
ax_scatter.set_title(f'{best_model_name}: Predicted vs Actual CLV')
ax_scatter.legend()

# Right panel: distribution of residuals, with a marker at zero error
ax_resid.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
ax_resid.axvline(x=0, color='r', linestyle='--')
ax_resid.set_xlabel('Residual (Actual - Predicted)')
ax_resid.set_ylabel('Frequency')
ax_resid.set_title(f'{best_model_name}: Residual Distribution')

plt.tight_layout()
plt.savefig('../reports/figures/best_model_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Feature Importance

In [None]:
# Tree ensembles expose feature_importances_; for the linear models the absolute
# coefficient of each (scaled) feature is recorded instead
tree_names = ['Random Forest', 'Gradient Boosting']
linear_names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']

importance_dict = {
    label: trained_models[label].feature_importances_ for label in tree_names
}
importance_dict.update(
    {label: np.abs(trained_models[label].coef_) for label in linear_names}
)

# Rows are features, columns are models
importance_df = pd.DataFrame(importance_dict, index=feature_cols)
print("Feature Importance by Model:")
importance_df

In [None]:
# Visualize feature importance for the best model when it is tree-based;
# otherwise fall back to Random Forest, since the linear-model columns of
# importance_df hold absolute coefficients rather than importances.
if best_model_name in ('Random Forest', 'Gradient Boosting'):
    fi_model_name = best_model_name
else:
    fi_model_name = 'Random Forest'

fi = importance_df[fi_model_name].sort_values(ascending=True)

plt.figure(figsize=(10, 6))
fi.plot(kind='barh', color='steelblue', edgecolor='black')
# The title names the model actually plotted; it used to say "Random Forest"
# even when the Gradient Boosting column was shown.
plt.title(f'Feature Importance ({fi_model_name})')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig('../reports/figures/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Cross-Validation

In [None]:
# 5-fold cross-validation of every candidate model on the training split
from sklearn.model_selection import cross_val_score, cross_validate

# Both metrics are computed from the same fits, so each model is trained once
# per fold instead of once per fold per metric.
cv_scoring = {'r2': 'r2', 'rmse': 'neg_root_mean_squared_error'}
cv_results = {}

for name, model in models.items():
    print(f"Cross-validating {name}...")

    # NOTE(review): X_train_scaled was standardized with statistics from the
    # whole training split, so validation folds are not fully unseen by the scaler.
    fold_scores = cross_validate(model, X_train_scaled, y_train, cv=5, scoring=cv_scoring)
    r2_scores = fold_scores['test_r2']
    # The scorer reports negated RMSE (higher is better); flip the sign back
    rmse_scores = -fold_scores['test_rmse']

    cv_results[name] = {
        'R2_mean': r2_scores.mean(),
        'R2_std': r2_scores.std(),
        'RMSE_mean': rmse_scores.mean(),
        'RMSE_std': rmse_scores.std()
    }

# One row per model, one column per aggregated statistic
cv_df = pd.DataFrame(cv_results).T
print("\nCross-Validation Results (5-fold):")
cv_df

## 10. Lift Analysis (Business Metric)

In [None]:
# Lift analysis: rank test customers by predicted CLV and measure how much of the
# actual CLV is held by the top-ranked slice
test_results = pd.DataFrame({
    'Actual_CLV': y_test,
    'Predicted_CLV': y_pred_best
}).sort_values('Predicted_CLV', ascending=False)

n_customers = len(test_results)
total_actual_clv = test_results['Actual_CLV'].sum()

# Running share of actual CLV and of customers, walking down the ranking
test_results['Cumulative_Actual'] = test_results['Actual_CLV'].cumsum()
test_results['Cumulative_Pct'] = test_results['Cumulative_Actual'] / total_actual_clv
test_results['Customer_Pct'] = np.arange(1, n_customers + 1) / n_customers

# Cumulative gains curve against the diagonal expected from random targeting
customer_share = test_results['Customer_Pct'] * 100
clv_share = test_results['Cumulative_Pct'] * 100

plt.figure(figsize=(10, 6))
plt.plot(customer_share, clv_share, label='Model', linewidth=2)
plt.plot([0, 100], [0, 100], 'r--', label='Random', linewidth=2)
plt.xlabel('% of Customers (Sorted by Predicted CLV)')
plt.ylabel('% of Total Actual CLV Captured')
plt.title('Lift Chart: Model vs Random Selection')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../reports/figures/lift_chart.png', dpi=150, bbox_inches='tight')
plt.show()

# Lift = share of CLV captured divided by share of customers targeted
for pct in (10, 20, 30, 50):
    top_n = int(n_customers * pct / 100)
    captured = test_results['Actual_CLV'].iloc[:top_n].sum() / total_actual_clv
    lift = captured / (pct / 100)
    print(f"Top {pct}% customers capture {captured*100:.1f}% of CLV (Lift: {lift:.2f}x)")

## 11. Save Model and Results

In [None]:
# Write the metric, cross-validation and importance tables to the reports folder
report_tables = [
    (results_df, '../reports/model_results.csv'),
    (cv_df, '../reports/cv_results.csv'),
    (importance_df, '../reports/feature_importance.csv'),
]
for table, csv_path in report_tables:
    table.to_csv(csv_path)

print("Results saved to reports/")

# Score every customer (training and test rows alike) with the best model so the
# segmentation step has a prediction for each one
X_all_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=feature_cols)
all_pred = np.clip(best_model.predict(X_all_scaled), 0, None)

predictions_df = df[['Customer ID']].copy()
predictions_df['Predicted_CLV'] = all_pred
predictions_df['Actual_CLV'] = y

predictions_df.to_csv('../data/processed/clv_predictions.csv', index=False)
print(f"\nPredictions saved: {len(predictions_df)} customers")

## 12. Modeling Summary

### Models Trained:
1. Linear Regression (baseline)
2. Ridge Regression (L2 regularization)
3. Lasso Regression (L1 regularization)
4. Random Forest
5. Gradient Boosting

### Key Findings:
- Tree-based models (Random Forest, Gradient Boosting) generally outperform linear models
- Most important features: Monetary, Frequency, AvgOrderValue
- The top 20% of customers ranked by predicted CLV capture ~X% of total actual CLV (replace X with the figure printed by the lift analysis in Section 10)

### Business Impact:
- Using the model to target top 20% customers provides significant lift over random selection
- Enables prioritization of marketing resources on high-value customers

### Next Steps:
- Use predictions for customer segmentation
- Develop targeted retention strategies for each segment