# Hyperparameter Optimization for Upsell Prediction
## AI Customer Upsell Prediction System

This notebook implements:
- Bayesian Optimization with Optuna
- Grid Search with Cross-Validation
- Automated hyperparameter tuning
- Performance comparison
- Best model selection and saving
- Advanced ensemble optimization

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# ML libraries
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import (
    train_test_split, cross_val_score, StratifiedKFold,
    GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler

# Optimization libraries
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

import joblib
import json
import time
import warnings
# Silence every Python warning so library chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-flavoured matplotlib style, viridis palette and a
# larger default figure size.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette='viridis')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Optuna only logs messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Refined Hyperparameter Optimization for Smart Customer Segmentation
# Focus: Multi-class optimization for 6-category segmentation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Optimization
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

import joblib
import json
import time
import warnings
warnings.filterwarnings('ignore')

# Notebook banner: each headline is followed by a 65-character rule.
rule = "=" * 65
for headline in (
    "🎯 Smart Customer Segmentation - Hyperparameter Optimization",
    "Focus: Multi-class optimization for 6-category customer intelligence",
):
    print(headline)
    print(rule)


🎯 Smart Customer Segmentation - Hyperparameter Optimization
Focus: Multi-class optimization for 6-category customer intelligence


In [3]:
# ===============================================================================
# LOAD DATA FOR SMART SEGMENTATION OPTIMIZATION
# ===============================================================================

print("📊 Loading Data for Smart Segmentation Optimization...")

# Load processed data with smart segmentation
df = pd.read_csv('../data/processed/telecom_processed.csv')
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
feature_columns = joblib.load('../models/feature_columns.pkl')

# Display names indexed by encoded label (index i names class i).
# NOTE(review): presumably the alphabetical order produced by the label
# encoder upstream - confirm against the preprocessing notebook.
segment_names = ['DO_NOT_DISTURB', 'FIX_FIRST_THEN_UPSELL', 'GENTLE_UPSELL',
                'MINIMAL_CONTACT', 'PRIORITY_UPSELL_RETENTION', 'STANDARD_UPSELL']

# Fail fast when the processed file and the saved feature list have drifted
# apart, instead of surfacing as an opaque KeyError below.
missing_features = [col for col in feature_columns if col not in df.columns]
assert not missing_features, f"Features missing from processed data: {missing_features}"

# Prepare features and target for MULTI-CLASS segmentation.
# Missing and infinite feature values are both replaced with 0.
X = df[feature_columns].fillna(0).replace([np.inf, -np.inf], 0)
y = df['Upsell_Priority_Encoded']  # 6-class smart segmentation target

# The cells below hard-code six classes and index segment_names by label, so
# the encoded target must be exactly 0..len(segment_names)-1 with no gaps.
assert y.notna().all(), "Target column 'Upsell_Priority_Encoded' contains missing values"
observed_labels = sorted(np.unique(y))
expected_labels = list(range(len(segment_names)))
assert observed_labels == expected_labels, (
    f"Expected encoded labels {expected_labels}, found {observed_labels}"
)

# Split data: 80% train / 20% held-out test, stratified by segment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optimization validation split: a further 80/20 split of the training data,
# used only to score hyperparameter trials
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# The three splits must partition the data exactly.
assert len(X_train_opt) + len(X_val_opt) + len(X_test) == len(X)

print(f"✅ Data loaded for smart segmentation optimization")
print(f"   Training set: {X_train_opt.shape}")
print(f"   Validation set: {X_val_opt.shape}")
print(f"   Test set: {X_test.shape}")
print(f"   Smart segments: {len(np.unique(y))} categories")
print(f"   Features: {len(feature_columns)}")

# Display segment distribution over the full training split
# (optimization-train + optimization-validation rows)
print(f"\n📊 Smart Segment Distribution:")
for i, name in enumerate(segment_names):
    count = (y_train == i).sum()
    pct = (count / len(y_train)) * 100
    print(f"   {name}: {count:,} ({pct:.1f}%)")


📊 Loading Data for Smart Segmentation Optimization...
✅ Data loaded for smart segmentation optimization
   Training set: (38684, 41)
   Validation set: (9672, 41)
   Test set: (12089, 41)
   Smart segments: 6 categories
   Features: 41

📊 Smart Segment Distribution:
   DO_NOT_DISTURB: 6,590 (13.6%)
   FIX_FIRST_THEN_UPSELL: 11,398 (23.6%)
   GENTLE_UPSELL: 2,899 (6.0%)
   MINIMAL_CONTACT: 727 (1.5%)
   PRIORITY_UPSELL_RETENTION: 5,341 (11.0%)
   STANDARD_UPSELL: 21,401 (44.3%)


In [4]:
# ===============================================================================
# XGBOOST MULTI-CLASS OPTIMIZATION
# ===============================================================================

print("\n🚀 XGBoost Multi-Class Optimization for Smart Segmentation")
print("=" * 65)

def objective_xgboost_multiclass(trial):
    """Optuna objective: validation accuracy of one XGBoost configuration.

    Samples a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on
    the optimization training split (``X_train_opt`` / ``y_train_opt``) and
    scores it on the optimization validation split (``X_val_opt`` /
    ``y_val_opt``), all of which are read from the notebook's globals.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    # Search space for the 6-class problem. The suggestion order is kept
    # unchanged from the original cell.
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 0, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.5),
    }

    # Settings that stay constant across trials.
    fixed = {
        'objective': 'multi:softprob',  # multi-class with class probabilities
        'num_class': 6,                 # six smart segments
        'eval_metric': 'mlogloss',
        'tree_method': 'hist',
        'random_state': 42,
    }

    classifier = xgb.XGBClassifier(**fixed, **tuned)
    classifier.fit(X_train_opt, y_train_opt)
    val_predictions = classifier.predict(X_val_opt)

    return accuracy_score(y_val_opt, val_predictions)

# Optimize XGBoost for smart segmentation
# Search budget: at most this many trials, and no new trial is started once
# the wall-clock timeout has elapsed - whichever limit is hit first.
xgb_n_trials = 30
xgb_timeout_s = 300

# NOTE(review): objective_xgboost_multiclass never calls trial.report() or
# trial.should_prune(), so the MedianPruner has nothing to act on and no
# trial is ever pruned. It is kept only so the study configuration is unchanged.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=5)
)

print("🔄 Optimizing XGBoost for smart segmentation...")
start_time = time.time()

study_xgb.optimize(objective_xgboost_multiclass, n_trials=xgb_n_trials, timeout=xgb_timeout_s)

optimization_time = time.time() - start_time
print(f"✅ XGBoost optimization completed in {optimization_time:.1f} seconds")

# The timeout makes the number of trials machine-dependent, so the search is
# only reproducible if the full budget ran. Report it explicitly.
xgb_trials_run = len(study_xgb.trials)
xgb_budget_note = " (stopped early by timeout)" if xgb_trials_run < xgb_n_trials else ""
print(f"   Trials run: {xgb_trials_run}/{xgb_n_trials}{xgb_budget_note}")

print(f"   Best Accuracy: {study_xgb.best_value:.4f}")
print(f"   Best parameters: {study_xgb.best_params}")



🚀 XGBoost Multi-Class Optimization for Smart Segmentation
🔄 Optimizing XGBoost for smart segmentation...
✅ XGBoost optimization completed in 198.3 seconds
   Best Accuracy: 0.9369
   Best parameters: {'n_estimators': 799, 'max_depth': 4, 'learning_rate': 0.27224067980763167, 'subsample': 0.9913645450756944, 'colsample_bytree': 0.8615495665528896, 'min_child_weight': 5, 'gamma': 0.059981933766999077, 'reg_alpha': 0.2317959276605608, 'reg_lambda': 0.24170948532471842}


In [5]:
# ===============================================================================
# LIGHTGBM MULTI-CLASS OPTIMIZATION
# ===============================================================================

print("\n🚀 LightGBM Multi-Class Optimization for Smart Segmentation")
print("=" * 65)

def objective_lightgbm_multiclass(trial):
    """Optuna objective: validation accuracy of one LightGBM configuration.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMClassifier`` on
    the optimization training split (``X_train_opt`` / ``y_train_opt``) and
    scores it on the optimization validation split (``X_val_opt`` /
    ``y_val_opt``), all of which are read from the notebook's globals.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    # Search space for the 6-class problem. The suggestion order is kept
    # unchanged from the original cell.
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.5),
    }

    # Settings that stay constant across trials.
    fixed = {
        'objective': 'multiclass',
        'num_class': 6,
        'metric': 'multi_logloss',
        'device': 'cpu',
        'random_state': 42,
        'verbose': -1,
    }

    classifier = lgb.LGBMClassifier(**fixed, **tuned)
    classifier.fit(X_train_opt, y_train_opt)
    val_predictions = classifier.predict(X_val_opt)

    return accuracy_score(y_val_opt, val_predictions)

# Optimize LightGBM for smart segmentation
# Search budget: at most this many trials, and no new trial is started once
# the wall-clock timeout has elapsed - whichever limit is hit first.
lgb_n_trials = 30
lgb_timeout_s = 300

# NOTE(review): objective_lightgbm_multiclass never calls trial.report() or
# trial.should_prune(), so the MedianPruner has nothing to act on and no
# trial is ever pruned. It is kept only so the study configuration is unchanged.
study_lgb = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=5)
)

print("🔄 Optimizing LightGBM for smart segmentation...")
start_time = time.time()

study_lgb.optimize(objective_lightgbm_multiclass, n_trials=lgb_n_trials, timeout=lgb_timeout_s)

optimization_time = time.time() - start_time
print(f"✅ LightGBM optimization completed in {optimization_time:.1f} seconds")

# The timeout makes the number of trials machine-dependent, so the search is
# only reproducible if the full budget ran. Report it explicitly.
lgb_trials_run = len(study_lgb.trials)
lgb_budget_note = " (stopped early by timeout)" if lgb_trials_run < lgb_n_trials else ""
print(f"   Trials run: {lgb_trials_run}/{lgb_n_trials}{lgb_budget_note}")

print(f"   Best Accuracy: {study_lgb.best_value:.4f}")
print(f"   Best parameters: {study_lgb.best_params}")



🚀 LightGBM Multi-Class Optimization for Smart Segmentation
🔄 Optimizing LightGBM for smart segmentation...
✅ LightGBM optimization completed in 315.9 seconds
   Best Accuracy: 0.9347
   Best parameters: {'n_estimators': 680, 'num_leaves': 74, 'learning_rate': 0.09760597956585301, 'feature_fraction': 0.9196076970917365, 'bagging_fraction': 0.8921283575429872, 'bagging_freq': 6, 'min_child_samples': 83, 'reg_alpha': 0.18840406220390196, 'reg_lambda': 0.2094572579964176}


In [6]:
# ===============================================================================
# TRAIN AND EVALUATE OPTIMIZED MODELS
# ===============================================================================

print("\n🚀 Training Optimized Models for Smart Segmentation")
print("=" * 60)

# XGBoost: best tuned values from the study plus the fixed settings used in
# the objective, refit on the full training split.
print("🔄 Training optimized XGBoost...")
xgb_params = {
    **study_xgb.best_params,
    'objective': 'multi:softprob',
    'num_class': 6,
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'random_state': 42,
}
xgb_optimized = xgb.XGBClassifier(**xgb_params)
xgb_optimized.fit(X_train, y_train)

# LightGBM: same recipe, with its own study and fixed settings.
print("🔄 Training optimized LightGBM...")
lgb_params = {
    **study_lgb.best_params,
    'objective': 'multiclass',
    'num_class': 6,
    'metric': 'multi_logloss',
    'device': 'cpu',
    'random_state': 42,
    'verbose': -1,
}
lgb_optimized = lgb.LGBMClassifier(**lgb_params)
lgb_optimized.fit(X_train, y_train)

print("✅ Optimized models trained successfully!")



🚀 Training Optimized Models for Smart Segmentation
🔄 Training optimized XGBoost...
🔄 Training optimized LightGBM...
✅ Optimized models trained successfully!


In [7]:
# ===============================================================================
# EVALUATE AND COMPARE OPTIMIZED MODELS
# ===============================================================================

print("\n📊 Evaluating Optimized Smart Segmentation Models")
print("=" * 60)

baseline_label = 'XGBoost Baseline'

# Load baseline model for comparison. This is best-effort: any load failure
# just drops the baseline from the comparison. `Exception` is used instead of
# a bare `except` so KeyboardInterrupt / SystemExit still propagate, and the
# real reason is shown rather than always claiming "not found".
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
try:
    baseline_model = joblib.load('../models/best_model_xgboost.pkl')
    print("✅ Loaded baseline XGBoost model for comparison")
except Exception as exc:
    baseline_model = None
    print(f"⚠️  Baseline model not loaded ({type(exc).__name__}: {exc}) "
          f"- using current optimized as reference")

models = {
    'XGBoost Optimized': xgb_optimized,
    'LightGBM Optimized': lgb_optimized
}

# Explicit None check: do not rely on the truthiness of an estimator object.
if baseline_model is not None:
    models[baseline_label] = baseline_model

results = {}

print("\n🔄 Evaluating models on test set...")
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Multi-class AUC (one-vs-rest, weighted by class support). If it cannot
    # be computed, record NaN rather than 0.0 so the failure cannot be
    # mistaken for a real score.
    try:
        auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
    except Exception as exc:
        auc = np.nan
        print(f"⚠️  Could not compute AUC for {name}: {type(exc).__name__}: {exc}")

    results[name] = {
        'Accuracy': accuracy,
        'AUC': auc
    }

    print(f"\n📈 {name}:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Weighted AUC: {auc:.4f}")

# Create comparison (all evaluated models, best accuracy first)
results_df = pd.DataFrame(results).T.sort_values('Accuracy', ascending=False)
print(f"\n🏆 Smart Segmentation Model Comparison:")
print(results_df.round(4))

# The "best optimized model" must come from the tuned models only. Otherwise
# a stronger baseline would be reported as the optimized winner and any later
# "optimized vs baseline" improvement would collapse to zero.
# NOTE(review): the winner is chosen on the test set, so its reported accuracy
# is an optimistic estimate of generalization.
optimized_results_df = results_df.drop(index=baseline_label, errors='ignore')
best_model_name = optimized_results_df.index[0]
best_accuracy = optimized_results_df.iloc[0]['Accuracy']
print(f"\n🥇 Best Optimized Model: {best_model_name} (Accuracy: {best_accuracy:.4f})")



📊 Evaluating Optimized Smart Segmentation Models
✅ Loaded baseline XGBoost model for comparison

🔄 Evaluating models on test set...

📈 XGBoost Optimized:
   Accuracy: 0.9446
   Weighted AUC: 0.9929

📈 LightGBM Optimized:
   Accuracy: 0.9421
   Weighted AUC: 0.9911

📈 XGBoost Baseline:
   Accuracy: 0.9432
   Weighted AUC: 0.9925

🏆 Smart Segmentation Model Comparison:
                    Accuracy     AUC
XGBoost Optimized     0.9446  0.9929
XGBoost Baseline      0.9432  0.9925
LightGBM Optimized    0.9421  0.9911

🥇 Best Optimized Model: XGBoost Optimized (Accuracy: 0.9446)


In [8]:
# ===============================================================================
# BUSINESS IMPACT ANALYSIS WITH OPTIMIZED MODEL
# ===============================================================================

print("\n💼 Business Impact Analysis - Optimized vs Baseline")
print("=" * 60)

# Use best optimized model
best_model = models[best_model_name]
y_pred_optimized = best_model.predict(X_test)

# Assumed campaign economics per predicted segment (keyed by encoded label):
#   cost_mult    - intervention cost as a fraction of the customer's charges
#   success_rate - fraction of contacted customers assumed to convert
#   value_mult   - revenue multiplier for a converted customer (1.0 = no uplift)
# NOTE(review): these are hand-set assumptions, not values derived from data.
segment_strategies = {
    0: {'name': 'DO_NOT_DISTURB', 'cost_mult': 0.02, 'success_rate': 0.05, 'value_mult': 1.0},
    1: {'name': 'FIX_FIRST_THEN_UPSELL', 'cost_mult': 0.15, 'success_rate': 0.35, 'value_mult': 1.5},
    2: {'name': 'GENTLE_UPSELL', 'cost_mult': 0.08, 'success_rate': 0.20, 'value_mult': 1.2},
    3: {'name': 'MINIMAL_CONTACT', 'cost_mult': 0.03, 'success_rate': 0.10, 'value_mult': 1.0},
    4: {'name': 'PRIORITY_UPSELL_RETENTION', 'cost_mult': 0.25, 'success_rate': 0.45, 'value_mult': 2.0},
    5: {'name': 'STANDARD_UPSELL', 'cost_mult': 0.10, 'success_rate': 0.25, 'value_mult': 1.3}
}

# Calculate business impact on the test rows, grouped by PREDICTED segment.
test_df = X_test.copy()
test_df['Predicted_Segment'] = y_pred_optimized
# Pull the charges from the source frame by index so the raw values are used.
test_df['Total_Charges'] = df.loc[X_test.index, 'Total_Charges'].values

total_revenue_impact = 0
total_intervention_cost = 0

print("📊 Optimized Model Business Impact by Segment:")
for segment_id, strategy in segment_strategies.items():
    segment_data = test_df[test_df['Predicted_Segment'] == segment_id]
    count = len(segment_data)

    # Nothing predicted for this segment: skip it (also avoids a NaN mean).
    if count == 0:
        continue

    avg_revenue = segment_data['Total_Charges'].mean()
    customers_impacted = count * strategy['success_rate']
    intervention_cost = count * avg_revenue * strategy['cost_mult']
    # Uplift is annualised with a factor of 12.
    # NOTE(review): this assumes Total_Charges is a monthly amount - TODO confirm.
    revenue_impact = customers_impacted * avg_revenue * (strategy['value_mult'] - 1.0) * 12
    net_benefit = revenue_impact - intervention_cost
    # count and avg_revenue cancel out here, so the per-segment ROI equals
    # success_rate * (value_mult - 1) * 12 / cost_mult - 1: it is fixed by the
    # constants above and does not depend on the data or the model.
    roi = (net_benefit / intervention_cost * 100) if intervention_cost > 0 else 0

    total_revenue_impact += revenue_impact
    total_intervention_cost += intervention_cost

    print(f"\n  {strategy['name']}:")
    print(f"    Customers: {count:,}")
    print(f"    Revenue Impact: USD {revenue_impact:,.2f}")
    print(f"    Net Benefit: USD {net_benefit:,.2f}")
    print(f"    ROI: {roi:.1f}%")

# Overall impact
overall_net_benefit = total_revenue_impact - total_intervention_cost
overall_roi = (overall_net_benefit / total_intervention_cost * 100) if total_intervention_cost > 0 else 0

print(f"\n💰 Overall Optimized Model Impact:")
print(f"   Total Revenue Impact: USD {total_revenue_impact:,.2f}")
print(f"   Total Intervention Cost: USD {total_intervention_cost:,.2f}")
print(f"   Net Benefit: USD {overall_net_benefit:,.2f}")
print(f"   ROI: {overall_roi:.1f}%")

# Compare with baseline (if available).
# Explicit None check: do not rely on the truthiness of an estimator object.
if baseline_model is not None and 'XGBoost Baseline' in results:
    baseline_accuracy = results['XGBoost Baseline']['Accuracy']
    improvement = ((best_accuracy - baseline_accuracy) / baseline_accuracy) * 100
    print(f"\n📈 Optimization Impact:")
    print(f"   Baseline Accuracy: {baseline_accuracy:.4f}")
    print(f"   Optimized Accuracy: {best_accuracy:.4f}")
    # The '+' format flag prints the real sign, so a regression shows as
    # "-0.15%" instead of the "+-0.15%" a hard-coded plus would give.
    print(f"   Improvement: {improvement:+.2f}%")



💼 Business Impact Analysis - Optimized vs Baseline
📊 Optimized Model Business Impact by Segment:

  DO_NOT_DISTURB:
    Customers: 1,684
    Revenue Impact: USD 0.00
    Net Benefit: USD -4,489.16
    ROI: -100.0%

  FIX_FIRST_THEN_UPSELL:
    Customers: 2,964
    Revenue Impact: USD 787,864.71
    Net Benefit: USD 731,588.66
    ROI: 1300.0%

  GENTLE_UPSELL:
    Customers: 731
    Revenue Impact: USD 37,991.66
    Net Benefit: USD 31,659.72
    ROI: 500.0%

  MINIMAL_CONTACT:
    Customers: 183
    Revenue Impact: USD 0.00
    Net Benefit: USD -511.86
    ROI: -100.0%

  PRIORITY_UPSELL_RETENTION:
    Customers: 1,116
    Revenue Impact: USD 794,618.64
    Net Benefit: USD 757,830.74
    ROI: 2060.0%

  STANDARD_UPSELL:
    Customers: 5,411
    Revenue Impact: USD 540,744.46
    Net Benefit: USD 480,661.75
    ROI: 800.0%

💰 Overall Optimized Model Impact:
   Total Revenue Impact: USD 2,161,219.48
   Total Intervention Cost: USD 164,479.63
   Net Benefit: USD 1,996,739.85
   ROI: 1214.0%

In [9]:
# ===============================================================================
# SAVE OPTIMIZED MODELS AND RESULTS
# ===============================================================================

print("\n💾 Saving Optimized Smart Segmentation Models")
print("=" * 55)

import os

# Make sure both output folders exist before anything is written.
for output_dir in ('../models/optimized', '../outputs/optimization'):
    os.makedirs(output_dir, exist_ok=True)

# Persist both tuned classifiers.
for fitted_model, model_path in (
    (xgb_optimized, '../models/optimized/xgboost_smart_segmentation.pkl'),
    (lgb_optimized, '../models/optimized/lightgbm_smart_segmentation.pkl'),
):
    joblib.dump(fitted_model, model_path)
print("✅ Saved optimized models")

# Assemble the JSON summary: winning model, tuned parameters of both studies,
# the business-impact totals and the full comparison table.
business_impact_summary = {
    'total_revenue_impact': float(total_revenue_impact),
    'total_intervention_cost': float(total_intervention_cost),
    'net_benefit': float(overall_net_benefit),
    'roi': float(overall_roi),
}
optimization_summary = {
    'optimization_focus': 'Smart Customer Segmentation (6-class)',
    'best_model': best_model_name,
    'best_accuracy': float(best_accuracy),
    'xgboost_params': study_xgb.best_params,
    'lightgbm_params': study_lgb.best_params,
    'business_impact': business_impact_summary,
    'model_comparison': results_df.to_dict(),
}

with open('../outputs/optimization/smart_segmentation_optimization.json', 'w') as f:
    json.dump(optimization_summary, f, indent=2)

print("✅ Saved optimization results")

# Final summary
print("\n🎉 SMART SEGMENTATION HYPERPARAMETER OPTIMIZATION COMPLETE!")
print("=" * 70)
print(f"🏆 Best Model: {best_model_name}")
print(f"📊 Accuracy: {best_accuracy:.4f}")
print(f"💰 ROI: {overall_roi:.1f}%")
print("🎯 Status: Ready for enhanced production deployment")



💾 Saving Optimized Smart Segmentation Models
✅ Saved optimized models
✅ Saved optimization results

🎉 SMART SEGMENTATION HYPERPARAMETER OPTIMIZATION COMPLETE!
🏆 Best Model: XGBoost Optimized
📊 Accuracy: 0.9446
💰 ROI: 1214.0%
🎯 Status: Ready for enhanced production deployment
