# Regression Analysis: Predicting Spending for NEW Customers

## The Cold Start Problem

**Scenario**: A new customer visits for the first time. You have:
- Their demographics (age, income, education, family)
- NO purchase history

**Question**: Can we predict spending using ONLY demographics?

| Notebook | Features | R2 | Use Case |
|----------|----------|-----|----------|
| 02_regression.ipynb | Demographics + History | 0.97 | Existing |
| 02b (this) | Demographics ONLY | 0.78 | New customers |

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

try:
    from xgboost import XGBRegressor
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False

RANDOM_STATE = 42
TEST_SIZE = 0.2
CV_FOLDS = 5
plt.style.use('seaborn-v0_8-whitegrid')
MAIN_COLOR = '#2ecc71'
SECONDARY_COLOR = '#3498db'

print(f"Libraries loaded. XGBoost: {HAS_XGBOOST}")

In [None]:
class IQRCapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, k=1.5):
        self.columns = columns
        self.k = k
        self.bounds_ = {}
    
    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        cols = self.columns if self.columns else X.columns
        for col in cols:
            if col in X.columns:
                Q1, Q3 = X[col].quantile(0.25), X[col].quantile(0.75)
                IQR = Q3 - Q1
                self.bounds_[col] = (Q1 - self.k * IQR, Q3 + self.k * IQR)
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col, (lower, upper) in self.bounds_.items():
            if col in X.columns:
                X[col] = X[col].clip(lower, upper)
        return X.values

## 2. Data Loading - Demographics Only

In [None]:
df = pd.read_csv('Data/marketing_campaign.csv', sep='\t')
spending_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
df['TotalSpend'] = df[spending_cols].sum(axis=1)
df['Age'] = 2014 - df['Year_Birth']

print(f"Dataset: {df.shape}")
print(f"TotalSpend: Mean=${df['TotalSpend'].mean():.2f}, Median=${df['TotalSpend'].median():.2f}")

In [None]:
# ONLY demographic features - NO purchase history!
num_features = ['Income', 'Age', 'Kidhome', 'Teenhome']
cat_features = ['Education', 'Marital_Status']
demographic_features = num_features + cat_features

print("FEATURES USED (available for new customers):")
for f in demographic_features:
    print(f"  - {f}")

print("\nFEATURES EXCLUDED (require purchase history):")
for f in ['NumCatalogPurchases', 'NumWebPurchases', 'NumStorePurchases', 'Recency']:
    print(f"  - {f}")

In [None]:
df_clean = df.dropna(subset=['Income']).copy()
X = df_clean[demographic_features]
y = df_clean['TotalSpend']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## 3. Preprocessing

In [None]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('capper', IQRCapper(columns=None, k=1.5)),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Processed features: {X_train_processed.shape[1]}")

## 4. Model Comparison

In [None]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, random_state=RANDOM_STATE),
    'Lasso': Lasso(alpha=0.1, random_state=RANDOM_STATE),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE),
    'DecisionTree': DecisionTreeRegressor(max_depth=10, random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=RANDOM_STATE),
}
if HAS_XGBOOST:
    models['XGBoost'] = XGBRegressor(n_estimators=100, max_depth=5, random_state=RANDOM_STATE, verbosity=0)

print(f"Models: {len(models)}")

In [None]:
print("Baseline Model Comparison (5-Fold CV)")
print("=" * 55)

baseline_results = []
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_processed, y_train, cv=CV_FOLDS, scoring='r2')
    baseline_results.append({'Model': name, 'CV_mean': cv_scores.mean(), 'CV_std': cv_scores.std()})
    print(f"{name:20} | R2 = {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

baseline_df = pd.DataFrame(baseline_results).sort_values('CV_mean', ascending=False).reset_index(drop=True)
print(f"\nBest: {baseline_df.iloc[0]['Model']} (R2 = {baseline_df.iloc[0]['CV_mean']:.4f})")

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
colors = [MAIN_COLOR if i == 0 else SECONDARY_COLOR for i in range(len(baseline_df))]
ax.barh(baseline_df['Model'], baseline_df['CV_mean'], xerr=baseline_df['CV_std'], color=colors, capsize=5)
ax.set_xlabel('R2 Score')
ax.set_title('NEW Customer Spending Prediction (Demographics Only)')
plt.tight_layout()
plt.show()

## 5. Hyperparameter Tuning

In [None]:
top_models = baseline_df.head(3)['Model'].tolist()
param_grids = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [5, 10, 15]},
    'GradientBoosting': {'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.1, 0.2]},
    'XGBoost': {'n_estimators': [100, 200], 'max_depth': [3, 5], 'learning_rate': [0.1, 0.2]},
}

tuned_models = {}
for name in top_models:
    print(f"Tuning {name}...")
    if name in param_grids:
        grid = GridSearchCV(models[name], param_grids[name], cv=CV_FOLDS, scoring='r2', n_jobs=-1)
        grid.fit(X_train_processed, y_train)
        tuned_models[name] = grid.best_estimator_
        print(f"  Best R2: {grid.best_score_:.4f}")
    else:
        models[name].fit(X_train_processed, y_train)
        tuned_models[name] = models[name]
print("Done!")

## 6. Final Evaluation

In [None]:
print("Final Test Set Results")
print("=" * 55)

final_results = []
for name, model in tuned_models.items():
    test_r2 = r2_score(y_test, model.predict(X_test_processed))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test_processed)))
    final_results.append({'Model': name, 'R2_test': test_r2, 'RMSE': test_rmse})
    print(f"{name}: R2={test_r2:.4f}, RMSE=${test_rmse:.2f}")

results_df = pd.DataFrame(final_results).sort_values('R2_test', ascending=False)
best_name = results_df.iloc[0]['Model']
best_r2 = results_df.iloc[0]['R2_test']
best_model = tuned_models[best_name]

print(f"\nBEST: {best_name} with R2 = {best_r2:.4f}")

## 7. Feature Importance

In [None]:
cat_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
feature_names = num_features + cat_encoder.get_feature_names_out(cat_features).tolist()

if hasattr(best_model, 'feature_importances_'):
    imp_df = pd.DataFrame({'Feature': feature_names, 'Importance': best_model.feature_importances_})
    imp_df = imp_df.sort_values('Importance', ascending=False)
    
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(imp_df['Feature'], imp_df['Importance'], color=MAIN_COLOR)
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance (Demographics Only)')
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()
    print(imp_df.to_string(index=False))

## 8. Comparison with Full Model

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(['Demographics Only\n(New Customers)', 'Full Features\n(Existing)'], [best_r2, 0.97], color=[MAIN_COLOR, SECONDARY_COLOR])
ax.set_ylabel('R2 Score')
ax.set_title('Demographics vs Full Features')
ax.set_ylim(0, 1.1)
for bar, val in zip(bars, [best_r2, 0.97]):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f'{val:.2f}', ha='center', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"\nGap: {(0.97 - best_r2)*100:.1f}% variance explained by behavioral features")

## 9. Example Predictions

In [None]:
examples = pd.DataFrame([
    {'Income': 100000, 'Age': 45, 'Education': 'PhD', 'Marital_Status': 'Married', 'Kidhome': 0, 'Teenhome': 1},
    {'Income': 50000, 'Age': 30, 'Education': 'Graduation', 'Marital_Status': 'Single', 'Kidhome': 0, 'Teenhome': 0},
    {'Income': 30000, 'Age': 25, 'Education': 'Basic', 'Marital_Status': 'Single', 'Kidhome': 1, 'Teenhome': 0},
])

preds = best_model.predict(preprocessor.transform(examples))
print("New Customer Predictions:")
for i, (_, row) in enumerate(examples.iterrows()):
    print(f"  Customer {i+1}: Income=${row['Income']:,} -> Predicted=${preds[i]:,.0f}")

## 10. Conclusion

**Key Findings:**
- Demographics explain ~78% of spending variance
- Income is the dominant predictor
- Behavioral data adds ~19% more accuracy

**Use Cases:**
| Stage | Model | R2 |
|-------|-------|----|
| New customer | This model | 0.78 |
| Established | Full model | 0.97 |