# XGBoost Local Training - Customer Churn

This notebook demonstrates XGBoost training locally without SageMaker, using a synthetic dataset that mirrors the structure of the customer churn dataset from the SageMaker examples.

In [None]:
import json
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Plot appearance: matplotlib's "default" style sheet first, then seaborn's
# "husl" palette as the color cycle
plt.style.use("default")
sns.set_palette(palette="husl")

## Load and Prepare Data

We'll use the same customer churn dataset preprocessing as the SageMaker examples.

In [None]:
# A four-row excerpt that mimics the layout of churn.txt. It is only used to
# show the column structure; a real run would read the file from S3 or disk.
sample_data = """State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
KS,128,415,382-4657,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False.
OH,107,415,371-7191,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False.
NJ,137,415,358-1921,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False.
OH,84,408,375-9999,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False."""

# Parse the excerpt from memory.
# With the real file this becomes: df = pd.read_csv('churn.txt')
sample_buffer = StringIO(sample_data)
df_sample = pd.read_csv(sample_buffer)
column_names = df_sample.columns.tolist()
target_values = df_sample['Churn?'].unique()

print("Loading customer churn dataset...")
print("Dataset structure (first 4 rows):")
print(df_sample.head())
print(f"\nColumns: {column_names}")
print(f"Target column 'Churn?' values: {target_values}")

# The following cell builds a larger synthetic dataset with this structure
print("\n[Note: In practice, load the full churn.txt dataset here]")
print("[For demo purposes, we'll create synthetic data with the same structure]")

In [None]:
# Create synthetic dataset with churn structure for demo
np.random.seed(42)
n_samples = 3000

# Generate synthetic numeric features.
# Normal draws are unbounded, so they are floored afterwards: an account
# length below 1 or a negative number of minutes is not a valid value.
# Flooring happens after each draw, so the random stream (and therefore every
# other column) is unaffected.
data = {
    'Account Length': np.maximum(np.random.normal(100, 40, n_samples).astype(int), 1),
    'VMail Message': np.random.poisson(8, n_samples),
    'Day Mins': np.maximum(np.random.normal(180, 50, n_samples), 0.0),
    'Day Calls': np.random.poisson(100, n_samples),
    'Eve Mins': np.maximum(np.random.normal(200, 50, n_samples), 0.0),
    'Eve Calls': np.random.poisson(100, n_samples),
    'Night Mins': np.maximum(np.random.normal(200, 50, n_samples), 0.0),
    'Night Calls': np.random.poisson(100, n_samples),
    'Intl Mins': np.maximum(np.random.normal(10, 3, n_samples), 0.0),
    'Intl Calls': np.random.poisson(4, n_samples),
    'CustServ Calls': np.random.poisson(1, n_samples)
}

# Add categorical features
states = ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'MI', 'NC', 'NJ']
area_codes = [408, 415, 510, 650, 707, 925]

data['State'] = np.random.choice(states, n_samples)
data['Area Code'] = np.random.choice(area_codes, n_samples)
data['Int\'l Plan'] = np.random.choice(['yes', 'no'], n_samples, p=[0.1, 0.9])
data['VMail Plan'] = np.random.choice(['yes', 'no'], n_samples, p=[0.3, 0.7])

# Create target with some logic: a 10% base churn rate, +30 points for more
# than 3 customer-service calls and +20 points for more than 250 day minutes
# (so the probability never exceeds 0.6).
churn_prob = 0.1 + 0.3 * (data['CustServ Calls'] > 3) + 0.2 * (data['Day Mins'] > 250)
churn_flags = np.random.binomial(1, churn_prob, n_samples)
# Use the same 'True.' / 'False.' labels as the sample rows shown earlier
data['Churn?'] = ['True.' if flag else 'False.' for flag in churn_flags]

df = pd.DataFrame(data)
print(f"Created synthetic dataset with {len(df)} samples")
print(f"Churn distribution: {df['Churn?'].value_counts()}")
print(f"Churn rate: {(df['Churn?'] == 'True.').mean():.2%}")

## Data Preprocessing

Same preprocessing steps as the SageMaker example:
1. Handle categorical variables
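2. Remove correlated features (a no-op in this notebook: the synthetic data is generated without the `* Charge` columns that duplicate the `* Mins` columns)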
3. Create target variable

In [None]:
# Treat Area Code as a categorical label rather than a number so that it gets
# one-hot encoded below. astype() returns a new frame, so `df` itself is left
# untouched and this cell can be re-run safely.
df_categorical = df.astype({"Area Code": object})

# One-hot encode categorical features.
# dtype=int pins the indicator columns (including the target) to 0/1 integers;
# without it the dtype depends on the installed pandas version (uint8 before
# 2.0, bool afterwards), which changes the label type seen by XGBoost and the
# scikit-learn metrics further down.
model_data = pd.get_dummies(df_categorical, dtype=int)
print(f"After one-hot encoding: {model_data.shape}")

# Move target column to the beginning (XGBoost convention)
target_col = "Churn?_True."
assert target_col in model_data.columns, f"expected column {target_col!r} after one-hot encoding"
model_data = pd.concat([
    model_data[target_col],
    # Both one-hot target columns are dropped from the features; keeping
    # either one would leak the label.
    model_data.drop(["Churn?_False.", "Churn?_True."], axis=1)
], axis=1)

print(f"Final dataset shape: {model_data.shape}")
print(f"Features: {model_data.columns.tolist()[1:6]}...")  # Show first 5 features
print(f"Target distribution: {model_data[target_col].value_counts()}")

In [None]:
# Stage 1: hold out 33% of the rows, stratified on the target so both parts
# keep the same churn ratio
train_data, holdout_data = train_test_split(
    model_data,
    test_size=0.33,
    random_state=42,
    stratify=model_data[target_col],
)

# Stage 2: divide the hold-out again, with 33% of it becoming the test set and
# the remainder the validation set (also stratified)
validation_data, test_data = train_test_split(
    holdout_data,
    test_size=0.33,
    random_state=42,
    stratify=holdout_data[target_col],
)

print(f"Training set: {train_data.shape}")
print(f"Validation set: {validation_data.shape}")
print(f"Test set: {test_data.shape}")

# Separate every split into a feature matrix (X_*) and a label vector (y_*)
X_train, y_train = train_data.drop(columns=target_col), train_data[target_col]
X_val, y_val = validation_data.drop(columns=target_col), validation_data[target_col]
X_test, y_test = test_data.drop(columns=target_col), test_data[target_col]

n_feature_columns = X_train.shape[1]
print(f"\nFeature columns: {n_feature_columns}")
print(f"Training target distribution: {y_train.value_counts()}")

## XGBoost Training

Now we'll train XGBoost using the native Python API, showing both the high-level sklearn-style interface and the lower-level XGBoost API.

In [None]:
# Method 1: XGBoost with sklearn-style API (easiest)
print("=== Training with XGBoost sklearn-style API ===")

model_sklearn = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.2,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc'
)

# Fit while monitoring AUC on the training and validation sets.
# XGBClassifier.fit() has no `eval_names` argument (that is a LightGBM
# parameter) and passing it raises a TypeError. XGBoost labels the eval_set
# entries by position instead: `validation_0` is the training set and
# `validation_1` is the validation set.
model_sklearn.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True
)

# Predictions: hard class labels, plus the probability of the positive (churn) class
y_pred_sklearn = model_sklearn.predict(X_test)
y_proba_sklearn = model_sklearn.predict_proba(X_test)[:, 1]

print(f"\nSklearn-style API Results:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_sklearn):.4f}")
print(f"Test AUC: {roc_auc_score(y_test, y_proba_sklearn):.4f}")

In [None]:
# Method 2: Native XGBoost API (more control)
print("\n=== Training with Native XGBoost API ===")

# Create DMatrix objects (XGBoost's own container for features + labels)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set parameters (same as sklearn version above)
params = {
    'max_depth': 5,
    'eta': 0.2,  # learning_rate
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 42
}

# Training with validation monitoring
evallist = [(dtrain, 'train'), (dval, 'validation')]
num_rounds = 100  # matches n_estimators=100 in the sklearn-style model

# `evals` is keyword-only in current XGBoost releases; passing it positionally
# triggers a deprecation warning there, so everything after `dtrain` is named.
model_native = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=evallist,
    verbose_eval=10  # Print every 10 rounds
)

# Predictions: with the binary:logistic objective, predict() returns churn
# probabilities, which are thresholded at 0.5 to obtain class labels
y_proba_native = model_native.predict(dtest)
y_pred_native = (y_proba_native > 0.5).astype(int)

print(f"\nNative API Results:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_native):.4f}")
print(f"Test AUC: {roc_auc_score(y_test, y_proba_native):.4f}")

## Model Evaluation and Analysis

In [None]:
# In-depth look at the sklearn-style model on the held-out test set
print("=== Detailed Model Evaluation ===")

# Text summary of the per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred_sklearn))

# Confusion matrix drawn as an annotated heatmap on an explicit Axes
cm = confusion_matrix(y_test, y_pred_sklearn)
class_labels = ['No Churn', 'Churn']
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax_cm,
)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

# Rank the features by the importance scores exposed by the fitted model
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_sklearn.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
fig_imp, ax_imp = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, y='feature', x='importance', ax=ax_imp)
ax_imp.set_title('Top 15 Feature Importances')
ax_imp.set_xlabel('Importance')
fig_imp.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

## Save Model Locally

We'll save the model in XGBoost's native format, which is what SageMaker uses internally.

In [None]:
# Create models directory (no error if it already exists, so the cell is re-runnable)
os.makedirs('models', exist_ok=True)

# Save sklearn-style model
model_sklearn.save_model('models/xgboost_sklearn_model.json')
print("Saved sklearn-style model to: models/xgboost_sklearn_model.json")

# Save native model (this is the format SageMaker uses)
model_native.save_model('models/xgboost_native_model.json')
print("Saved native model to: models/xgboost_native_model.json")

# Save feature names (in training column order) for later use
with open('models/feature_names.json', 'w') as f:
    json.dump(X_train.columns.tolist(), f)
print("Saved feature names to: models/feature_names.json")

# Read the recorded hyperparameters back from the fitted estimator instead of
# repeating the literals, so the metadata cannot drift from the actual model.
fitted_params = model_sklearn.get_params()
reported_params = ('max_depth', 'learning_rate', 'n_estimators', 'subsample', 'colsample_bytree')

# Save model metadata. The accuracy/AUC entries are the sklearn-style model's
# scores on the held-out test split.
metadata = {
    'model_type': 'XGBoost',
    'n_features': X_train.shape[1],
    'n_classes': 2,
    'target_column': target_col,
    'test_accuracy': float(accuracy_score(y_test, y_pred_sklearn)),
    'test_auc': float(roc_auc_score(y_test, y_proba_sklearn)),
    'training_samples': len(X_train),
    'parameters': {name: fitted_params[name] for name in reported_params}
}

with open('models/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("Saved model metadata to: models/model_metadata.json")

print("\n=== Model files saved successfully! ===")

## Load and Test Saved Model

Demonstrate loading the saved model and making predictions, simulating what would happen in a production environment.

In [None]:
# Load the saved model into a fresh Booster
loaded_model = xgb.Booster()
loaded_model.load_model('models/xgboost_native_model.json')

# Load feature names
with open('models/feature_names.json', 'r') as f:
    feature_names = json.load(f)

# Load metadata
with open('models/model_metadata.json', 'r') as f:
    metadata = json.load(f)

print("=== Testing Loaded Model ===")
print(f"Model type: {metadata['model_type']}")
print(f"Features: {metadata['n_features']}")
# The metadata stores scores measured on the held-out test split, so they are
# labelled as test metrics (they were previously printed as "Training ...").
print(f"Test accuracy (from metadata): {metadata['test_accuracy']:.4f}")
print(f"Test AUC (from metadata): {metadata['test_auc']:.4f}")

# Select the columns by the saved feature names so the input layout is taken
# from the persisted artifact, as a production caller would do. A missing
# column raises KeyError instead of silently producing misaligned features.
X_test_aligned = X_test[feature_names]

# Test prediction with loaded model
dtest_loaded = xgb.DMatrix(X_test_aligned)
y_proba_loaded = loaded_model.predict(dtest_loaded)
y_pred_loaded = (y_proba_loaded > 0.5).astype(int)

# Verify predictions match
print(f"\nLoaded model test accuracy: {accuracy_score(y_test, y_pred_loaded):.4f}")
print(f"Loaded model test AUC: {roc_auc_score(y_test, y_proba_loaded):.4f}")
print(f"Predictions match original model: {np.array_equal(y_pred_native, y_pred_loaded)}")

# Sample prediction for a single customer (kept as a one-row DataFrame)
sample_customer = X_test_aligned.iloc[0:1]
sample_prediction = loaded_model.predict(xgb.DMatrix(sample_customer))[0]
print(f"\nSample customer churn probability: {sample_prediction:.4f}")
# Explicit positional slicing (.iloc) for the first five feature values
print(f"Sample customer features (first 5): {dict(sample_customer.iloc[0].iloc[:5])}")

## Summary

This notebook demonstrates:

1. **Data preprocessing** identical to SageMaker examples
2. **Local XGBoost training** using both sklearn-style and native APIs
3. **Model evaluation** with metrics and visualizations
4. **Model persistence** in XGBoost native format

### Key Differences from SageMaker:

- **No cloud infrastructure** - runs entirely on local machine
- **Direct model access** - no need for endpoints or containers
- **Immediate results** - no waiting for training jobs or deployments
- **Full control** - access to all XGBoost parameters and features

### Next Steps:

- Compare results with SageMaker training
- Experiment with different hyperparameters locally before SageMaker training
- Use this for rapid prototyping and development
- Deploy to SageMaker when ready for production scaling