# Term Deposit Marketing Prediction Models

This notebook builds two predictive models for term deposit marketing:
1. **Pre-Call Model**: Predicts which customers to call before making any calls (excludes campaign-related features)
2. **Post-Call Model**: Predicts which customers to focus on after initial contact (includes all features)

We'll compare multiple models and select the top 3 for each scenario.

## 1. Setup and Data Preparation

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Set display options
# Passing None as the max column count lifts the limit on how many DataFrame columns are shown
pd.set_option('display.max_columns', None)
# Use seaborn's 'whitegrid' style for the plots created below
sns.set_style('whitegrid')

In [None]:
# Read the marketing dataset from a CSV file given by a relative path
file_path = "term-deposit-marketing-2020.csv"
df = pd.read_csv(file_path)

# Report the frame's dimensions and column names
print(f"Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print("Columns:", list(df.columns))

# Preview the first rows as the cell output
df.head()

In [None]:
# Data-quality check: number of fully duplicated rows and per-column null counts
duplicates = df.duplicated().sum()
missing = df.isna().sum()

print(f"Missing values per column:\n{missing}\n")
print(f"Number of duplicated rows: {duplicates}\n")

In [None]:
# Share of each target class, expressed as a percentage rounded to two decimals
class_share = df["y"].value_counts(normalize=True)
(class_share * 100).round(2)

### Identify Campaign-Related Features

For our first model, we need to exclude campaign-related features that would not be available before making calls.

In [None]:
# Campaign-related columns: these are left out of the Model 1 feature list
campaign_features = ['duration', 'day', 'month', 'campaign']

# Model 1 (pre-call): every column except the campaign columns and the target 'y'
non_precall_columns = set(campaign_features) | {'y'}
model1_features = [col for col in df.columns if col not in non_precall_columns]
print("Features for Model 1 (pre-call):\n", model1_features)

# Model 2 (post-call): every column except the target 'y'
model2_features = list(filter(lambda col: col != 'y', df.columns))
print("\nFeatures for Model 2 (post-call):\n", model2_features)

## 2. Model 1: Pre-Call Prediction

This model will help predict which customers to call before making any calls, using only features available before the campaign.

In [None]:
# Working copy for Model 1: the pre-call features followed by the target column
model1_columns = [*model1_features, 'y']
model1_data = df[model1_columns].copy()
model1_data.head()

In [None]:
# Prepare data for Model 1
# Convert target to binary. Series.map leaves NaN for any label that is missing
# from the mapping, so stop here instead of passing NaN targets to the split
# and the classifiers.
model1_data['y_binary'] = model1_data['y'].map({'yes': 1, 'no': 0})
if model1_data['y_binary'].isna().any():
    unexpected_labels = model1_data.loc[model1_data['y_binary'].isna(), 'y'].unique().tolist()
    raise ValueError(f"Unexpected target labels in 'y': {unexpected_labels}")

# Split features and target
X1 = model1_data.drop(['y', 'y_binary'], axis=1)
y1 = model1_data['y_binary']

# Identify categorical and numerical features.
# Every column is assigned to exactly one of the two groups: numeric dtypes go
# to the numerical group and everything else to the categorical group.
# ColumnTransformer drops columns that are not listed under any transformer
# (remainder='drop' is its default), so selecting only a few specific dtypes
# would silently discard any column of another dtype.
numerical_features = X1.select_dtypes(include='number').columns.tolist()
categorical_features = X1.select_dtypes(exclude='number').columns.tolist()

print(f"Categorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

# Create preprocessing pipeline
# Categorical columns: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill gaps with the median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split data into train and test sets (80/20, stratified on the target, fixed seed)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42, stratify=y1)

In [None]:
# Candidate classifiers for Model 1, keyed by display name.
# Every estimator except KNN is constructed with random_state=42.
models1 = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    # probability=True is needed because the evaluation calls predict_proba
    ('SVM', SVC(probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier()),
])

### Evaluate Models for Pre-Call Prediction

In [None]:
# Function to evaluate models with classification report and confusion matrix
def _plot_confusion_matrix(cm, model_name):
    """Draw a 2x2 confusion matrix as an annotated heatmap with No/Yes tick labels."""
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()


def _plot_roc_curve(y_test, y_pred_proba, roc_auc, model_name):
    """Draw the ROC curve together with the diagonal reference line."""
    fig, ax = plt.subplots(figsize=(8, 6))
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.3f})')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc='lower right')
    plt.show()


def _transformed_feature_names(fitted_preprocessor):
    """Rebuild output column names from a fitted ColumnTransformer with 'num'/'cat' entries."""
    feature_names = []
    for name, transformer, features in fitted_preprocessor.transformers_:
        # String entries such as 'drop' (e.g. a remainder entry) are placeholders,
        # not fitted transformers; a dropped group contributes no output columns.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if name == 'cat':
            # Get one-hot encoded feature names
            encoder = transformer.named_steps['onehot']
            feature_names.extend(encoder.get_feature_names_out(features))
        else:
            feature_names.extend(features)
    return feature_names


def _plot_feature_importance(importances, feature_names, model_name):
    """Horizontal bar chart of the 20 largest importances (fewer if there are fewer features)."""
    fig, ax = plt.subplots(figsize=(10, 8))
    indices = np.argsort(importances)[-20:]  # Top 20 features, ascending
    ax.barh(range(len(indices)), importances[indices])
    ax.set_yticks(range(len(indices)))
    ax.set_yticklabels([feature_names[i] for i in indices])
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Top 20 Feature Importance - {model_name}')
    fig.tight_layout()
    plt.show()


def evaluate_model(model_name, model, X_train, X_test, y_train, y_test, preprocessor):
    """Fit preprocessor + classifier on the training split and report test-split performance.

    Prints a classification report and a metric summary, and shows a confusion
    matrix heatmap, a ROC curve and, when the classifier exposes
    ``feature_importances_`` or ``coef_``, a feature-importance chart.

    Parameters
    ----------
    model_name : str
        Label used in printed output and plot titles.
    model : estimator
        Classifier; must support ``predict`` and ``predict_proba``.
    X_train, X_test : feature tables for the train and test splits.
    y_train, y_test : binary targets; the metrics below treat 1 as the positive class.
    preprocessor : ColumnTransformer
        Used as the first pipeline step. The feature-importance chart expects its
        categorical entry to be named 'cat' and to contain a step named 'onehot'.

    Returns
    -------
    dict
        Keys: 'model', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc',
        'tp', 'fp', 'tn', 'fn' and 'pipeline' (the fitted Pipeline).

    Notes
    -----
    ``model`` and ``preprocessor`` are placed in the pipeline as given (no copy
    is made here), so fitting the pipeline fits the passed-in objects.
    """
    # Create pipeline with preprocessing and model, then train it
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)

    # Make predictions: hard labels plus the positive-class probability
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Classification report
    print(f"\n{model_name} - Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix. labels=[0, 1] fixes the matrix to 2x2 in No/Yes order, so
    # the four-way unpack below and the heatmap tick labels always line up, even
    # if one class is absent from both y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    _plot_confusion_matrix(cm, model_name)

    # Calculate metrics; each ratio falls back to 0 when its denominator is 0
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Plot ROC curve
    _plot_roc_curve(y_test, y_pred_proba, roc_auc, model_name)

    print(f"\nObservations for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"True Positives: {tp} - Correctly predicted subscribers")
    print(f"False Positives: {fp} - Incorrectly predicted as subscribers")
    print(f"True Negatives: {tn} - Correctly predicted non-subscribers")
    print(f"False Negatives: {fn} - Missed potential subscribers")

    # Try to get feature importance if available; read the fitted steps back
    # from the pipeline rather than relying on the arguments being the same objects
    fitted_model = pipeline.named_steps['classifier']
    if hasattr(fitted_model, 'feature_importances_'):
        importances = np.asarray(fitted_model.feature_importances_)
    elif hasattr(fitted_model, 'coef_'):
        # First row of coefficients, by absolute value
        importances = np.abs(fitted_model.coef_[0])
    else:
        importances = None

    if importances is not None:
        feature_names = _transformed_feature_names(pipeline.named_steps['preprocessor'])
        # Only plot when names and importances line up one-to-one
        if len(importances) == len(feature_names):
            _plot_feature_importance(importances, feature_names, model_name)

    return {
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'tp': tp,
        'fp': fp,
        'tn': tn,
        'fn': fn,
        'pipeline': pipeline
    }

In [None]:
# Run every Model 1 candidate through evaluate_model and collect the result dicts.
# A failing candidate is reported and skipped so the remaining ones still run.
model1_results = []

for name, model in models1.items():
    print(f"\n\nEvaluating {name} for Model 1 (Pre-Call)...")
    try:
        result = evaluate_model(name, model, X1_train, X1_test, y1_train, y1_test, preprocessor)
    except Exception as e:
        print(f"Error evaluating {name}: {str(e)}")
    else:
        model1_results.append(result)

In [None]:
# Summary table for Model 1: one row per evaluated model, without the fitted
# pipeline objects, ordered from highest to lowest F1 score
model1_summary = (
    pd.DataFrame(model1_results)
    .drop(columns='pipeline')
    .sort_values('f1', ascending=False)
)
model1_summary

In [None]:
# Visualize model performance for Model 1
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
model1_summary_plot = model1_summary.sort_values('f1', ascending=False).head(5)  # Top 5 models

# One figure per metric. Each figure is created inside the loop; creating another
# one before the loop would only leave an extra, empty figure in the cell output.
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='model', y=metric, data=model1_summary_plot, ax=ax)
    ax.set_title(f'{metric.upper()} Comparison - Model 1 (Pre-Call)')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## 3. Model 2: Post-Call Prediction

This model will help predict which customers to focus on after initial contact, using all available features including campaign-related ones.

In [None]:
# Working copy for Model 2: all post-call features followed by the target column
model2_columns = [*model2_features, 'y']
model2_data = df[model2_columns].copy()
model2_data.head()

In [None]:
# Prepare data for Model 2
# Convert target to binary. Series.map leaves NaN for any label that is missing
# from the mapping, so stop here instead of passing NaN targets to the split
# and the classifiers.
model2_data['y_binary'] = model2_data['y'].map({'yes': 1, 'no': 0})
if model2_data['y_binary'].isna().any():
    unexpected_labels = model2_data.loc[model2_data['y_binary'].isna(), 'y'].unique().tolist()
    raise ValueError(f"Unexpected target labels in 'y': {unexpected_labels}")

# Split features and target
X2 = model2_data.drop(['y', 'y_binary'], axis=1)
y2 = model2_data['y_binary']

# Identify categorical and numerical features.
# Every column is assigned to exactly one of the two groups: numeric dtypes go
# to the numerical group and everything else to the categorical group.
# ColumnTransformer drops columns that are not listed under any transformer
# (remainder='drop' is its default), so selecting only a few specific dtypes
# would silently discard any column of another dtype.
numerical_features2 = X2.select_dtypes(include='number').columns.tolist()
categorical_features2 = X2.select_dtypes(exclude='number').columns.tolist()

print(f"Categorical features: {categorical_features2}")
print(f"Numerical features: {numerical_features2}")

# Create preprocessing pipeline
# Categorical columns: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_transformer2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill gaps with the median, then standardize.
numerical_transformer2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor2 = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer2, numerical_features2),
        ('cat', categorical_transformer2, categorical_features2)
    ])

# Split data into train and test sets (80/20, stratified on the target, fixed seed)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42, stratify=y2)

In [None]:
# Candidate classifiers for Model 2, keyed by display name.
# Every estimator except KNN is constructed with random_state=42.
models2 = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    # probability=True is needed because the evaluation calls predict_proba
    ('SVM', SVC(probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier()),
])

### Evaluate Models for Post-Call Prediction

In [None]:
# Run every Model 2 candidate through evaluate_model and collect the result dicts.
# A failing candidate is reported and skipped so the remaining ones still run.
model2_results = []

for name, model in models2.items():
    print(f"\n\nEvaluating {name} for Model 2 (Post-Call)...")
    try:
        result = evaluate_model(name, model, X2_train, X2_test, y2_train, y2_test, preprocessor2)
    except Exception as e:
        print(f"Error evaluating {name}: {str(e)}")
    else:
        model2_results.append(result)

In [None]:
# Summary table for Model 2: one row per evaluated model, without the fitted
# pipeline objects, ordered from highest to lowest F1 score
model2_summary = (
    pd.DataFrame(model2_results)
    .drop(columns='pipeline')
    .sort_values('f1', ascending=False)
)
model2_summary

In [None]:
# Visualize model performance for Model 2
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
model2_summary_plot = model2_summary.sort_values('f1', ascending=False).head(5)  # Top 5 models

# One figure per metric. Each figure is created inside the loop; creating another
# one before the loop would only leave an extra, empty figure in the cell output.
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='model', y=metric, data=model2_summary_plot, ax=ax)
    ax.set_title(f'{metric.upper()} Comparison - Model 2 (Post-Call)')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## 4. Compare Model 1 and Model 2 Performance

In [None]:
# Get the top 3 models for each scenario (the summaries are already sorted by F1)
# and tag every row with the scenario it belongs to. assign() returns a new
# frame, so re-running this cell always starts from the untouched summaries.
model1_top3 = (
    model1_summary.head(3)
    .reset_index(drop=True)
    .assign(model_type='Pre-Call (Model 1)')
)
model2_top3 = (
    model2_summary.head(3)
    .reset_index(drop=True)
    .assign(model_type='Post-Call (Model 2)')
)

# Combine results from both models. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both halves keep labels 0-2 and the combined
# index would contain duplicates.
top_models = pd.concat([model1_top3, model2_top3], ignore_index=True)
top_models

In [None]:
# Visualize performance comparison of top models
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# One figure per metric, bars grouped by scenario. Each figure is created inside
# the loop; creating another one before the loop would only leave an extra,
# empty figure in the cell output.
for metric in metrics:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='model', y=metric, hue='model_type', data=top_models, ax=ax)
    ax.set_title(f'{metric.upper()} Comparison - Top Models')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend(title='Model Type')
    fig.tight_layout()
    plt.show()

## 5. Final Model Selection and Observations

### Model 1 (Pre-Call) - Best Model Analysis

The best model for pre-call prediction is used to determine which customers to contact before making any calls. This model excludes campaign-related features like duration, day, month, and campaign.

In [None]:
# Get the best model for Model 1 (first row of the F1-sorted summary) and look up
# its fitted pipeline among the stored evaluation results
best_model1_name = model1_summary.iloc[0]['model']
best_model1_pipeline = next(result['pipeline'] for result in model1_results if result['model'] == best_model1_name)
print(f"Best Pre-Call Model: {best_model1_name}")
print(f"Accuracy: {model1_summary.iloc[0]['accuracy']:.4f}")
print(f"F1 Score: {model1_summary.iloc[0]['f1']:.4f}")
print(f"ROC AUC: {model1_summary.iloc[0]['roc_auc']:.4f}")

# Get feature importance if available
model = best_model1_pipeline.named_steps['classifier']
if hasattr(model, 'feature_importances_'):
    importances = np.asarray(model.feature_importances_)
elif hasattr(model, 'coef_'):
    # First row of coefficients, by absolute value
    importances = np.abs(model.coef_[0])
else:
    importances = None

if importances is not None:
    # Rebuild the transformed column names the same way evaluate_model does:
    # one-hot names for the 'cat' entry, original column names otherwise.
    feature_names = []
    for step_name, transformer, columns in best_model1_pipeline.named_steps['preprocessor'].transformers_:
        if isinstance(transformer, str):
            # Placeholder entries such as 'drop' are not fitted transformers
            continue
        if step_name == 'cat':
            feature_names.extend(transformer.named_steps['onehot'].get_feature_names_out(columns))
        else:
            feature_names.extend(columns)

    # Only print the ranking when names and importances line up one-to-one,
    # so the header is never shown without content beneath it
    if len(feature_names) == len(importances):
        top_features = (
            pd.Series(importances, index=feature_names)
            .sort_values(ascending=False)
            .head(10)
        )
        print("\nTop features for this model:")
        print(top_features.to_string())

### Model 2 (Post-Call) - Best Model Analysis

The best model for post-call prediction is used to determine which customers to focus on after initial contact. This model includes all features, including campaign-related ones.

In [None]:
# Get the best model for Model 2 (first row of the F1-sorted summary) and look up
# its fitted pipeline among the stored evaluation results
best_model2_name = model2_summary.iloc[0]['model']
best_model2_pipeline = next(result['pipeline'] for result in model2_results if result['model'] == best_model2_name)
print(f"Best Post-Call Model: {best_model2_name}")
print(f"Accuracy: {model2_summary.iloc[0]['accuracy']:.4f}")
print(f"F1 Score: {model2_summary.iloc[0]['f1']:.4f}")
print(f"ROC AUC: {model2_summary.iloc[0]['roc_auc']:.4f}")

# Get feature importance if available
model = best_model2_pipeline.named_steps['classifier']
if hasattr(model, 'feature_importances_'):
    importances = np.asarray(model.feature_importances_)
elif hasattr(model, 'coef_'):
    # First row of coefficients, by absolute value
    importances = np.abs(model.coef_[0])
else:
    importances = None

if importances is not None:
    # Rebuild the transformed column names the same way evaluate_model does:
    # one-hot names for the 'cat' entry, original column names otherwise.
    feature_names = []
    for step_name, transformer, columns in best_model2_pipeline.named_steps['preprocessor'].transformers_:
        if isinstance(transformer, str):
            # Placeholder entries such as 'drop' are not fitted transformers
            continue
        if step_name == 'cat':
            feature_names.extend(transformer.named_steps['onehot'].get_feature_names_out(columns))
        else:
            feature_names.extend(columns)

    # Only print the ranking when names and importances line up one-to-one,
    # so the header is never shown without content beneath it
    if len(feature_names) == len(importances):
        top_features = (
            pd.Series(importances, index=feature_names)
            .sort_values(ascending=False)
            .head(10)
        )
        print("\nTop features for this model:")
        print(top_features.to_string())

## 6. Compare Performance Improvement

Let's analyze how much the inclusion of campaign-related features improves prediction performance.

In [None]:
# Compare the best models from each scenario (first row of each F1-sorted summary)
best_model1_metrics = model1_summary.iloc[0]
best_model2_metrics = model2_summary.iloc[0]

# Calculate improvement percentages: relative change of Model 2 over Model 1
metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
improvements = {}

for metric in metrics_to_compare:
    model1_value = best_model1_metrics[metric]
    model2_value = best_model2_metrics[metric]
    if model1_value == 0:
        # A relative change from a zero baseline is undefined; record NaN rather
        # than dividing by zero (evaluate_model reports 0 for precision, recall
        # and F1 when their denominators are 0).
        improvements[metric] = np.nan
    else:
        improvements[metric] = ((model2_value - model1_value) / model1_value) * 100

# Create a DataFrame for visualization: one row per metric
improvement_df = pd.DataFrame({
    'Metric': metrics_to_compare,
    'Model 1 (Pre-Call)': [best_model1_metrics[m] for m in metrics_to_compare],
    'Model 2 (Post-Call)': [best_model2_metrics[m] for m in metrics_to_compare],
    'Improvement (%)': [improvements[m] for m in metrics_to_compare]
})

improvement_df

In [None]:
# Bar chart of the relative change per metric, with a dashed red line at 0%
# separating gains from losses
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Metric', y='Improvement (%)', data=improvement_df, ax=ax)
ax.set_title('Performance Improvement: Post-Call vs. Pre-Call Model')
ax.axhline(y=0, color='r', linestyle='--')
ax.set_ylabel('Improvement (%)')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 7. Final Observations and Recommendations

### Pre-Call Model (Model 1)
- This model helps identify which customers to contact before making any calls
- Key features influencing this model are demographic and financial indicators
- The model achieves an accuracy of [accuracy], which [meets/doesn't meet] the 75-80% accuracy target
- Recommendations for using this model: [to be filled after running]

### Post-Call Model (Model 2)
- This model helps identify which customers to focus on after initial contact
- Including campaign-related features [improves/doesn't improve] prediction accuracy significantly
- The most important feature is likely 'duration', which indicates customer interest
- Recommendations for using this model: [to be filled after running]

### Overall Strategy
- Use Model 1 for initial customer targeting to optimize call center resources
- Use Model 2 for follow-up prioritization after initial contact
- The combination of both models creates an efficient two-stage approach to maximize term deposit subscriptions
- [Additional insights based on model performance]