# Customer Churn Prediction - Part 4: Predict New Data

## Overview
This notebook demonstrates how to use the trained model to predict churn for new customer data.

## Step 1: Import Libraries

In [ ]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## Step 2: Load Trained Model and Scaler

In [ ]:
# Restore the persisted artifacts saved by the training notebooks: the
# Random Forest classifier and the fitted scaler.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files
# that come from a trusted source.
model = joblib.load('models/random_forest.pkl')
scaler = joblib.load('models/scaler.pkl')

# The column order of the processed training matrix defines the feature
# layout that new data has to be aligned to before prediction.
X_train = pd.read_csv('data/X_train.csv')
feature_names = list(X_train.columns)

print("Model and scaler loaded successfully!")
print("\nModel type: Random Forest")
print(f"Number of features: {len(feature_names)}")

## Step 3: Preprocessing Function

In [ ]:
def _categorize_tenure(tenure):
    """Return the tenure bucket label for a tenure expressed in months."""
    if tenure <= 12:
        return '0-12'
    if tenure <= 24:
        return '13-24'
    if tenure <= 48:
        return '25-48'
    # Anything above 48 (and any value that fails every comparison, such as
    # NaN) lands in the last bucket.
    return '49+'


def preprocess_new_data(new_customer_data, expected_features=None):
    """
    Preprocess new customer data to match training data format

    Parameters:
    -----------
    new_customer_data : dict or pd.DataFrame
        New customer data with original column names. A dict is treated as
        a single customer; a DataFrame is copied, so the caller's object is
        never modified.
    expected_features : sequence of str, optional
        Column names (in order) that the returned frame must have. When
        omitted, the module-level ``feature_names`` list loaded from the
        training data is used.

    Returns:
    --------
    processed_data : pd.DataFrame
        Preprocessed data ready for prediction: exactly the expected
        columns, in the expected order. Expected columns that could not be
        derived from the input are filled with 0.
    """
    # Convert to DataFrame if dict
    if isinstance(new_customer_data, dict):
        df = pd.DataFrame([new_customer_data])
    else:
        df = new_customer_data.copy()

    # Handle missing TotalCharges: blanks / non-numeric text become 0.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which does not update ``df`` under pandas
    # Copy-on-Write.
    if 'TotalCharges' in df.columns:
        df['TotalCharges'] = pd.to_numeric(
            df['TotalCharges'], errors='coerce'
        ).fillna(0)

    # Standardize categorical values
    columns_to_fix = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                      'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
    for col in columns_to_fix:
        if col in df.columns:
            df[col] = df[col].replace(['No internet service', 'No phone service'], 'No')

    # Feature engineering
    if 'tenure' in df.columns and 'TotalCharges' in df.columns:
        tenure = df['tenure']
        has_tenure = tenure > 0
        # Divide only where tenure is positive; every other row (zero,
        # negative or missing tenure) gets 0.
        df['AvgChargePerMonth'] = (
            df['TotalCharges'] / tenure.where(has_tenure)
        ).where(has_tenure, 0)

    if 'tenure' in df.columns:
        df['TenureGroup'] = df['tenure'].apply(_categorize_tenure)

    service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']
    if all(col in df.columns for col in service_cols):
        # Must run before the Yes/No columns are encoded to 1/0 below.
        df['ServiceCount'] = (df[service_cols] == 'Yes').sum(axis=1)

    # Encode categorical variables
    yes_no = {'Yes': 1, 'No': 0}
    binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies',
                   'MultipleLines']
    for col in binary_cols:
        if col in df.columns:
            df[col] = df[col].map(yes_no)

    if 'gender' in df.columns:
        df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

    # One-hot encoding (integer dummies, so they share a numeric dtype with
    # the 0-filled columns added below)
    multi_category_cols = ['InternetService', 'Contract', 'PaymentMethod', 'TenureGroup']
    present_categoricals = [col for col in multi_category_cols if col in df.columns]
    if present_categoricals:
        df = pd.get_dummies(df, columns=present_categoricals, dtype=int)

    # Align with the training layout in one step: add absent features as 0,
    # drop columns the model never saw, and enforce the training order.
    if expected_features is None:
        target_columns = feature_names
    else:
        target_columns = list(expected_features)

    return df.reindex(columns=target_columns, fill_value=0)

print("Preprocessing function created!")

## Step 4: Prediction Function

In [ ]:
def predict_churn(new_customer_data, model, scaler, return_probability=True):
    """
    Run the churn model on raw customer records.

    The input is first passed through ``preprocess_new_data`` and the
    resulting frame is handed to the model.

    Parameters:
    -----------
    new_customer_data : dict or pd.DataFrame
        Raw customer record(s), forwarded unchanged to
        ``preprocess_new_data``.
    model : trained model
        Estimator exposing ``predict`` and, when probabilities are
        requested, ``predict_proba``.
    scaler : StandardScaler
        Accepted as part of the interface but not applied to the data
        inside this function.
    return_probability : bool
        When True (default) the churn probability is returned alongside
        the prediction.

    Returns:
    --------
    prediction : array
        Output of ``model.predict`` (0 = No, 1 = Yes).
    probability : array (only when ``return_probability`` is True)
        Second column of ``model.predict_proba``, i.e. the churn score.
    """
    features = preprocess_new_data(new_customer_data)

    # NOTE(review): no scaling is performed here. That is only correct if
    # the model was fitted on unscaled features - confirm against the
    # training notebook.
    prediction = model.predict(features)

    if not return_probability:
        return prediction

    churn_scores = model.predict_proba(features)
    probability = churn_scores[:, 1]
    return prediction, probability

print("Prediction function created!")

## Step 5: Example Predictions

In [ ]:
# Example 1: profile expected to be high-risk
# (month-to-month contract, short tenure, high monthly charges)
example_customer_1 = dict(
    gender='Male',
    SeniorCitizen=0,
    Partner='No',
    Dependents='No',
    tenure=2,
    PhoneService='Yes',
    MultipleLines='No',
    InternetService='Fiber optic',
    OnlineSecurity='No',
    OnlineBackup='No',
    DeviceProtection='No',
    TechSupport='No',
    StreamingTV='Yes',
    StreamingMovies='Yes',
    Contract='Month-to-month',
    PaperlessBilling='Yes',
    PaymentMethod='Electronic check',
    MonthlyCharges=100.0,
    TotalCharges=200.0,
)

prediction_1, probability_1 = predict_churn(example_customer_1, model, scaler)

# Derive the human-readable labels once, then report.
churn_label_1 = 'CHURN' if prediction_1[0] == 1 else 'NO CHURN'
risk_level_1 = 'HIGH' if probability_1[0] > 0.5 else 'LOW'

print("Example Customer 1 (High Risk):")
print(f"  Contract: {example_customer_1['Contract']}")
print(f"  Tenure: {example_customer_1['tenure']} months")
print(f"  Monthly Charges: ${example_customer_1['MonthlyCharges']:.2f}")
print(f"  Payment Method: {example_customer_1['PaymentMethod']}")
print(f"\n  Prediction: {churn_label_1}")
print(f"  Churn Probability: {probability_1[0]:.2%}")
print(f"  Risk Level: {risk_level_1}")

In [ ]:
# Example 2: profile expected to be low-risk
# (two-year contract, long tenure)
example_customer_2 = dict(
    gender='Female',
    SeniorCitizen=0,
    Partner='Yes',
    Dependents='Yes',
    tenure=60,
    PhoneService='Yes',
    MultipleLines='Yes',
    InternetService='DSL',
    OnlineSecurity='Yes',
    OnlineBackup='Yes',
    DeviceProtection='Yes',
    TechSupport='Yes',
    StreamingTV='Yes',
    StreamingMovies='Yes',
    Contract='Two year',
    PaperlessBilling='No',
    PaymentMethod='Bank transfer (automatic)',
    MonthlyCharges=80.0,
    TotalCharges=4800.0,
)

prediction_2, probability_2 = predict_churn(example_customer_2, model, scaler)

# Derive the human-readable labels once, then report.
churn_label_2 = 'CHURN' if prediction_2[0] == 1 else 'NO CHURN'
risk_level_2 = 'HIGH' if probability_2[0] > 0.5 else 'LOW'

print("\nExample Customer 2 (Low Risk):")
print(f"  Contract: {example_customer_2['Contract']}")
print(f"  Tenure: {example_customer_2['tenure']} months")
print(f"  Monthly Charges: ${example_customer_2['MonthlyCharges']:.2f}")
print(f"  Payment Method: {example_customer_2['PaymentMethod']}")
print(f"\n  Prediction: {churn_label_2}")
print(f"  Churn Probability: {probability_2[0]:.2%}")
print(f"  Risk Level: {risk_level_2}")

## Step 6: Batch Prediction

In [ ]:
# Score several customers in one call by stacking their records into a frame.
new_customers = pd.DataFrame([example_customer_1, example_customer_2])

predictions, probabilities = predict_churn(new_customers, model, scaler)

# Build the report: a few identifying columns plus the formatted model output.
summary_columns = ['tenure', 'Contract', 'MonthlyCharges', 'PaymentMethod']
results_df = new_customers[summary_columns].assign(
    Churn_Prediction=['CHURN' if label == 1 else 'NO CHURN' for label in predictions],
    Churn_Probability=[f"{score:.2%}" for score in probabilities],
    Risk_Level=['HIGH' if score > 0.5 else 'LOW' for score in probabilities],
)

print("\nBatch Prediction Results:")
print("=" * 80)
print(results_df.to_string(index=False))

## Summary

### Key Points:
1. Model successfully loaded and ready for predictions
2. Preprocessing function handles new data automatically
3. Predictions include both binary output and probability scores
4. Can handle single or batch predictions

### Usage:
- Use `predict_churn()` function with new customer data
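- Returns an array of predictions (0 = no churn, 1 = churn) and, when `return_probability=True` (the default), an array of churn probabilities (0-1)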
- High probability (>0.5) indicates high churn risk