# DATASET

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's legacy global RNG so a fresh "Restart & Run All" regenerates the same dataset
np.random.seed(42)

# Size of the synthetic customer base
n_customers = 1_000

# Sequential customer IDs, starting at 1001 and one per customer
customer_ids = range(1001, n_customers + 1001)

# Generate realistic customer data
def generate_customer_data(n=None, ids=None, churn_threshold=0.7, missing_rate=0.05):
    """Build a synthetic customer-churn dataset as a pandas DataFrame.

    Random numbers come from NumPy's legacy global RNG (``np.random``), so seed
    it with ``np.random.seed(...)`` before calling to get reproducible output.
    The order of the random draws below is deliberately fixed: reordering them
    would change the dataset produced for a given seed.

    Parameters
    ----------
    n : int, optional
        Number of customers (rows). Defaults to the notebook-level
        ``n_customers``.
    ids : sequence of int, optional
        Customer IDs, one per row. Defaults to the notebook-level
        ``customer_ids`` when ``n`` is also omitted, otherwise to
        ``range(1001, 1001 + n)``.
    churn_threshold : float, default 0.7
        A customer is labelled as churned when their min-max normalized churn
        score is strictly greater than this value.
    missing_rate : float, default 0.05
        Per-row probability that ``avg_order_value`` is replaced with NaN.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``customer_id``, ``months_as_customer``,
        ``total_purchases``, ``avg_order_value``, ``support_tickets``,
        ``last_interaction_days``, ``churn`` (0/1).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # Fall back to the notebook-level configuration only when the caller did not override it
    if n is None:
        n = n_customers
        if ids is None:
            ids = customer_ids
    elif ids is None:
        ids = range(1001, 1001 + n)

    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Independent base features (draw order matters for reproducibility)
    months = np.random.randint(1, 60, n)
    purchases = np.random.randint(1, 100, n)
    order_value = np.random.normal(150, 50, n).round(2)
    tickets = np.random.poisson(2, n)
    days_since_contact = np.random.randint(1, 365, n)

    # Create a correlation to make the data more realistic:
    # customers with a longer tenure tend to have more total purchases
    purchases = (purchases * (months / 30)).astype(int)

    # Raw churn score (not yet a probability) built from the features
    score = (
        -0.1 * np.log(months) +           # Longer-term customers are less likely to churn
        0.03 * days_since_contact +       # More days since the last interaction -> more likely to churn
        -0.02 * purchases +               # More purchases -> less likely to churn
        0.01 * (tickets ** 2) +           # More support tickets -> more likely to churn
        np.random.normal(0, 0.1, n)       # Add some randomness
    )

    # Min-max normalize the score to [0, 1]; a zero range (e.g. a single row)
    # would otherwise divide by zero and fill the column with NaN
    lowest = score.min()
    span = score.max() - lowest
    if span > 0:
        normalized = (score - lowest) / span
    else:
        normalized = np.zeros_like(score)

    # Clean-up: the normal draw can be negative, so take the absolute value
    # and keep two decimal places
    order_value = np.abs(order_value).round(2)

    df = pd.DataFrame({
        'customer_id': ids,
        'months_as_customer': months,
        'total_purchases': purchases,
        'avg_order_value': order_value,
        'support_tickets': tickets,
        'last_interaction_days': days_since_contact,
        # Binary churn indicator derived from the normalized score
        'churn': (normalized > churn_threshold).astype(int),
    })

    # Blank out a random subset of order values to mimic real-world missing data
    mask = np.random.random(n) < missing_rate
    df.loc[mask, 'avg_order_value'] = np.nan

    return df

# Build the synthetic customer table
df = generate_customer_data()

# Persist it (without the index column) so the later cells can reload it from disk
df.to_csv('customer_churn_data.csv', index=False)

# Headline numbers: row count and share of churned customers
print("\nDataset Summary:", "-" * 50, sep="\n")
print(f"Total number of customers: {len(df)}")
print(f"Churn rate: {(df['churn'].mean() * 100).round(2)}%")

# Per-column descriptive statistics, rounded to two decimals
print("\nFeature Statistics:", "-" * 50, sep="\n")
print(df.describe().round(2))

# Peek at the first rows
print("\nFirst few rows of the dataset:", "-" * 50, sep="\n")
print(df.head())



Dataset Summary:
--------------------------------------------------
Total number of customers: 1000
Churn rate: 26.4%

Feature Statistics:
--------------------------------------------------
       customer_id  months_as_customer  total_purchases  avg_order_value  \
count      1000.00             1000.00          1000.00           941.00   
mean       1500.50               30.58            50.89           149.85   
std         288.82               17.02            44.53            48.78   
min        1001.00                1.00             0.00             5.59   
25%        1250.75               16.00            14.00           116.53   
50%        1500.50               31.00            39.00           150.46   
75%        1750.25               45.00            78.00           182.10   
max        2000.00               59.00           182.00           298.25   

       support_tickets  last_interaction_days    churn  
count          1000.00                1000.00  1000.00  
mean      

# DATA ANALYSIS AND PREPROCESSING
Load and Explore Data:

Read the `customer_churn_data.csv` file and inspect it for missing values, outliers, and the distribution of each feature.

Handle Missing Values:

Decide on an imputation strategy for missing values in avg_order_value (e.g., mean/median imputation or model-based imputation).

Feature Engineering:

Scale numerical features or encode categorical ones.

In [10]:
from sklearn.preprocessing import StandardScaler

def load_and_explore_data(file_path):
    """Read a CSV file into a DataFrame and print a quick exploratory report.

    The report consists of the first rows, the per-column count of missing
    values, and the output of ``DataFrame.describe()``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.
    """
    frame = pd.read_csv(file_path)
    print("Dataset loaded successfully.\n")

    # Each report section is a (heading, content) pair printed as one line group
    report = (
        ("Initial dataset overview:\n", frame.head()),
        ("\nMissing values:\n", frame.isnull().sum()),
        ("\nStatistical Summary:\n", frame.describe()),
    )
    for heading, content in report:
        print(heading, content)

    return frame

def preprocess_data(df):
    """Impute, split and standardize the churn dataset.

    Missing ``avg_order_value`` entries are replaced with the column median,
    the ``customer_id`` and ``churn`` columns are removed from the feature
    set, and the remaining features are standardized with ``StandardScaler``.

    The input DataFrame is left untouched: imputation happens on a copy, so
    re-running the calling cell or inspecting ``df`` afterwards still shows the
    original missing values.

    NOTE(review): the median and the scaler statistics are computed on every
    row passed in. When the caller splits into train/test afterwards, the test
    rows have influenced preprocessing — confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``churn`` and ``avg_order_value``; all
        remaining columns are treated as features and passed to the scaler.

    Returns
    -------
    X_scaled : numpy.ndarray
        Standardized feature matrix (column order follows ``df`` minus
        ``customer_id`` and ``churn``).
    y : pandas.Series
        The ``churn`` column.
    """
    # Handle missing values on a copy so the caller's DataFrame is not mutated
    median_order_value = df['avg_order_value'].median()
    imputed = df.assign(avg_order_value=df['avg_order_value'].fillna(median_order_value))

    # Feature-target split
    X = imputed.drop(columns=['customer_id', 'churn'])
    y = imputed['churn']

    # Standardize numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y


## MODEL
Model Selection:

Choose a classification model like Random Forest, Logistic Regression, or Gradient Boosting based on interpretability and performance.

Evaluation Metrics:

Use metrics such as ROC-AUC, the confusion matrix, and the classification report to evaluate the model.

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

def build_model(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and report test-set metrics.

    Prints the classification report, the confusion matrix and the ROC-AUC
    (two decimals) for the test split, then returns the fitted model.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_test, y_test
        Held-out features and labels used only for evaluation.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier (``random_state=42``).
    """
    # fit() returns the estimator itself, so construction and training chain
    model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    # Hard labels feed the report/confusion matrix; the second predict_proba
    # column is used as the score for ROC-AUC
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)

    print("\nModel Performance:")
    print(classification_report(y_test, predicted_labels))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_labels))
    print(f"ROC-AUC: {auc:.2f}")

    return model


# BUSINESS INSIGHTS
Feature Importance:

Use feature importance metrics to identify key drivers of churn.

Insights and Recommendations:

Identify customers at high risk of churn.


In [12]:
def generate_insights(model, df, features):
    """Rank features by model importance and bundle them with recommendations.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    df
        Accepted for interface compatibility; not used by this function.
    features
        Feature names, in the same order as ``model.feature_importances_``.

    Returns
    -------
    dict
        ``"Key Factors"``: names of the (up to) five most important features,
        most important first. ``"Recommendations"``: a fixed list of three
        suggested actions.
    """
    # Pair each feature name with its importance and order from highest to lowest
    ranking = pd.DataFrame({
        'Feature': features,
        'Importance': model.feature_importances_,
    })
    feature_importance = ranking.sort_values(by='Importance', ascending=False)

    print("\nFeature Importance:\n", feature_importance)

    top_factors = feature_importance['Feature'].head(5).tolist()
    recommendations = [
        "Engage high-risk customers with personalized offers.",
        "Improve response time for support tickets.",
        "Target frequent customers for loyalty programs.",
    ]
    return {"Key Factors": top_factors, "Recommendations": recommendations}


# FLOW BEFORE CROSS VALIDATION

In [13]:
# Step 1: Load the CSV and print an exploratory overview
# (head, missing-value counts, describe)
file_path = 'customer_churn_data.csv'
df = load_and_explore_data(file_path)

# Step 2: Impute/scale the features, then hold out 20% of the rows for evaluation
# (fixed random_state so the split is reproducible).
# NOTE(review): preprocess_data fits its median and scaler on all rows before this
# split, so the held-out rows influence preprocessing — confirm that is acceptable.
X, y = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Fit the random forest and print its test-set metrics
model = build_model(X_train, X_test, y_train, y_test)

# Step 4: Generate business insights.
# Feature names are rebuilt from df by dropping the same two columns that
# preprocess_data drops, so they line up with the columns of X.
features = df.drop(columns=['customer_id', 'churn']).columns
insights = generate_insights(model, df, features)

print("\nBusiness Insights:\n", insights)

Dataset loaded successfully.

Initial dataset overview:
    customer_id  months_as_customer  total_purchases  avg_order_value  \
0         1001                  39               29           168.36   
1         1002                  52              136           123.71   
2         1003                  29               82           143.45   
3         1004                  15                7            48.24   
4         1005                  43               94           186.61   

   support_tickets  last_interaction_days  churn  
0                1                    108      0  
1                2                    296      0  
2                5                    324      1  
3                1                    302      1  
4                2                    334      1  

Missing values:
 customer_id               0
months_as_customer        0
total_purchases           0
avg_order_value          59
support_tickets           0
last_interaction_days     0
churn             

## K-FOLD CROSS VALIDATION

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

def perform_cross_validation(X, y, model, cv=5):
    """
    Score a model with k-fold cross-validation using ROC-AUC.

    The per-fold scores and their mean are printed before being returned.
    Folds are evaluated with ``n_jobs=-1``.

    Parameters:
    - X: Feature matrix
    - y: Target labels
    - model: Unfitted estimator handed to ``cross_val_score``
    - cv: Number of folds, or any value ``cross_val_score`` accepts for ``cv``
      (default: 5)

    Returns:
    - mean_score: Mean ROC-AUC over the folds
    - all_scores: The individual fold scores, as returned by ``cross_val_score``
    """
    # Collect the scoring options once, then hand them to cross_val_score
    scoring_options = dict(cv=cv, scoring='roc_auc', n_jobs=-1)
    all_scores = cross_val_score(model, X, y, **scoring_options)

    # Report per-fold scores followed by their average
    print(f"Cross-Validation Scores (ROC-AUC): {all_scores}")
    print(f"Mean ROC-AUC Score: {np.mean(all_scores):.4f}")

    return np.mean(all_scores), all_scores


## FLOW AFTER CROSS VALIDATION

In [15]:
# Reload the CSV from disk and preprocess it again, so this cell does not depend
# on the df/X/y left behind by the earlier flow cell
file_path = 'customer_churn_data.csv'
df = load_and_explore_data(file_path)
X, y = preprocess_data(df)

# Initialize a fresh, unfitted model; cross-validation fits it separately per fold
model = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation on the full dataset, scored by ROC-AUC.
# NOTE(review): X was scaled on all rows inside preprocess_data, i.e. before the
# folds are formed — confirm that is acceptable for this analysis.
print("\nPerforming K-Fold Cross-Validation...")
mean_score, all_scores = perform_cross_validation(X, y, model, cv=5)

Dataset loaded successfully.

Initial dataset overview:
    customer_id  months_as_customer  total_purchases  avg_order_value  \
0         1001                  39               29           168.36   
1         1002                  52              136           123.71   
2         1003                  29               82           143.45   
3         1004                  15                7            48.24   
4         1005                  43               94           186.61   

   support_tickets  last_interaction_days  churn  
0                1                    108      0  
1                2                    296      0  
2                5                    324      1  
3                1                    302      1  
4                2                    334      1  

Missing values:
 customer_id               0
months_as_customer        0
total_purchases           0
avg_order_value          59
support_tickets           0
last_interaction_days     0
churn             

In [None]:
# Set up Git (first time setup)
# NOTE(review): these shell commands overwrite the machine-wide (--global) Git identity
# every time the cell runs, and they hardcode a personal user name and e-mail address
# into the notebook. Consider doing this one-time setup outside the notebook, or
# clearing these values before sharing or committing the file.
!git config --global user.name "lolipie007"
!git config --global user.email "pratik.kartik2003@gmail.com"

