In [1]:
import datetime as dt

import joblib
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

from custom_transformers import Winsorizer, TopNCategories, DistributionPreservingImputer

In [3]:
# Location of the churn dataset CSV.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact file location exists.
churn_csv_path = "C:\\Users\\pgp09\\OneDrive\\Documents\\Desktop\\Data Science\\CampusX_ML Topics\\Datasets\\real_datasets\\churn_predictor.csv"
df = pd.read_csv(churn_csv_path)

# Preview the first two rows as the cell output.
df.head(2)

Unnamed: 0,monthly_spend_inr,transactions_last_90d,avg_session_duration_min,support_tickets_last_year,city,preferred_payment_method,referral_source,tenure_bucket,satisfaction_level,risk_segment,signup_date,last_activity_date,churn_status
0,,9,44.5,0,Delhi,UPI,Organic,3y+,High,Medium,2020-06-28,2025-07-08,active
1,521.14,10,27.05,0,Bengaluru,Credit Card,Twitter/X,,Medium,Medium,2024-04-03,2025-07-14,active


In [5]:
# --- Date-derived features ---------------------------------------------------
# Parse the two date columns; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
dt_cols = ['signup_date', 'last_activity_date']
df[dt_cols] = df[dt_cols].apply(pd.to_datetime, errors='coerce')

# Reference timestamp used for both "vintage" features (whole days elapsed).
# NOTE(review): this is the wall-clock time of the run, so the derived features
# shift every time the notebook is executed — consider pinning a snapshot date.
df['today_dt'] = pd.Timestamp.today()
df['account_vintage'] = (df['today_dt'] - df['signup_date']).dt.days
df['activity_vintage'] = (df['today_dt'] - df['last_activity_date']).dt.days

# --- Column groups consumed by the preprocessing pipelines ---------------------
num_cols = ['monthly_spend_inr', 'transactions_last_90d', 'avg_session_duration_min', 'support_tickets_last_year']
num_cols_winsorizer = ['monthly_spend_inr', 'transactions_last_90d', 'avg_session_duration_min']
num_cols_nonwinsorizer = ['support_tickets_last_year']
nom_cols = ['city', 'preferred_payment_method', 'referral_source']
ord_cols = ['tenure_bucket', 'satisfaction_level', 'risk_segment']
derived_cols = ['account_vintage', 'activity_vintage']

# One ordered category list per entry of `ord_cols` (same positions). The order
# inside each list defines the ordinal code; 'missing' is the placeholder that
# the constant imputer writes for absent values, so it is listed first.
categories = [
    ['missing', '1-3m', '3-6m', '6-12m', '1-2y', '2-3y', '3y+'],
    ['missing', 'Very Low', 'Low', 'Medium', 'High', 'Very High'],
    ['missing', 'Low', 'Medium', 'High']
]

# --- Target / feature split ----------------------------------------------------
# Binary target: 1 for 'churned', 0 for everything else (including missing).
y = df['churn_status'].eq('churned').astype('int64')

# Build X as a new frame instead of deleting the column from `df` in place:
# the cell can then be re-run without a KeyError, and `df` keeps the raw label.
X = df.drop(columns=['churn_status'])

In [7]:
# Intermediate preprocessing pipelines, one per column group.
# Winsorizer, DistributionPreservingImputer and TopNCategories are project-local
# transformers from `custom_transformers`; their exact behavior is not visible here.

# Numeric columns that are winsorized before imputation and power transform.
pipe_num_winsorizer = Pipeline(steps=[
    ('step1', Winsorizer(columns=num_cols_winsorizer)),
    ('step2', DistributionPreservingImputer()),
    ('step3', PowerTransformer(standardize=True)),
])

# Numeric columns that skip winsorizing: impute, then power transform.
pipe_num_nonwinsorizer = Pipeline(steps=[
    ('step1', DistributionPreservingImputer()),
    ('step2', PowerTransformer(standardize=True)),
])

# Nominal columns: category reduction, constant 'missing' fill, one-hot encoding.
pipe_nom_cols = Pipeline(steps=[
    ('step1', TopNCategories()),
    ('step2', SimpleImputer(strategy='constant', fill_value='missing')),
    ('step3', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')),
])

# Ordinal columns: constant 'missing' fill, then encode using the explicit
# category order given in `categories`.
pipe_ord_cols = Pipeline(steps=[
    ('step1', SimpleImputer(strategy='constant', fill_value='missing')),
    ('step2', OrdinalEncoder(categories=categories)),
])

# Date-derived vintage columns: same steps as the non-winsorized numerics.
pipe_derived_cols = Pipeline(steps=[
    ('step1', DistributionPreservingImputer()),
    ('step2', PowerTransformer(standardize=True)),
])


In [9]:
# Column transformer: route every column group to its preprocessing pipeline.
# Columns that are not listed in any group are discarded (remainder='drop').
preprocessing_routes = [
    ('trf1', pipe_num_winsorizer, num_cols_winsorizer),
    ('trf2', pipe_num_nonwinsorizer, num_cols_nonwinsorizer),
    ('trf3', pipe_nom_cols, nom_cols),
    ('trf4', pipe_ord_cols, ord_cols),
    ('trf5', pipe_derived_cols, derived_cols),
]
col_trf = ColumnTransformer(transformers=preprocessing_routes, remainder='drop')

# Full modelling pipeline: preprocessing followed by a classifier. The
# DecisionTreeClassifier is only a placeholder that the grid search replaces
# through the 'clf' parameter.
pipe = Pipeline(steps=[
    ('step1', col_trf),
    ('clf', DecisionTreeClassifier()),
])

In [11]:
# Parameter grid for the search: one sub-grid per candidate classifier. The
# 'clf' entry swaps the pipeline's final step; the 'clf__*' entries tune that
# estimator. Total candidates: 4 + 4 + 6 + 4 = 18.
#
# Every estimator is seeded (random_state=42, the same seed used for the
# train/test split) so that repeated runs of the search give the same scores.
param_grid = [
    {
        'clf': [DecisionTreeClassifier(random_state=42)],
        'clf__max_depth' : [5, 10],
        'clf__min_impurity_decrease' : [0.02, 0.04]
    },
    {
        'clf': [RandomForestClassifier(random_state=42)],
        'clf__n_estimators' : [100, 80],
        'clf__bootstrap' : [True, False]
    },
    {
        'clf': [GradientBoostingClassifier(random_state=42)],
        'clf__learning_rate' : [0.1, 0.15, 0.2],
        'clf__subsample' : [0.8, 0.75]
    },
    {
        'clf': [XGBClassifier(random_state=42)],
        'clf__tree_method' : ['hist', 'approx'],
        'clf__colsample_bytree' : [0.9, 0.8]
    }
    ]

In [13]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [15]:
# Display the assembled pipeline (preprocessing + placeholder classifier) as the cell output.
pipe

In [17]:
# Grid search over every classifier / hyper-parameter combination in
# `param_grid`, scored by accuracy with 5-fold stratified cross-validation.
#
# The fold splitter is seeded: with shuffle=True and no random_state the folds
# differ on every run, which makes best_score_ / best_params_ irreproducible.
gscv = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs = -1,           # use all available cores
    verbose = 1,
    error_score='raise'    # surface a failing fit immediately instead of scoring it as NaN
)

# Run the search on the training split only; the test split stays untouched.
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [19]:
# Parameter combination (including the selected classifier under 'clf') that
# achieved the best mean cross-validation score.
gscv.best_params_

{'clf': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...),
 'clf__colsample_bytree': 0.9,
 'clf__tree_method': 'hist'}

In [23]:
# Mean cross-validated score of the best candidate (accuracy, per the
# `scoring` argument passed to GridSearchCV).
gscv.best_score_

0.9730000000000001

In [27]:
# Best pipeline found by the search (GridSearchCV refits it on the data passed
# to fit() by default). Both names refer to the same object.
# NOTE(review): X_test / y_test are not used in the visible cells — consider
# scoring this pipeline on the held-out split before persisting it.
best_pipe = gscv.best_estimator_
pipe2 = best_pipe

In [29]:
# Persist the selected pipeline (preprocessing + classifier) to the current
# working directory. `joblib` is imported in the notebook's import cell rather
# than here, so all dependencies are declared in one place.
joblib.dump(pipe2, 'model_pipeline2.pkl')

['model_pipeline2.pkl']