In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score,accuracy_score,make_scorer,precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

In [28]:
df=pd.read_csv("combined.csv",encoding="ISO-8859-1").copy()

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505206 entries, 0 to 505206
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         505206 non-null  float64
 1   Age                505206 non-null  float64
 2   Gender             505206 non-null  object 
 3   Tenure             505206 non-null  float64
 4   Usage Frequency    505206 non-null  float64
 5   Support Calls      505206 non-null  float64
 6   Payment Delay      505206 non-null  float64
 7   Subscription Type  505206 non-null  object 
 8   Contract Length    505206 non-null  object 
 9   Total Spend        505206 non-null  float64
 10  Last Interaction   505206 non-null  float64
 11  Churn              505206 non-null  float64
dtypes: float64(9), object(3)
memory usage: 50.1+ MB


In [29]:
df.shape

(505207, 12)

In [30]:
df[df.isna().any(axis=1)]

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
263669,,,,,,,,,,,,


In [31]:
df.dropna(inplace=True)

In [32]:
df.shape

(505206, 12)

In [33]:
df[df.isna().any(axis=1)]

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn


In [35]:
X=df.drop(columns='Churn')
df['Churn']=(df['Churn']==1)
y=df['Churn']

In [36]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2,stratify=y)

In [37]:
categorcal=df.select_dtypes(include='object').columns.tolist()
numerical=df.select_dtypes(include=['int64','float64']).columns.tolist()

In [38]:

preprocessor=ColumnTransformer(
    transformers=[
        ('one_hot_encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False),categorcal),
        ('QuantileTransformer',QuantileTransformer(output_distribution='normal'),numerical)
    ]
)


In [None]:
lr=LogisticRegression(random_state=42)
lr_pipe=Pipeline(
    steps=[
        ("preprocessor",preprocessor),
        ("model",lr)
    ]
)

param_dist={
    'model__max_iter':[100,200,300,400,500,600,700,800,900,1000],
    'model__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}
lr_search=RandomizedSearchCV(
    estimator=lr_pipe,
    param_distributions=param_dist,
    cv=3,
    n_iter=20,
    n_jobs=-1,
    scoring='f1'
)
lr_search.fit(X_train,y_train)
y_pred=lr_search.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.86      0.84      0.85     44943
        True       0.87      0.89      0.88     56099

    accuracy                           0.87    101042
   macro avg       0.87      0.86      0.86    101042
weighted avg       0.87      0.87      0.87    101042



In [None]:
y_proba=lr_pipe.predict_proba(X_test)[:,1]
roc_auc=roc_auc_score(y_test,y_proba)
con_matrix=confusion_matrix(y_test,y_pred)
print("roc-auc score",roc_auc)
print("confusion matrix",con_matrix)

NameError: name 'lr_pipe' is not defined

In [40]:
lr_pipe.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('one_hot_encoder',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  sparse_output=False),
                                    ['Gender', 'Subscription Type',
                                     'Contract Length']),
                                   ('QuantileTransformer',
                                    QuantileTransformer(output_distribution='normal'),
                                    ['CustomerID', 'Age', 'Tenure',
                                     'Usage Frequency', 'Support Calls',
                                     'Payment Delay', 'Total Spend',
                                     'Last Interaction'])])),
  ('model', LogisticRegression(random_state=42))],
 'transform_input': None,
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('one_hot_encoder',
                                  OneHotEnco

In [None]:
lr_search.best_params_

{'model__solver': 'newton-cg', 'model__max_iter': 300}

In [None]:
lr_search.best_estimator_

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('one_hot_encoder', ...), ('QuantileTransformer', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_quantiles,1000
,output_distribution,'normal'
,ignore_implicit_zeros,False
,subsample,10000
,random_state,
,copy,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'newton-cg'
,max_iter,300


In [None]:
top_models=pd.DataFrame(lr_search.cv_results_).sort_values(by='rank_test_score')
display(top_models.head(10))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__solver,param_model__max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,3.662228,0.90628,1.047493,0.072341,newton-cg,300,"{'model__solver': 'newton-cg', 'model__max_ite...",0.880747,0.882245,0.881363,0.881452,0.000615,1
19,3.203338,0.058071,0.689732,0.041027,liblinear,200,"{'model__solver': 'liblinear', 'model__max_ite...",0.880284,0.882495,0.881562,0.881447,0.000906,2
10,125.112008,3.245356,0.647087,0.018069,saga,600,"{'model__solver': 'saga', 'model__max_iter': 600}",0.880436,0.882903,0.880562,0.8813,0.001134,3
18,2.503727,0.020099,0.750291,0.114729,newton-cg,200,"{'model__solver': 'newton-cg', 'model__max_ite...",0.88134,0.88227,0.880263,0.881291,0.00082,4
11,126.667519,4.394937,0.841917,0.205367,saga,700,"{'model__solver': 'saga', 'model__max_iter': 700}",0.880476,0.880723,0.882195,0.881131,0.000759,5
8,3.353006,0.233891,0.759051,0.133611,newton-cg,700,"{'model__solver': 'newton-cg', 'model__max_ite...",0.88004,0.882074,0.881268,0.881127,0.000836,6
16,1.937119,0.239139,0.795158,0.063809,lbfgs,800,"{'model__solver': 'lbfgs', 'model__max_iter': ...",0.879717,0.882161,0.881447,0.881108,0.001026,7
9,2.03197,0.201667,0.766795,0.166557,lbfgs,700,"{'model__solver': 'lbfgs', 'model__max_iter': ...",0.880603,0.881995,0.880479,0.881026,0.000687,8
13,111.728713,4.106237,0.667193,0.037598,saga,300,"{'model__solver': 'saga', 'model__max_iter': 300}",0.880241,0.882756,0.879947,0.880981,0.001261,9
3,4.680652,0.434423,1.112591,0.261104,liblinear,300,"{'model__solver': 'liblinear', 'model__max_ite...",0.880141,0.882389,0.880374,0.880968,0.00101,10


In [None]:
best_lr=lr_search.best_estimator_.named_steps['model']
encoder=lr_search.best_estimator_.named_steps['encoder']
features=encoder.get_features_names_out()
coef=best_lr.coef_[0]

In [None]:
rf_model=RandomForestClassifier()
rf_pipeline=Pipeline(
    ("preprocessor",preprocessor),
    ("model",rf_model)
)
rf_search=RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    cv=3,
    n_iter=20,
    n_jobs=-1,
    scoring='f1'
)
rf_search.fit(X_train,y_train)
y_pred=rf_search.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
y_proba=rf_grid.predict_proba(X_test)[:,1]
roc_auc=roc_auc_score(y_test,y_proba)
con_matrix=confusion_matrix(y_test,y_pred)
print("roc-auc score",roc_auc)
print("confusion matrix",con_matrix)

In [None]:
rf_search.best_estimator_

In [None]:
rf_search.best_params_

In [None]:
best_rf=rf_search.best_estimator_.named_steps['model']
encoder=rf_search.best_estimator_.named_steps['encoder']
feature_names=encoder.get_feature_names_out()
feature_importance=best_rf.feature_importances_

In [None]:
xg_model=XGBClassifier()
xg_pipe=Pipeline(
    ("preprocessor",preprocessor),
    ("model",xg_model)
)
xg_search=RandomizedSearchCV(
    estimator=xg_pipe,
    param_distributions=param_dist,
    cv=3,
    n_iter=20,
    n_jobs=-1,
    scoring='f1'
)
xg_search.fit(X_train,y_train)
y_pred=xg_search.predict(X_test)
print(classification_report((y_test,y_pred)))

In [None]:
y_proba=xg_search.predict_proba(X_test)[:,1]
roc_auc=roc_auc_score(y_test,y_proba)
con_matrix=confusion_matrix(y_test,y_pred)
print("roc-auc score",roc_auc)
print("confusion matrix",con_matrix)

In [None]:
best_xg=xg_search.best_estimator_.named_steps['model']
features=best_xg.feature_importances_

In [None]:
import joblib
import json
from datetime import datetime

joblib.dump(xg_search.best_estimator_,'customer churn prediction/model_and_dev/model_v1.pkl')
joblib.dump(preprocessor,'customer churn prediction/model_and_dev/preprocessor.pkl')
metadata={
    "model_version": "v1",
    "trained_on": "customer_churn_dataset",
    "metric": "ROC-AUC",
    "score": 0.75,
    "trained_at": datetime.now().isoformat()
}


with open("customer churn prediction/models and dev/metadata.json","w") as f:
    
    json.dump(metadata,f)

