In [1]:
import os
# Set the PyCaret logging-level variable to CRITICAL. This runs before any
# pycaret import in the notebook (presumably the library reads it at import
# time — confirm against the PyCaret docs).
os.environ.update({"PYCARET_CUSTOM_LOGGING_LEVEL": "CRITICAL"})

In [2]:
# import libraries
import subprocess

import numpy as np
import pandas as pd
# Imported after the cell that sets PYCARET_CUSTOM_LOGGING_LEVEL.
from pycaret.classification import compare_models, load_model, save_model, setup

In [None]:
# Load the customer-churn CSV from GitHub into a DataFrame.
CHURN_CSV_URL = 'https://raw.githubusercontent.com/srees1988/predict-churn-py/main/customer_churn_data.csv'
data = pd.read_csv(CHURN_CSV_URL)

In [3]:
# check the data types
# Lists the dtype pandas inferred for every column. TotalCharges comes back as
# `object` rather than a numeric dtype, which the following cell corrects.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
# replace blanks with np.nan
# TotalCharges loads as `object` because some rows hold only whitespace where
# a number should be. Turn every empty or whitespace-only entry (not just a
# single ' ') into NaN so the cast to float64 cannot fail on a stray '' or
# '  '. Re-running the cell on the already-converted column is a no-op.
data['TotalCharges'] = (
    data['TotalCharges']
    .replace(r'^\s*$', np.nan, regex=True)
    .astype('float64')
)

In [5]:
# Re-check the dtypes after the conversion: TotalCharges should now be float64.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
# Count the missing entries in each column; the blanks converted to NaN in
# TotalCharges show up here.
data.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# initialize setup
# Configure the PyCaret experiment on the churn data:
#   - 'Churn' is the label to predict;
#   - customerID is excluded from the features;
#   - session_id pins the random seed (it reappears as random_state in the
#     fitted estimators), so reruns are reproducible;
#   - log_experiment / experiment_name record the run under 'churn1'
#     (presumably via MLflow, whose UI is opened at the end — confirm).
s = setup(
    data,
    target='Churn',
    session_id=123,
    ignore_features=['customerID'],
    log_experiment=True,
    experiment_name='churn1',
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Churn
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(7043, 21)"
5,Missing Values,True
6,Numeric Features,3
7,Categorical Features,16
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
# Train the library's candidate classifiers and display a leaderboard of their
# metrics; the estimator that compare_models returns is kept as best_model.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.8067,0.8479,0.5254,0.6617,0.5849,0.4612,0.4669,0.065
ridge,Ridge Classifier,0.8063,0.0,0.5097,0.6657,0.5767,0.4541,0.4613,0.014
lr,Logistic Regression,0.8045,0.8446,0.5332,0.6499,0.5853,0.4591,0.4633,0.823
gbc,Gradient Boosting Classifier,0.8041,0.8472,0.5105,0.659,0.5744,0.4499,0.4567,0.175
lda,Linear Discriminant Analysis,0.8008,0.8375,0.548,0.6349,0.5876,0.4574,0.4599,0.01
lightgbm,Light Gradient Boosting Machine,0.8004,0.8372,0.5238,0.6426,0.5767,0.4479,0.4522,0.328
catboost,CatBoost Classifier,0.8,0.8431,0.5051,0.6479,0.5672,0.4398,0.4458,1.692
rf,Random Forest Classifier,0.7905,0.8177,0.4738,0.6287,0.54,0.4078,0.4149,0.139
xgboost,Extreme Gradient Boosting,0.7901,0.8231,0.5105,0.615,0.5572,0.4214,0.4249,0.374
et,Extra Trees Classifier,0.7744,0.7866,0.4597,0.5841,0.5138,0.3698,0.3747,0.148


In [10]:
# Show the selected estimator and its hyperparameters as plain text.
print(best_model)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)


In [12]:
# Persist the preprocessing pipeline together with the chosen model under the
# name 'my_first_pipeline'; the returned value is shown as the cell output.
save_model(model=best_model, model_name='my_first_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['customerID'],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Churn',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 nu...
                 ('dummy', Dummify(target='Churn')),
                 ('fix_perfect', Remove_100(target='Churn')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi',

In [14]:
# Switch scikit-learn's estimator display mode to 'diagram' so the pipeline
# loaded in the next cell is shown as a diagram rather than a text repr.
# NOTE(review): pycaret.classification also appears to define a set_config;
# keeping this import directly above the call makes the name unambiguously
# scikit-learn's — confirm before moving or removing it.
from sklearn import set_config
set_config(display = 'diagram')

In [15]:
# Reload the pipeline saved earlier; as the cell's last expression it is
# displayed using the display mode configured in the previous cell.
load_model(model_name='my_first_pipeline')

Transformation Pipeline and Model Successfully Loaded


In [16]:
# Start the MLflow tracking UI as a background process. The shell form
# `!mlflow ui` runs the server in the foreground, so the cell never finishes
# (and Restart & Run All hangs) until the kernel is interrupted.
# Running this cell twice starts a second server, so run it once per session.
mlflow_ui = subprocess.Popen(['mlflow', 'ui'])
# When done browsing the logged runs, stop the server with mlflow_ui.terminate().

^C
