<a href="https://colab.research.google.com/github/nadeeshafdo/google_colab/blob/main/PycaretTut.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install pycaret
!pip install pycaret



In [2]:
#Import Data manipulation libraries
import pandas as pd
import numpy as np

# Raise pandas' display limits so wide/long frames are rendered in full
# rather than truncated: up to 100 rows, 100 columns, 100-character lines.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 100

# Read the customer-churn CSV into a DataFrame and preview its first rows.
csv_path = '/content/Telco-Customer-Churn2.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,No,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,No,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,No,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,No,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,No,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Check the dataset dimensions, shown as a (rows, columns) tuple
dataset.shape

(7043, 21)

In [8]:
# Percentage of missing values in each column: the mean of the boolean
# "is missing" mask is the missing fraction, scaled here to 0-100.
dataset.isna().mean() * 100

Unnamed: 0,0
customerID,0.0
gender,0.0
SeniorCitizen,0.0
Partner,0.0
Dependents,0.0
tenure,0.099389
PhoneService,0.042595
MultipleLines,0.042595
InternetService,0.0
OnlineSecurity,0.0


In [13]:
# Visualise churn across the contract types: total charges against tenure,
# coloured by churn outcome, with one facet per contract type.
import plotly.express as px

# The DataFrame is passed together with column names instead of bare Series.
# With bare Series Plotly Express names the data after the arguments
# themselves (hence the former 'x'/'y' label keys), which leaves the legend
# titled "color" and the facets titled "facet_col=...". Referring to columns
# by name yields "Churn" and "Contract=..." instead.
fig = px.scatter(
    dataset,
    x='tenure',
    y='TotalCharges',
    color='Churn',
    facet_col='Contract',
    template='presentation',
    opacity=0.5,
    title='Customer Churn by Tenure, Charges, and Contract Type',
    labels={'tenure': 'Customer Tenure', 'TotalCharges': 'Total Charges $'},
)
fig.show()

In [10]:
# Visualise churn across the contract types: monthly charges against tenure,
# coloured by churn outcome, with one facet per contract type.
import plotly.express as px

# The DataFrame is passed together with column names instead of bare Series.
# With bare Series Plotly Express names the data after the arguments
# themselves (hence the former 'x'/'y' label keys), which leaves the legend
# titled "color" and the facets titled "facet_col=...". Referring to columns
# by name yields "Churn" and "Contract=..." instead.
fig = px.scatter(
    dataset,
    x='tenure',
    y='MonthlyCharges',
    color='Churn',
    facet_col='Contract',
    template='presentation',
    opacity=0.5,
    title='Customer Churn by Tenure, Monthly Charges & Contract Type',
    labels={'tenure': 'Customer Tenure', 'MonthlyCharges': 'MonthlyCharges $'},
)
fig.show()


In [14]:
# Hold out 10% of the rows as "unseen" data for final predictions and keep
# the remaining 90% for modelling. The sample is seeded so the split is
# repeatable.
sampled_rows = dataset.sample(frac=0.9, random_state=786)

# The hold-out set is whatever was not sampled; it must be derived from the
# sample's original index labels, i.e. before either frame is re-indexed.
data_unseen = dataset.drop(sampled_rows.index).reset_index(drop=True)
data = sampled_rows.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')


Data for Modeling: (6339, 21)
Unseen Data For Predictions: (704, 21)


In [15]:
# NOTE: the wildcard import is kept on purpose - later cells call PyCaret
# helpers such as get_config(), compare_models() and models() unqualified.
from pycaret.classification import *

# Initialise the PyCaret classification experiment on the modelling split,
# with 'Churn' as the label. Options are grouped by purpose below.
telecom = setup(
    data,
    target='Churn',
    session_id=43,  # fixed session id so reruns are repeatable
    # --- train/test split and cross-validation ---
    train_size=0.8,
    data_split_stratify=True,
    fold_strategy='kfold',
    fold=5,
    # --- feature selection and encoding ---
    ignore_features=['customerID'],  # identifier column, excluded from the features
    ordinal_features={'Contract': ['Month-to-month', 'One year', 'Two year']},
    max_encoding_ohe=100,
    low_variance_threshold=0.1,
    # --- imputation and scaling ---
    numeric_imputation='mean',
    categorical_imputation='mode',
    normalize=True,
    normalize_method='minmax',
    transformation=False,
)


Unnamed: 0,Description,Value
0,Session id,43
1,Target,Churn
2,Target type,Binary
3,Target mapping,"No: 0, Yes: 1"
4,Original data shape,"(6339, 21)"
5,Transformed data shape,"(6339, 39)"
6,Transformed train set shape,"(5071, 39)"
7,Transformed test set shape,"(1268, 39)"
8,Ignore features,1
9,Ordinal features,1


In [16]:
# Called with no argument, get_config() lists the names of all experiment
# variables that can be retrieved individually via get_config('<name>')
get_config()


{'USI',
 'X',
 'X_test',
 'X_test_transformed',
 'X_train',
 'X_train_transformed',
 'X_transformed',
 '_available_plots',
 '_ml_usecase',
 'data',
 'dataset',
 'dataset_transformed',
 'exp_id',
 'exp_name_log',
 'fix_imbalance',
 'fold_generator',
 'fold_groups_param',
 'fold_shuffle_param',
 'gpu_n_jobs_param',
 'gpu_param',
 'html_param',
 'idx',
 'is_multiclass',
 'log_plots_param',
 'logging_param',
 'memory',
 'n_jobs_param',
 'pipeline',
 'seed',
 'target_param',
 'test',
 'test_transformed',
 'train',
 'train_transformed',
 'variable_and_property_keys',
 'variables',
 'y',
 'y_test',
 'y_test_transformed',
 'y_train',
 'y_train_transformed',
 'y_transformed'}

In [17]:
# View the training-set input features (X) as they were before the
# preprocessing transformations were applied
get_config('X_train')

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2790,Male,No,Yes,No,18.0,Yes,No,Fiber optic,No,No,Yes,Yes,No,No,Month-to-month,No,Electronic check,78.199997,1468.750000
6109,Male,No,No,No,1.0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Electronic check,20.650000,20.650000
2585,Male,No,Yes,No,40.0,Yes,No,Fiber optic,No,Yes,Yes,No,No,No,One year,No,Electronic check,80.800003,3132.750000
1181,Male,No,No,No,54.0,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Month-to-month,No,Credit card (automatic),89.400002,4869.500000
565,Female,No,No,No,28.0,Yes,No,DSL,No,Yes,No,Yes,No,No,Month-to-month,No,Bank transfer (automatic),54.650002,1517.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,Male,Yes,Yes,Yes,20.0,Yes,No,Fiber optic,Yes,No,Yes,Yes,No,Yes,One year,No,Credit card (automatic),94.300003,1818.300049
4586,Male,No,No,No,36.0,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,84.900002,3067.199951
4670,Female,No,Yes,No,72.0,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Electronic check,92.400002,6786.100098
4974,Female,Yes,Yes,Yes,23.0,Yes,No,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),88.449997,2130.550049


In [18]:
# View the training-set input features (X) after the preprocessing
# transformations configured in setup() have been applied
get_config('X_train_transformed')


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,MultipleLines_No,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,InternetService_DSL,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_Yes,DeviceProtection_No internet service,DeviceProtection_No,TechSupport_Yes,TechSupport_No internet service,TechSupport_No,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_0.0,Contract_1.0,Contract_2.0,PaperlessBilling,PaymentMethod_Electronic check,PaymentMethod_Credit card (automatic),PaymentMethod_Bank transfer (automatic),PaymentMethod_Mailed check,MonthlyCharges,TotalCharges
2790,1.0,0.0,1.0,0.0,0.250000,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.595914,0.167310
6109,1.0,0.0,0.0,0.0,0.013889,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.022422,0.000208
2585,1.0,0.0,1.0,0.0,0.555556,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.621824,0.359326
1181,1.0,0.0,0.0,0.0,0.750000,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.707524,0.559737
565,0.0,0.0,0.0,0.0,0.388889,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.361236,0.172935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,1.0,1.0,1.0,1.0,0.277778,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.756353,0.207646
4586,1.0,0.0,0.0,0.0,0.500000,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.662681,0.351762
4670,0.0,0.0,1.0,0.0,1.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.737419,0.780901
4974,0.0,1.0,1.0,1.0,0.319444,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.698057,0.243678


In [19]:
# Compare the candidate models, with the results table ranked by Accuracy;
# the object returned by compare_models() is kept as best_model
best_model = compare_models(sort = 'Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8063,0.8399,0.8063,0.7972,0.7983,0.4681,0.4744,1.068
lr,Logistic Regression,0.8006,0.8383,0.8006,0.7918,0.7941,0.4596,0.4633,2.386
ridge,Ridge Classifier,0.7977,0.8316,0.7977,0.7872,0.7893,0.4442,0.4498,0.328
ada,Ada Boost Classifier,0.7977,0.8373,0.7977,0.7886,0.7909,0.4513,0.4552,0.574
lda,Linear Discriminant Analysis,0.7949,0.8314,0.7949,0.7875,0.79,0.4518,0.4538,0.52
lightgbm,Light Gradient Boosting Machine,0.789,0.8234,0.789,0.7792,0.7818,0.4265,0.4305,0.968
svm,SVM - Linear Kernel,0.7874,0.8261,0.7874,0.7844,0.7813,0.4323,0.4404,0.352
rf,Random Forest Classifier,0.7809,0.8137,0.7809,0.7679,0.7703,0.3918,0.3986,0.88
xgboost,Extreme Gradient Boosting,0.7732,0.8074,0.7732,0.7633,0.7664,0.3875,0.3905,0.48
et,Extra Trees Classifier,0.7663,0.7819,0.7663,0.7539,0.7577,0.3625,0.3663,0.848


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [20]:
# List all available algorithms: model ID, display name, reference class,
# and Turbo flag
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True
