In [1]:
# Standard
import pandas as pd
import numpy as np
import os
# Pycaret
from pycaret.classification import *
# Plots
from plotly.offline import iplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
# Sklearn tools
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")

In [2]:
# GIT_PYTHON_REFRESH has to be in the environment before `import git` runs;
# "quiet" is meant to silence GitPython's import-time complaint when no git
# executable is found (see GitPython docs).
# NOTE(review): nothing visible in this notebook uses `git` afterwards, and
# `os` is already imported in the first cell — confirm this cell is needed.
import os
os.environ["GIT_PYTHON_REFRESH"] = "quiet"
import git

In [3]:
# Datapath and Setup
# Remember the directory the kernel started in, so that re-running this cell
# always lands in the same parent directory. A bare os.chdir("..") climbs one
# more level on every execution and silently breaks the relative CSV path.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
PATH = os.getcwd() + os.sep  # working directory with a trailing separator
RANDOM_SEED = 142            # seed passed to data_sampling below
K_FOLDS = 5

In [4]:
# Helper functions for structured data
# Get info about the dataset
def dataset_info(dataset, dataset_name: str):
    """Print a short structural summary of a DataFrame.

    The report contains, in order: a header line with the name, row count and
    column count; the dtype of every column; the per-column count of missing
    values (only columns that have any); and the memory footprint in MB
    (bytes / 1e6, rounded to 3 decimals).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to describe.
    dataset_name : str
        Label shown in the header line.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    separator = 30 * "="
    n_rows, n_cols = dataset.shape

    # Adjacent f-strings instead of a backslash continuation inside one
    # literal: the continuation embedded the source indentation in the output.
    print(f"Dataset Name: {dataset_name} "
          f"| Number of Samples: {n_rows} "
          f"| Number of Columns: {n_cols}")
    print(separator)
    print("Column             Data Type")
    print(dataset.dtypes)
    print(separator)

    missing_data = dataset.isnull().sum()
    if missing_data.sum() > 0:
        print(missing_data[missing_data > 0])
    else:
        print("No Missing Data on this Dataset!")
    print(separator)

    # 1e6 bytes per MB (the original spelled this as 10e5, the same value).
    memory_mb = dataset.memory_usage(index=True).sum() / 1e6
    print("Memory Usage: {} MB".format(np.round(memory_mb, 3)))

In [5]:
# Dataset Sampling
def data_sampling(dataset, frac: float, random_seed: int):
    """Split a frame into a random sample and its complement.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to split; it is not modified.
    frac : float
        Fraction of rows drawn into the first part.
    random_seed : int
        Seed forwarded to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sampled, remainder)``: the sampled rows in draw order, and every
        row whose index label was not drawn. Both come back with a fresh
        0..n-1 index.
    """
    picked = dataset.sample(frac=frac, random_state=random_seed)
    # Rows are removed by index label, so the complement is label-based.
    remainder = dataset.drop(index=picked.index).reset_index(drop=True)
    return picked.reset_index(drop=True), remainder

In [6]:
def bar_plot(data, plot_title: str, x_axis: str, y_axis: str):
    """Draw a bar chart of a Series, with the first bar highlighted.

    Parameters
    ----------
    data : pandas.Series
        Bar heights come from ``data.values``, bar positions from
        ``data.index``; each value is also printed above its bar.
    plot_title : str
        Centered figure title.
    x_axis, y_axis : str
        Axis titles.

    Returns
    -------
    None
        The figure is displayed with plotly's offline ``iplot``.
    """
    # First bar orange, the rest blue. The assignment is guarded so that an
    # empty Series yields an empty chart instead of an IndexError.
    colors = ["#0080ff"] * len(data)
    if colors:
        colors[0] = "#ff8000"
    trace = go.Bar(y=data.values, x=data.index, text=data.values,
                   marker_color=colors or None)
    layout = go.Layout(autosize=False, height=600,
                       title={"text" : plot_title,
                              "y" : 0.9,
                              "x" : 0.5,
                              "xanchor" : "center",
                              "yanchor" : "top"},
                       xaxis={"title" : x_axis},
                       yaxis={"title" : y_axis},)
    fig = go.Figure(data=trace, layout=layout)
    fig.update_layout(template="simple_white")
    fig.update_traces(textposition="outside",
                      textfont_size=14,
                      marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)

In [7]:
# Plot Pie Chart
def pie_plot(data, plot_title: str):
    """Draw a pie chart of a Series.

    Slice labels come from ``data.index`` and slice sizes from
    ``data.values``. The title is centered and each slice gets a black
    2px outline. The figure is displayed with plotly's offline ``iplot``;
    nothing is returned.
    """
    centered_title = dict(text=plot_title, y=0.9, x=0.5,
                          xanchor="center", yanchor="top")
    slice_outline = dict(line=dict(color="#000000", width=2))

    fig = go.Figure(
        data=go.Pie(labels=data.index, values=data.values),
        layout=go.Layout(autosize=False, title=centered_title),
    )
    fig.update_traces(textfont_size=14, marker=slice_outline)
    fig.update_yaxes(automargin=True)
    iplot(fig)

In [8]:
# Histogram
def histogram_plot(data, plot_title: str, y_axis: str):
    """Draw a histogram of ``data`` with a centered title.

    ``data`` is passed as the histogram's x values; ``y_axis`` is the y-axis
    title. Bars get a black 2px outline and the "simple_white" template is
    applied. The figure is displayed with plotly's offline ``iplot``;
    nothing is returned.
    """
    centered_title = dict(text=plot_title, y=0.9, x=0.5,
                          xanchor="center", yanchor="top")
    bar_outline = dict(line=dict(color="#000000", width=2))

    fig = go.Figure(
        data=go.Histogram(x=data),
        layout=go.Layout(autosize=False,
                         title=centered_title,
                         yaxis=dict(title=y_axis)),
    )
    fig.update_traces(marker=bar_outline)
    fig.update_layout(template="simple_white")
    fig.update_yaxes(automargin=True)
    iplot(fig)

In [9]:
# Particular case: Histogram subplot (1, 2)
def histogram_subplot(dataset_a, dataset_b, feature_a: str,
                      feature_b: str, title: str, title_a: str, title_b: str):
    """Draw two histograms side by side in a 1x2 subplot grid.

    The left panel shows ``dataset_a[feature_a]`` under ``title_a``, the right
    panel shows ``dataset_b[feature_b]`` under ``title_b``. ``title`` is the
    centered figure title. The figure is displayed with plotly's offline
    ``iplot``; nothing is returned.
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=(title_a, title_b))

    # One histogram per panel, left to right.
    panels = ((dataset_a, feature_a), (dataset_b, feature_b))
    for column_no, (frame, feature) in enumerate(panels, start=1):
        histogram = go.Histogram(x=frame[feature], showlegend=False)
        fig.add_trace(histogram, row=1, col=column_no)

    centered_title = dict(text=title, y=0.9, x=0.5,
                          xanchor="center", yanchor="top")
    fig.update_layout(template="simple_white")
    fig.update_layout(autosize=False,
                      title=centered_title,
                      yaxis=dict(title="<i>Frequency</i>"))
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)

In [10]:
# Calculate scores with Test/Unseen labeled data
def test_score_report(data_unseen, predict_unseen):
    """Score predictions on the labelled hold-out split.

    Parameters
    ----------
    data_unseen : pandas.DataFrame
        Hold-out frame with the ground-truth ``Churn`` column.
        Side effect: an integer-encoded ``Label`` column is added to this
        frame in place; ``conf_mat`` reads that column.
    predict_unseen : pandas.DataFrame
        Frame holding the predictions. The ``prediction_label`` column is
        used when present (the name used elsewhere in this notebook),
        otherwise ``Label``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Accuracy, AUC, Recall, Precision, F1 Score.

    Notes
    -----
    AUC is computed from hard class labels, not predicted probabilities.
    """
    le = LabelEncoder()
    data_unseen["Label"] = le.fit_transform(data_unseen.Churn.values).astype(int)
    y_true = data_unseen["Label"]

    # Prefer the explicit prediction column; fall back to the legacy name.
    if "prediction_label" in predict_unseen.columns:
        y_pred = predict_unseen["prediction_label"]
    else:
        y_pred = predict_unseen["Label"]
    # Predictions given as the original class names are mapped through the
    # encoder fitted on the true labels, so both sides share one encoding.
    if not pd.api.types.is_numeric_dtype(y_pred):
        y_pred = le.transform(y_pred.to_numpy())

    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    df_unseen = pd.DataFrame({
        "Accuracy" : [accuracy],
        "AUC" : [roc_auc],
        "Recall" : [recall],
        "Precision" : [precision],
        "F1 Score" : [f1]
    })
    return df_unseen

In [11]:
# Confusion Matrix
def conf_mat(data_unseen, predict_unseen):
    """Plot the confusion matrix of the hold-out predictions as a heatmap.

    Parameters
    ----------
    data_unseen : pandas.DataFrame
        Must contain the true classes in a ``Label`` column.
    predict_unseen : pandas.DataFrame
        Must contain the predicted classes in a ``Label`` column.

    Returns
    -------
    None
        The heatmap is drawn on the current matplotlib axes.
    """
    unique_label = data_unseen["Label"].unique()
    tick_names = ['{:}'.format(x) for x in unique_label]
    cmtx = pd.DataFrame(
        confusion_matrix(data_unseen["Label"],
                         predict_unseen["Label"],
                         labels=unique_label),
        index=tick_names,
        columns=tick_names
    )
    ax = sns.heatmap(cmtx, annot=True, fmt="d", cmap="YlGnBu")
    # confusion_matrix(y_true, y_pred) puts true classes on the rows and
    # predicted classes on the columns; the heatmap draws rows on the y-axis.
    ax.set_ylabel('Target')
    ax.set_xlabel('Predicted')
    ax.set_title("Predict Unseen Confusion Matrix", size=14)

In [12]:
# Load the raw customer table; the path is relative to the working directory.
dataset = pd.read_csv(
    filepath_or_buffer='./20 Churn Prediction/Telco-Customer-Churn.csv'
)

In [13]:
# Peek at the first three rows to sanity-check the load.
dataset.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [14]:
# Check for duplicated samples: lists every row that repeats an earlier row
# across all columns. An empty frame (header only) means there are none.
dataset[dataset.duplicated()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [15]:
# Print shape, dtypes, missing-value counts and memory use of the raw table.
dataset_info(dataset, "customers")

Dataset Name: customers         | Number of Samples: 7043         | Number of Columns: 21
Column             Data Type
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
No Missing Data on this Dataset!
Memory Usage: 1.183 MB


In [16]:
# Convert total charges to numerical. TotalCharges is loaded as object dtype;
# errors="coerce" turns entries that cannot be parsed into NaN instead of
# raising, which is why nulls are filled in a later cell.
total_charges_numeric = pd.to_numeric(dataset["TotalCharges"], errors="coerce")
dataset["TotalCharges"] = total_charges_numeric
print(f"The Feature TotalCharges is type {dataset.TotalCharges.dtype} now!")

The Feature TotalCharges is type float64 now!


In [17]:
# 90% of the rows go to modelling (`data`); the rest is kept as unseen data.
data, data_unseen = data_sampling(
    dataset,
    frac=0.9,
    random_seed=RANDOM_SEED,
)

In [18]:
# Fill null values in the TotalCharges column with the training-split mean.
# The result is assigned back to the frame: calling
# replace(..., inplace=True) on a column selection acts on an intermediate
# object and does not update `data` under pandas Copy-on-Write.
total_charges_mean = data["TotalCharges"].mean()
data["TotalCharges"] = data["TotalCharges"].fillna(total_charges_mean)

In [19]:
# Fill the unseen split with the mean computed on the training split, so no
# statistic of the hold-out data is used. Assigned back rather than using a
# chained replace(..., inplace=True), which may not update the frame.
data_unseen["TotalCharges"] = data_unseen["TotalCharges"].fillna(total_charges_mean)

In [20]:
# Initialise the PyCaret experiment. customerID is excluded from the features.
# session_id pins PyCaret's random state so the split, CV folds and model
# results are reproducible across runs.
# NOTE(review): K_FOLDS is defined above but not passed here; the logged
# outputs show PyCaret's default of 10 folds.
exp01 = setup(
    data=data,
    target="Churn",
    ignore_features=["customerID"],
    session_id=RANDOM_SEED,
)

In [21]:
# Compare candidate models and keep the five best, ranked by F1.
top5 = compare_models(sort="F1", n_select=5)

Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [37]:
# pull() returns the most recently displayed score grid. Its only parameter
# is `pop`, so passing the model list was read as pop=True and removed the
# grid from PyCaret's display container.
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Bagging Classifier,0.8324,0.8706,0.8324,0.824,0.826,0.5065,0.5111


In [22]:
# Tune each of the selected models with tune_model's default settings,
# keeping the original order.
tuned_top5 = list(map(tune_model, top5))

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [38]:
# Score grid of the last tune_model call. pull() takes no model argument:
# a non-empty list was read as pop=True and removed the grid.
pull()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8131,0.0,0.8131,0.8039,0.8035,0.4841,0.4934
1,0.8018,0.0,0.8018,0.7934,0.7957,0.4664,0.4699
2,0.7815,0.0,0.7815,0.7661,0.7663,0.3787,0.3911
3,0.8018,0.0,0.8018,0.797,0.799,0.481,0.4819
4,0.8153,0.0,0.8153,0.8067,0.8075,0.4943,0.5008
5,0.8086,0.0,0.8086,0.7988,0.7994,0.4713,0.4791
6,0.795,0.0,0.795,0.7872,0.7898,0.4529,0.4553
7,0.7991,0.0,0.7991,0.7956,0.7971,0.479,0.4794
8,0.8104,0.0,0.8104,0.8026,0.8045,0.4903,0.494
9,0.8014,0.0,0.8014,0.7936,0.7959,0.469,0.4718


In [23]:
# Wrap every tuned model in an ensemble (ensemble_model defaults).
bagged_top5 = []
for tuned_estimator in tuned_top5:
    bagged_top5.append(ensemble_model(tuned_estimator))

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [39]:
# blend models
# NOTE(review): this blends the untuned `top5` from compare_models, not
# `tuned_top5` or `bagged_top5` — confirm that is intended.
blender = blend_models(estimator_list = top5)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [40]:
# Display the blended estimator's repr.
blender

In [25]:
# stack models
# NOTE(review): like the blender, the stack is built from the untuned `top5`.
stacker = stack_models(estimator_list = top5)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [26]:
# automl
# Selects a single model using AUC as the criterion and prints it.
# NOTE(review): compare_models above ranks by F1 while this step optimises
# AUC — confirm which metric is meant to drive model selection.
best = automl(optimize = 'AUC')
print(best)

BaggingClassifier(base_estimator='deprecated', bootstrap=True,
                  bootstrap_features=False,
                  estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.2,
                                                       loss='log_loss',
                                                       max_depth=1,
                                                       max_features='log2',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.005,
                                                       min_samples_leaf=2,
                                                       min_samples_split=7,
                                                       min_weight_fraction_

In [28]:
# pull() shows the most recently displayed score grid; it is not looked up
# per model. Its only parameter is `pop`, so passing `best` was read as a
# truthy pop flag and removed the grid from the display container.
pull()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8153,0.8443,0.8153,0.8067,0.8069,0.4947,0.5022
1,0.7928,0.8497,0.7928,0.7844,0.7871,0.4453,0.448
2,0.795,0.8435,0.795,0.7832,0.7844,0.4306,0.439
3,0.7995,0.8298,0.7995,0.7943,0.7964,0.4737,0.4747
4,0.8176,0.8707,0.8176,0.809,0.8095,0.499,0.5061
5,0.8086,0.8411,0.8086,0.7991,0.8001,0.4743,0.481
6,0.7928,0.8334,0.7928,0.7852,0.7878,0.4484,0.4505
7,0.7856,0.8332,0.7856,0.7828,0.7841,0.4469,0.4471
8,0.8036,0.8411,0.8036,0.797,0.7992,0.4794,0.4814
9,0.8014,0.8462,0.8014,0.7936,0.7959,0.469,0.4718


In [29]:
# Finalise the selected model before scoring the unseen split
# (presumably a refit on all data given to setup — see PyCaret docs).
final_model = finalize_model(best)

In [31]:
# Score the 10% unseen split; the next cell reads the predictions from the
# `prediction_label` column of the returned frame.
predict_unseen = predict_model(final_model, data=data_unseen)

In [32]:
# Macro-averaged precision, recall and F1 on the unseen split (support is
# None when averaging). Argument order is (y_true, y_pred). Labels are passed
# as 1-D Series, not single-column frames, since the metric expects vectors.
precision_recall_fscore_support(
    predict_unseen["Churn"],
    predict_unseen["prediction_label"],
    average="macro",
)

(0.775890637945319, 0.7366737739872069, 0.7524790236460717, None)