# TELCO CUSTOMER CHURN XGBoost + CatBoost / SHAP + LIME

**For EDA you can visit this notebook: https://www.kaggle.com/code/rolmez/telco-customer-churn-eda**

## Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib

import seaborn as sns
import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score,classification_report, recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier, plot_importance

import shap
import lime
import lime.lime_tabular

## Data Organization

In [5]:
from explore import wrangle

df = wrangle('C:/Users/solace.dark/Documents/Jupyter Notebook/Practical Projects/Customer Churn Prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [6]:
categorical_columns = df.select_dtypes(include=['object', 'category'])
for feature in categorical_columns:
    print(f"Column: {feature} -> {df[feature].unique()}")
    print(f"Unique Number: {df[feature].nunique()}")
    print(f"{df[feature].value_counts()} \n")

Column: gender -> ['Female' 'Male']
Unique Number: 2
gender
Male      3555
Female    3488
Name: count, dtype: int64 

Column: SeniorCitizen -> ['No' 'Yes']
Unique Number: 2
SeniorCitizen
No     5901
Yes    1142
Name: count, dtype: int64 

Column: Partner -> ['Yes' 'No']
Unique Number: 2
Partner
No     3641
Yes    3402
Name: count, dtype: int64 

Column: Dependents -> ['No' 'Yes']
Unique Number: 2
Dependents
No     4933
Yes    2110
Name: count, dtype: int64 

Column: PhoneService -> ['No' 'Yes']
Unique Number: 2
PhoneService
Yes    6361
No      682
Name: count, dtype: int64 

Column: MultipleLines -> ['No' 'Yes']
Unique Number: 2
MultipleLines
No     4072
Yes    2971
Name: count, dtype: int64 

Column: InternetService -> ['DSL' 'Fiber optic' 'No']
Unique Number: 3
InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64 

Column: OnlineSecurity -> ['No' 'Yes']
Unique Number: 2
OnlineSecurity
No     5024
Yes    2019
Name: count, dtype: int64 


In [None]:
df.isnull().sum()

* *We need to drop the custoemerID column that each customer has unique, but I will leave it this way because I will use this value in my future applications. And make the TotalCharges variable numeric. We know from the EDA notebook that the TotalCharges variable has 11 empty values. Let's statistically fill these values using tenure and MonthlyCharges variables.*
* *In MultipleLines, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV and StreamingMovies variables 'No' and 'No internet-phone service' are used repeatedly although they mean the same thing. These need to be merged during the model development phase.*

In [None]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

df['MultipleLines'] = df['MultipleLines'].replace('No phone service','No')
columns_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for column in columns_to_replace:
    df[column] = df[column].replace('No internet service', 'No')
    
# Changing categorical variables to numeric:
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

In [7]:
# Save the edited data
df.to_parquet('churn_data_regulated.parquet')

## Preparation of Data for the CatBoost Model

Telco dataset is an unbalanced data set. We need to maintain the balance ratio when splitting this dataset into training and test sets. Let's use StratifiedShuffleSplit to maintain this ratio.

In [8]:
# Create the StratifiedShuffleSplit object
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=64)

train_index, test_index = next(strat_split.split(df, df["Churn"]))

# Create train and test sets
strat_train_set = df.loc[train_index]
strat_test_set = df.loc[test_index]

# Proportion of the target variable in the original data set
print('Target Labels Ratio in Original Dataset\n')
print(df["Churn"].value_counts(normalize=True).sort_index())

# Proportion of the target variable in the test set
print('\nTarget Labels Ratio in Test Dataset\n')
print(strat_test_set["Churn"].value_counts(normalize=True).sort_index())

X_train = strat_train_set.drop("Churn", axis=1)
y_train = strat_train_set["Churn"].copy()

X_test = strat_test_set.drop("Churn", axis=1)
y_test = strat_test_set["Churn"].copy()

Target Labels Ratio in Original Dataset

Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

Target Labels Ratio in Test Dataset

Churn
No     0.734564
Yes    0.265436
Name: proportion, dtype: float64


In [None]:
# Save the datasets
joblib.dump(X_train, 'X_train.pkl')
joblib.dump(y_train, 'y_train.pkl')
joblib.dump(X_test, 'X_test.pkl')
joblib.dump(y_test, 'y_test.pkl')

## CatBoost Model

In [9]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

cat_model = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=3)

cat_model.fit(X_train, y_train, cat_features=categorical_columns, eval_set=(X_test, y_test))

y_pred = cat_model.predict(X_test)

accuracy, recall, roc_auc, precision = [round(metric(y_test, y_pred), 4) for metric in [accuracy_score, recall_score, roc_auc_score, precision_score]]

model_names = ['CatBoost_Model']

result = pd.DataFrame({'Accuracy':accuracy,
                       'Recall':recall, 
                       'Roc_Auc':roc_auc, 
                       'Precision':precision}, index=model_names)

result

ValueError: 'Churn' is not in list

In [None]:
cat_model.save_model('cat_model.cbm')

In [None]:
pool = Pool(X_train, y_train, cat_features=categorical_columns)

feature_importance = pd.DataFrame({'feature_importance': cat_model.get_feature_importance(pool), 
                      'feature_names': X_train.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

feature_importance

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=feature_importance['feature_importance'], y=feature_importance['feature_names'], palette = 'rocket')
plt.show()

In [None]:
cat_confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(cat_confusion_matrix, annot=True, fmt="d")

plt.xlabel("Predicted Label", fontsize= 12)
plt.ylabel("True Label", fontsize= 12)

plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred, labels = [0, 1]))

# SHAP

In [None]:
explainercat = shap.TreeExplainer(cat_model)
shap_values_cat_train = explainercat.shap_values(X_train)
shap_values_cat_test = explainercat.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values_cat_train, X_train, plot_type="bar",plot_size=(12,12))

In [None]:
# summarize the effects of all the features

fig = plt.subplots(figsize=(6,6),dpi=200)

ax = shap.summary_plot(shap_values_cat_train, X_train,plot_type="dot")

In [None]:
# Contract

fig, ax= plt.subplots(figsize=(6,6),dpi=100)
shp_plt = shap.dependence_plot("Contract", shap_values_cat_test, X_test,ax=ax,interaction_index=None)

In [None]:
# Internet Service

fig, ax= plt.subplots(figsize=(6,6),dpi=100)

shap.dependence_plot("InternetService", shap_values_cat_test, X_test,ax=ax, interaction_index=None)

In [None]:
# MonthlyCharges

fig, ax1= plt.subplots(figsize=(6,6),dpi=150)

shp_plt = shap.dependence_plot("MonthlyCharges", shap_values_cat_test,X_test,ax=ax1, interaction_index=None)

In [None]:
# Tenure - Contract Churn

fig, ax3 = plt.subplots(figsize=(6,6),dpi=200)
shp_plt = shap.dependence_plot("tenure", shap_values_cat_test, X_test, interaction_index="Contract", ax=ax3, cmap=plt.get_cmap('autumn'))

In [None]:
fig = plt.subplots(figsize=(6,6),dpi=200)
ax_1= shap.decision_plot(explainercat.expected_value, shap_values_cat_test[:50], X_test.iloc[:50],a uto_size_plot=False, link= "logit")

In [None]:
# Customer Specific Cases: Customer 13 and Customer 64

# X_test.iloc[13]

fig = plt.subplots(figsize=(6,6),dpi=200)
ax_2= shap.decision_plot(explainercat.expected_value, shap_values_cat_test[13], X_test.iloc[[13]], link= "logit")

In [None]:
# Let's see our model prediction on the customer 15:

print(f'Customer 13: Actual value for the Customer Churn : {y_test.iloc[13]}')
print(f"Customer 13: CatBoost Model's prediction for the Customer Churn : {y_pred[13]}")

In [None]:
shap_values_cat_test[64]

In [None]:
# X_test.iloc[64]

fig = plt.subplots(figsize=(6,6),dpi=200)
ax_2= shap.decision_plot(explainercat.expected_value, shap_values_cat_test[64], X_test.iloc[[64]], link= "logit")

In [None]:
print(f'Customer 64: Actual value for the Customer Churn : {y_test.iloc[64]}')
print(f"Customer 64: CatBoost Model's prediction for the Customer Churn : {y_pred[64]}")

In [None]:
# SHAP FORCE PLOT

shap.initjs()
shap.force_plot(explainercat.expected_value, shap_values_cat_test[:75], X_test.iloc[:75],link= "logit")

In [None]:
shap.force_plot(
    explainercat.expected_value, 
    shap_values_cat_test[13], 
    X_test.iloc[[13]],
    matplotlib=True
)

In [None]:
## Waterfall customer 13

fig = plt.subplots(figsize=(6,6),dpi=150)
ax_3= shap.plots._waterfall.waterfall_legacy(explainercat.expected_value, shap_values_cat_test[13], feature_names = X_test.columns,max_display = 20)b

# XGBOOST

In [None]:
df = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.drop('customerID', axis=1, inplace=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

df['MultipleLines'] = df['MultipleLines'].replace('No phone service','No')
columns_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for column in columns_to_replace:
    df[column] = df[column].replace('No internet service', 'No')
    
# Changing categorical variables to numeric:
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

In [None]:
# OrdinalEncoder
encoder = OrdinalEncoder()
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

X = df.drop('Churn', axis=1).copy()
y = df['Churn'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical columns in Train data
X_train_encoded = encoder.fit_transform(X_train[categorical_columns])

# Encode categorical columns in test data (no fit, only transform)
X_test_encoded = encoder.transform(X_test[categorical_columns])

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=categorical_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=categorical_columns)

xgb = XGBClassifier(random_state=0,scale_pos_weight=3)

xgb.fit(X_train_encoded_df, y_train)
y_pred = xgb.predict(X_test_encoded_df)

accuracy, recall, roc_auc, precision = [round(metric(y_test, y_pred), 4) for metric in [accuracy_score, recall_score, roc_auc_score, precision_score]]

model_names = ['XGBoost_adjusted_weight_3']
result_df3 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df3


In [None]:
plot_importance(xgb)
plt.show()

# LIME

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_encoded_df.values, feature_names=X_train_encoded_df.columns.values.tolist(),
                                                  class_names=['Churn'], verbose=True, mode='classification')

# Choose the jth instance and use it to predict the results for that selection
j = 13
exp = explainer.explain_instance(X_train_encoded_df.values[j], xgb.predict_proba, num_features=5)

In [None]:
exp.show_in_notebook(show_table=True)