In [5]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/ 
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Download the telecom customer churn dataset archive with the Kaggle CLI.
# NOTE(review): needs the `kaggle` package installed and the API token in place
# before this cell runs — confirm the cell order on a fresh kernel.
!kaggle datasets download -d shilongzhuang/telecom-customer-churn-by-maven-analytics

In [3]:
!pip install kaggle

In [7]:
!unzip telecom-customer-churn-by-maven-analytics.zip

In [8]:
!pip install ml_insights -q

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ml_insights as mli
%matplotlib inline
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
!pip install lightgbm

In [5]:
# Load the extracted CSV into a DataFrame and preview its first rows.
data = pd.read_csv('telecom_customer_churn.csv')
data.head()

In [6]:
# Summarise the raw frame: column names, non-null counts and dtypes.
data.info()


In [7]:
# Keep a positional subset of 19 columns from the raw frame.
# Later cells reference 'Customer Status' by name and treat the LAST selected
# column as the target, so index 35 is presumably 'Customer Status' — TODO confirm.
# NOTE(review): not idempotent — re-running on the already-reduced frame raises
# an out-of-bounds IndexError because `data` is overwritten here.
data = data.iloc[:, [1, 2, 3, 4, 9, 10, 11, 12, 15, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]]


In [8]:
# Discard rows whose status is 'Joined' and renumber the remaining rows from 0.
data = data.loc[data['Customer Status'].ne('Joined')].reset_index(drop=True)


In [9]:
# Count the rows per remaining 'Customer Status' value.
data['Customer Status'].value_counts()


In [10]:
# Encode the target as 0/1: 1 for 'Churned', 0 for every other status.
# The dtype guard makes the cell idempotent — without it, re-running the cell
# would compare the already-encoded integers with 'Churned' and silently turn
# every label into 0.
if not pd.api.types.is_numeric_dtype(data['Customer Status']):
    data['Customer Status'] = data['Customer Status'].eq('Churned').astype('int64')


In [11]:
# Class balance of the target after the 0/1 encoding.
data['Customer Status'].value_counts()


In [12]:
# Preview the reduced frame with the encoded target.
data.head()


In [13]:
# Names of all non-numeric columns, plus their positional indices within `data`
cat_features = list(data.select_dtypes(exclude=np.number).columns)
cat_idx = list(map(data.columns.get_loc, cat_features))

# Cast each of those columns to the pandas `category` dtype
for col in cat_features:
    data[col] = data[col].astype('category')

In [14]:
# Re-check dtypes after the categorical conversion.
data.info()


In [15]:
# Target share of the FULL dataset for each split (must sum to 1).
train_perc = 0.4
val_perc = 0.3
test_perc = 0.3
rs = 1234  # random seed shared by the splits and the model

assert abs(train_perc + val_perc + test_perc - 1.0) < 1e-9, "split fractions must sum to 1"

# Step 1: hold out the test split. The features are every column but the last;
# the last column is the target, and both splits are stratified on it.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=test_perc,
    random_state=rs,
    stratify=data.iloc[:, -1],
)

# Step 2: carve the validation split out of the remainder. `test_size` here is
# relative to the remainder, so the fraction is rescaled; passing `val_perc`
# directly would give 21% validation / 49% train instead of the declared 30% / 40%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=val_perc / (train_perc + val_perc),
    random_state=rs,
    stratify=y_train_val,
)

In [16]:
# Sanity-check the sizes of the train / validation / test splits.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

In [17]:
# Class balance of the training labels.
y_train.value_counts()


In [18]:
# Class balance of the test labels.
y_test.value_counts()

In [19]:
%%time

lgbm_clf = lgbm.LGBMClassifier(
    objective="binary",
    random_state=rs,
    n_estimators=10,
    boosting="gbdt",  # default histogram binning of LGBM
    class_weight="balanced",
)

lgbm_clf.fit(
    X_train,
    y_train,
    categorical_feature=cat_idx

)

preds_uncalibrated_val = lgbm_clf.predict_proba(X_val)[:, 1]
print(f"LightGBM logloss on the evaluation set: {log_loss(y_val, preds_uncalibrated_val):.5f}")
print(f"LightGBM ROC-AUC on the evaluation set: {roc_auc_score(y_val, preds_uncalibrated_val):.5f}")
print(f"LightGBM F1 on the evaluation set: {f1_score(y_val, lgbm_clf.predict(X_val)):.5f}")

In [20]:
# Positive-class probabilities of the uncalibrated model on the test split
preds_uncalibrated_test = lgbm_clf.predict_proba(X_test)[:, 1]

# Compute the three test metrics first, then report them
test_logloss_uncalibrated = log_loss(y_test, preds_uncalibrated_test)
test_auc_uncalibrated = roc_auc_score(y_test, preds_uncalibrated_test)
test_f1_uncalibrated = f1_score(y_test, lgbm_clf.predict(X_test))

print(f"LightGBM logloss on the test set: {test_logloss_uncalibrated:.5f}")
print(f"LightGBM ROC-AUC on the test set: {test_auc_uncalibrated:.5f}")
print(f"LightGBM F1 on the test set: {test_f1_uncalibrated:.5f}")
print()

# Reliability diagram (with prediction histogram) for the uncalibrated scores
plt.figure(figsize=(15, 5))
rd = mli.plot_reliability_diagram(
    y_test,
    preds_uncalibrated_test,
    show_histogram=True,
)

In [30]:
# Inspection cell: display the test-set labels.
y_test

In [31]:
# Inspection cell: display the uncalibrated positive-class probabilities for the test set.
preds_uncalibrated_test

In [33]:
# Inspection cell: plain-text print of the test-set labels.
print(y_test)

In [34]:
# Inspection cell: check which container type holds the test-set labels.
type(y_test)

In [35]:
# Inspection cell: display the uncalibrated test-set probabilities again.
preds_uncalibrated_test

In [36]:
# Inspection cell: shape of the test-set probability array.
preds_uncalibrated_test.shape

In [21]:
from sklearn.calibration import calibration_curve


def plot_calibration_curve(name, fig_index, probs, y_true=None, n_bins=20):
    """Plot a calibration curve and a histogram of predicted probabilities.

    The upper panel shows the fraction of positives against the mean predicted
    value per bin, next to the diagonal of a perfectly calibrated model. The
    lower panel shows how the predicted probabilities are distributed.

    Parameters
    ----------
    name : str
        Label used for the curve, the histogram and the plot title.
    fig_index : int or str
        Figure identifier handed to ``plt.figure``.
    probs : array-like
        Predicted probabilities of the positive class.
    y_true : array-like, optional
        True binary labels aligned with ``probs``. When omitted, the
        notebook-level ``y_test`` is used, which keeps existing
        three-argument calls working.
    n_bins : int, default 20
        Number of bins used for both the calibration curve and the histogram.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    if y_true is None:
        # Backward-compatible fallback to the global the original version read.
        y_true = y_test

    plt.figure(fig_index, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    # Reference diagonal: predicted probability equals observed frequency.
    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    frac_of_pos, mean_pred_value = calibration_curve(y_true, probs, n_bins=n_bins)

    ax1.plot(mean_pred_value, frac_of_pos, "s-", label=f'{name}')
    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title(f'Calibration plot ({name})')

    # Same bin count as the curve so the two panels line up.
    ax2.hist(probs, range=(0, 1), bins=n_bins, label=name, histtype="step", lw=2)
    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")


# Calibration plot for the raw (uncalibrated) LightGBM test-set probabilities
plot_calibration_curve(
    name="LightGBM Uncalibrated",
    fig_index=1,
    probs=preds_uncalibrated_test,
)

In [22]:
# Platt scaling: a one-feature logistic regression fitted on the validation-set
# probabilities of the uncalibrated model. The very large C makes the
# regularisation penalty negligible.
lr = LogisticRegression(solver='lbfgs', C=99999999999)
lr.fit(preds_uncalibrated_val[:, np.newaxis], y_val)

In [23]:
# Map the uncalibrated test-set probabilities through the fitted Platt model
testset_platt_probs = lr.predict_proba(preds_uncalibrated_test[:, np.newaxis])[:, 1]

# Compute the test metrics on the calibrated probabilities, then report them
platt_test_logloss = log_loss(y_test, testset_platt_probs)
platt_test_auc = roc_auc_score(y_test, testset_platt_probs)
print(f"LightGBM logloss on the test set: {platt_test_logloss:.5f}")
print(f"LightGBM ROC-AUC on the test set: {platt_test_auc:.5f}")
print()

# Reliability diagram of the calibrated probabilities, drawn on a logit scale
plt.figure(figsize=(15, 5))
mli.plot_reliability_diagram(
    y_test,
    testset_platt_probs,
    show_histogram=True,
    scaling='logit',
);
plt.title('Reliability Diagram on Test Data\n after Platt Calibration');