# **TELCO CHURN ANALYSIS**

A telecommunications company aims to analyze customer behavior to identify those with a high likelihood of long-term retention. However, with the rapidly evolving telecommunications landscape, customers are constantly seeking the best services to meet their daily needs. To ensure sustained growth and competitiveness, the company must analyze potential customers before onboarding them, allowing for more targeted strategies that promote long-term loyalty.

**Main Objectives:**
1. Provide characteristics on churning customers
2. Deliver data to further increase efficiency on marketing costs

In this case, a classification model is the best option, since the output is either 'Yes' or 'No'. The model is therefore evaluated with a **`confusion matrix`**. However, the two kinds of misclassification it exposes each carry a business cost:
- **False Positive (FP)**: predicted as churn, actual stayed
    1. Due to the prediction of churning, customers who are predicted to churn are given special benefits in order to prevent them from churning.
    2. Customers are given special promotions (such as discount, special prices for subscription, bundling, etc.), however the customer stayed, therefore those promotions are considered ineffective.
- **False Negative (FN)**: predicted as stayed, actual churn
    1. Due to the prediction of not churning, customers who are predicted to not churn at all are not given any special treatment.
    2. In order to maintain and increase company revenue, customers who churn need to be replaced as soon as possible, while the cost of acquiring new customers is much higher than that of maintaining existing ones.

From the consequences above, it is concluded that the loss is greater when the model predicts that a customer will stay but the customer actually churns. Therefore the number of **`False Negatives`** needs to be kept as low as possible, which in this case means scoring the models with the **`f2 score`** (the F-beta score with beta = 2, which weights recall more heavily than precision).

---

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

#visualization
import seaborn as sns
import matplotlib.pyplot as plt

#map
import folium
from folium.plugins import MarkerCluster
from IPython.display import display

# train test split
from sklearn.model_selection import train_test_split

# encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import OrdinalEncoder, BinaryEncoder

# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 

# column transformer & pipeline
from sklearn.compose import ColumnTransformer
# NOTE: the two imports below bind the same name. The imblearn import runs
# last, so after this cell `Pipeline` refers to imblearn's class.
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline

# cross validation
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

# imbalance
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# ensemble similar type
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier              # Bagging
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier         # Boosting

from xgboost.sklearn import XGBClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, fbeta_score

# model results
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# export model
import pickle
import joblib

import warnings
# Suppress all warnings from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Session handle used by the later cells (e.g. `session.file.put`).
session = get_active_session()

In [None]:
-- Training extract: rows of TELCO.RAW.TB_R_CHURN whose _FIVETRAN_SYNCED date
-- equals the most recent sync date, with tenure, total charges and CLTV
-- derived from DATE_JOINED relative to CURRENT_DATE.
WITH MAX_DATE AS (
    SELECT MAX(TO_DATE(_FIVETRAN_SYNCED)) AS max_date
    FROM TELCO.RAW.TB_R_CHURN
)
SELECT
    a.CUSTOMER_ID,
    a.GENDER,
    a.BIRTH_DATE,
    a.SENIOR_CITIZEN,
    a.PARTNER,
    a.DEPENDENTS,
    a.COUNTRY_CODE,
    a.STATE,
    a.CITY,
    a.ZIP_CODE,
    a.LATITUDE,
    a.LONGITUDE,
    a.PHONE_SERVICE,
    a.MULTIPLE_LINES,
    a.INTERNET_SERVICE,
    a.ONLINE_SECURITY,
    a.ONLINE_BACKUP,
    a.DEVICE_PROTECTION,
    a.TECH_SUPPORT,
    a.STREAMING_TV,
    a.STREAMING_MOVIES,
    a.CONTRACT,
    a.PAPERLESS_BILLING,
    a.PAYMENT_METHOD,
    a.DATE_JOINED,
    a.QUARTER,
    -- Whole months between the join date and today
    DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) AS TENURE_MONTHS,
    a.MONTHLY_CHARGES,
    -- Tenure in months times the monthly charge
    (DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES) AS TOTAL_CHARGES,
    -- Total charges, scaled by tenure in years once that exceeds one year
    CASE
        WHEN DATEDIFF('year', a.DATE_JOINED, CURRENT_DATE) <= 1
            THEN (DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES) * 1
        ELSE DATEDIFF('year', a.DATE_JOINED, CURRENT_DATE) * (DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES)
    END AS CLTV,
    a.CHURN_LABEL,
    a.CHURN_REASON
FROM TELCO.RAW.TB_R_CHURN a
-- Keep only the rows synced on the latest date
INNER JOIN MAX_DATE b
    ON TO_DATE(a._FIVETRAN_SYNCED) = b.max_date

In [None]:
# Load the extract into pandas.
# NOTE(review): `churn_data` is presumably the result of the SQL cell above
# (referenced by cell name) - confirm the cell is named accordingly.
df = churn_data.to_pandas()

cat_columns = df.describe(include= 'object').columns
num_columns = df.describe().columns

# Identifier, date, location and free-text columns are not plotted.
skipped_columns = {
    'CUSTOMER_ID', 'BIRTH_DATE', 'COUNTRY_CODE', 'ZIP_CODE',
    'LATITUDE', 'LONGITUDE', 'DATE_JOINED', 'CHURN_REASON',
}
plotted_columns = [col for col in df.columns if col not in skipped_columns]

plt.figure(figsize=(20,20))

# One panel per remaining column on a 6x5 grid: categorical columns get a
# histogram split by churn label, every other column a violin plot per label.
for position, col in enumerate(plotted_columns, start=1):
    plt.subplot(6, 5, position)

    if col in cat_columns:
        sns.histplot(data= df, x= col, hue= 'CHURN_LABEL')
    else:
        sns.violinplot(data= df, x= 'CHURN_LABEL', y= col)

    plt.xlabel(None)
    plt.title(col)
    plt.xticks(rotation= 90, size= 8)

# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()

In [None]:
# Parse the birth date (errors='coerce'), expose its year / month / day as
# separate feature columns, then drop the identifier and raw columns that
# are not used for modelling.
df['BIRTH_DATE'] = pd.to_datetime(df['BIRTH_DATE'], errors='coerce')

birth_parts = {'BIRTH_YEAR': 'year', 'BIRTH_MONTH': 'month', 'BIRTH_DAY': 'day'}
for feature_name, date_part in birth_parts.items():
    df[feature_name] = getattr(df['BIRTH_DATE'].dt, date_part)

unused_columns = ['CUSTOMER_ID', 'DATE_JOINED', 'BIRTH_DATE', 'ZIP_CODE', 'CHURN_REASON']
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Encode the target: 1 where the label is 'Yes', 0 for anything else.
is_churn = df['CHURN_LABEL'] == 'Yes'
df['CHURN_LABEL'] = np.where(is_churn, 1, 0)

# Separate the target from the feature matrix.
y = df['CHURN_LABEL']
X = df.drop(columns= 'CHURN_LABEL')

In [None]:
# Hold out 30% of the rows for testing, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size= 0.3,
    stratify= y,
    random_state= 0,
)

In [None]:
# Feature columns grouped by the preprocessing applied to them.
onehot_columns = [
    'GENDER', 'SENIOR_CITIZEN', 'PARTNER', 'DEPENDENTS', 'COUNTRY_CODE',
    'PHONE_SERVICE', 'MULTIPLE_LINES', 'INTERNET_SERVICE', 'ONLINE_SECURITY',
    'ONLINE_BACKUP', 'DEVICE_PROTECTION', 'TECH_SUPPORT', 'STREAMING_TV',
    'STREAMING_MOVIES', 'CONTRACT', 'PAPERLESS_BILLING', 'PAYMENT_METHOD',
]
binary_columns = ['STATE', 'CITY', 'QUARTER']
scaled_columns = [
    'LATITUDE', 'LONGITUDE', 'TENURE_MONTHS', 'MONTHLY_CHARGES',
    'TOTAL_CHARGES', 'CLTV', 'BIRTH_YEAR', 'BIRTH_MONTH', 'BIRTH_DAY',
]

# One-hot (first level dropped), binary encoding and robust scaling;
# any column not listed above is passed through unchanged.
transformer = ColumnTransformer(
    transformers= [
        ('onehot', OneHotEncoder(drop= 'first'), onehot_columns),
        ('binary', BinaryEncoder(), binary_columns),
        ('robust', RobustScaler(), scaled_columns),
    ],
    remainder= 'passthrough',
)

transformer.fit(X_train)

In [None]:
# Candidate classifiers, all seeded where the estimator accepts a seed.
logreg = LogisticRegression(random_state=0, max_iter= 1000)
knn = KNeighborsClassifier(n_neighbors=5)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)

randforest = RandomForestClassifier(random_state=0)
bagging = BaggingClassifier(random_state=0, estimator= KNeighborsClassifier(n_neighbors= 3))

adaboost = AdaBoostClassifier(random_state=0)
gboost = GradientBoostingClassifier(random_state= 0)
xgboost = XGBClassifier(random_state= 0)

# Shared by every candidate: random oversampling and the F2 scorer.
resampling = RandomOverSampler(random_state=0)
f2 = make_scorer(fbeta_score, beta=2)
list_algo = [logreg, knn, tree, randforest, bagging, adaboost, gboost, xgboost]

list_mean_f2 = []
list_std_f2 = []

# 5-fold cross-validated F2 on the training split for each candidate.
for algo in list_algo:
    pipe_model = Pipeline([
        ('resampling', resampling),
        ('preprocessing', transformer),
        ('modeling', algo),
    ])

    cv_score_best_algo = cross_val_score(
        estimator= pipe_model,
        X= X_train,
        y= y_train,
        cv= 5,
        scoring= f2
    )

    # Aggregate the unrounded fold scores and round only the summary values,
    # so the mean and spread are not distorted by per-fold rounding.
    list_mean_f2.append(cv_score_best_algo.mean().round(3))
    list_std_f2.append(cv_score_best_algo.std().round(3))

df_cv = pd.DataFrame()
df_cv['algo'] = list_algo
df_cv['mean_f2'] = list_mean_f2
df_cv['std_f2'] = list_std_f2

# Best mean F2 first; ties broken by the smaller standard deviation.
df_cv.sort_values(by= ['mean_f2', 'std_f2'], ascending= [False, True])

In [None]:
# Resampling strategies to compare; None disables the resampling step.
list_resampling = [RandomOverSampler(random_state=0), RandomUnderSampler(random_state=0), SMOTE(random_state=0), NearMiss(), None]

# Preprocessing runs BEFORE resampling. SMOTE and NearMiss need numeric input,
# while the raw features still contain string categories; with the sampler
# placed first those candidates cannot be fitted and the search scores them
# as NaN (unnoticed here because warnings are silenced in this notebook).
# Step names are unchanged, so the parameter grid below still applies.
model_pipe_best = Pipeline([
    ('prep', transformer),
    ('resampling', resampling),
    ('modeling', tree)
])

hyperparam_tree = {
    'resampling' : list_resampling,

    # Swap the scaler used by the 'robust' entry of the column transformer
    'prep__robust' : [RobustScaler(), MinMaxScaler(), StandardScaler()],

    'modeling__max_depth': [3, 5, 7, 10],                    # tree depth
    'modeling__min_samples_split': [5, 10, 20, 30],          # minimum samples to split a node
    'modeling__min_samples_leaf': [1, 2, 4, 6],              # minimum samples at leaf node
}

# Exhaustive search over the grid, 5-fold CV, optimising F2.
gridsearch_tree = GridSearchCV(
    estimator= model_pipe_best,
    param_grid= hyperparam_tree,
    cv= 5,
    scoring= f2,
    n_jobs= 1
)

gridsearch_tree.fit(X_train, y_train)
gridsearch_tree.best_score_

In [None]:
# Baseline: the untuned decision tree behind the preprocessing step only.
pipe_model_tree = Pipeline(steps=[
    ('preprocessing', transformer),
    ('modeling', tree),
])
pipe_model_tree.fit(X_train, y_train)
y_pred_untuned = pipe_model_tree.predict(X_test)

# Tuned: the best pipeline found by the grid search, fitted on the training split.
best_model = gridsearch_tree.best_estimator_
best_model.fit(X_train, y_train)
y_pred_tuned = best_model.predict(X_test)

# Compare both models on the held-out test set with the F2 score.
f2_untuned = fbeta_score(y_test, y_pred_untuned, beta= 2)
f2_tuned = fbeta_score(y_test, y_pred_tuned, beta= 2)

print(f'''{f2_untuned} - Before Tuning.
{f2_tuned} - After Tuning.''')

In [None]:
plt.figure(figsize=(20,4))

# (subplot slot, title, predictions). Slots 2 and 3 of the 1x4 grid are used,
# leaving the outer slots empty.
panels = [
    (2, 'Before Tuning', y_pred_untuned),
    (3, 'After Tuning', y_pred_tuned),
]

# Confusion matrix on the test set for each model: rows are actual labels,
# columns are predicted labels.
for slot, title, predictions in panels:
    plt.subplot(1, 4, slot)
    sns.heatmap(data= confusion_matrix(y_test, predictions), annot= True, fmt= 'g')
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predict')

plt.tight_layout()

In [None]:
# Refit the selected pipeline on the full dataset (train + test) before export.
best_model.fit(X, y)

# Serialise to local disk, then upload the file as-is (no compression,
# replacing any existing copy) to the model stage.
local_model_path = "/tmp/churn_status_best_model.sav"
joblib.dump(best_model, local_model_path)

session.file.put(local_model_path, "@TELCO.RAW.ML_MODEL", auto_compress=False, overwrite=True)

In [None]:
CREATE OR REPLACE PROCEDURE TELCO.DATAMART.SP_CUSTOMER_STATUS_PREDICTION()
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.9'
PACKAGES = ('pandas', 'scikit-learn', 'joblib', 'snowflake-snowpark-python', 'imbalanced-learn', 'category_encoders')
HANDLER = 'main'
AS
$$
import pandas as pd
import joblib
from snowflake.snowpark import Session

# Columns fetched for identification / derivation only; dropped before predict.
NON_FEATURE_COLUMNS = ['CUSTOMER_ID', 'DATE_JOINED', 'BIRTH_DATE', 'ZIP_CODE', 'CHURN_REASON']

STAGED_MODEL_PATH = "@TELCO.RAW.ML_MODEL/churn_status_best_model.sav"
LOCAL_MODEL_PATH = "/tmp/churn_status_best_model.sav"


def main(session: Session) -> str:
    """Score the latest snapshot of TB_R_CHURN and append the predictions.

    Reads the rows whose _FIVETRAN_SYNCED date is the most recent one, builds
    the same feature columns the model was trained on, predicts churn with the
    staged model and appends (CUSTOMER_ID, PREDICTION_RESULTS, PREDICTION_DATE)
    to TELCO.DATAMART.TB_R_CHURN_PREDICTION.

    Returns a status message with the number of records scored.
    """
    # TENURE_MONTHS, TOTAL_CHARGES and CLTV are derived with the same
    # expressions as the training extract, so the model sees features
    # computed the same way at training and at prediction time.
    query = """
        WITH MAX_DATE AS (
            SELECT 
                MAX(TO_DATE(_FIVETRAN_SYNCED)) AS max_date
            FROM TELCO.RAW.TB_R_CHURN
        )
        SELECT 
            a.CUSTOMER_ID
            ,a.GENDER
            ,a.BIRTH_DATE
            ,a.SENIOR_CITIZEN
            ,a.PARTNER
            ,a.DEPENDENTS
            ,a.COUNTRY_CODE
            ,a.STATE
            ,a.CITY
            ,a.ZIP_CODE
            ,a.LATITUDE
            ,a.LONGITUDE
            ,a.PHONE_SERVICE
            ,a.MULTIPLE_LINES
            ,a.INTERNET_SERVICE
            ,a.ONLINE_SECURITY
            ,a.ONLINE_BACKUP
            ,a.DEVICE_PROTECTION
            ,a.TECH_SUPPORT
            ,a.STREAMING_TV
            ,a.STREAMING_MOVIES
            ,a.CONTRACT
            ,a.PAPERLESS_BILLING
            ,a.PAYMENT_METHOD
            ,a.DATE_JOINED
            ,a.QUARTER
            ,DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) AS TENURE_MONTHS
            ,a.MONTHLY_CHARGES
            ,(DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES) AS TOTAL_CHARGES
            ,CASE 
                WHEN DATEDIFF('year', a.DATE_JOINED, CURRENT_DATE) <= 1 THEN (DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES) * 1
                ELSE DATEDIFF('year', a.DATE_JOINED, CURRENT_DATE) * (DATEDIFF('month', a.DATE_JOINED, CURRENT_DATE) * a.MONTHLY_CHARGES)
            END AS CLTV
            ,a.CHURN_REASON
        FROM 
            TELCO.RAW.TB_R_CHURN a
        LEFT JOIN MAX_DATE b ON TO_DATE(a._FIVETRAN_SYNCED) = b.max_date
        WHERE b.max_date IS NOT NULL
    """
    df_main = session.sql(query).to_pandas()

    # Nothing to score: skip the model download and the table write.
    if df_main.empty:
        return f"Predictions completed for {len(df_main)} records."

    # 1. Build the feature frame exactly as in training. BIRTH_DATE is parsed
    #    explicitly because the `.dt` accessor needs a datetime column.
    df_new = df_main.copy()
    birth_date = pd.to_datetime(df_new['BIRTH_DATE'], errors='coerce')
    df_new['BIRTH_YEAR'] = birth_date.dt.year
    df_new['BIRTH_MONTH'] = birth_date.dt.month
    df_new['BIRTH_DAY'] = birth_date.dt.day
    df_new = df_new.drop(columns=NON_FEATURE_COLUMNS)

    # 2. Load the model from the stage (downloaded to /tmp in the runtime).
    #    NOTE: joblib.load unpickles the file - only load artefacts written
    #    to this stage by a trusted process.
    session.file.get(STAGED_MODEL_PATH, "/tmp")
    model = joblib.load(LOCAL_MODEL_PATH)

    # 3. Predict and map the numeric classes back to the source labels.
    predicted_status = pd.Series(model.predict(df_new), index=df_main.index)
    predicted_status = predicted_status.replace({1: "Yes", 0: "No"})

    prediction_date = session.sql("SELECT CURRENT_DATE()").collect()[0][0]
    df_export = pd.DataFrame({
        'CUSTOMER_ID': df_main['CUSTOMER_ID'],
        'PREDICTION_RESULTS': predicted_status,
        'PREDICTION_DATE': prediction_date,
    })

    # 4. Append the results to the prediction history table.
    session.create_dataframe(df_export).write.mode("append").save_as_table("TELCO.DATAMART.TB_R_CHURN_PREDICTION")

    return f"Predictions completed for {len(df_new)} records."
$$;

In [None]:
-- Runs the Python scoring procedure, then rebuilds TB_F_CHURN_PREDICTION from
-- the prediction table joined to the customer, service, revenue and date
-- tables. Customers predicted 'Yes' get a short churn reason generated by
-- SNOWFLAKE.CORTEX.COMPLETE; customers predicted 'No' get a NULL reason.
CREATE OR REPLACE PROCEDURE TELCO.DATAMART.SP_CHURN_PREDICTION()
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
    -- Score the latest snapshot first so the table below includes its rows.
    CALL TELCO.DATAMART.SP_CUSTOMER_STATUS_PREDICTION();

    -- The table is fully replaced on every run. There is no filter on
    -- PREDICTION_DATE, so every row of TB_R_CHURN_PREDICTION is processed.
    -- NOTE(review): CONCAT presumably yields NULL when any argument is NULL;
    -- since b, c, d and f are LEFT JOINed, a missing match could null the
    -- whole prompt - confirm and consider COALESCE on nullable fields.
    -- NOTE(review): DISTINCT also compares the generated CHURN_REASON text,
    -- which may differ between otherwise identical rows - confirm intent.
    CREATE OR REPLACE TABLE TELCO.DATAMART.TB_F_CHURN_PREDICTION AS
    SELECT DISTINCT
        a.CUSTOMER_ID,
        a.PREDICTION_RESULTS,
        -- Only predicted churners are sent to the LLM
        CASE 
            WHEN a.PREDICTION_RESULTS = 'No' THEN NULL
            ELSE SNOWFLAKE.CORTEX.COMPLETE(
                'snowflake-arctic',
                CONCAT(
                    'You are a churn prediction analyst. ',
                    'Given this customer profile, explain briefly (in 5–10 words) why this customer might churn: ',
                    'Gender: ', b.GENDER, ', ',
                    'Senior Citizen: ', b.SENIOR_CITIZEN, ', ',
                    'Partner: ', b.PARTNER, ', ',
                    'Dependents: ', b.DEPENDENTS, ', ',
                    'Country: ', b.COUNTRY_CODE, ', ',
                    'State: ', b.STATE, ', ',
                    'City: ', b.CITY, ', ',
                    'Latitude: ', b.LATITUDE, ', Longitude: ', b.LONGITUDE, ', ',
                    'Phone Service: ', c.PHONE_SERVICE, ', ',
                    'Multiple Lines: ', c.MULTIPLE_LINES, ', ',
                    'Internet Service: ', c.INTERNET_SERVICE, ', ',
                    'Online Security: ', c.ONLINE_SECURITY, ', ',
                    'Online Backup: ', c.ONLINE_BACKUP, ', ',
                    'Device Protection: ', c.DEVICE_PROTECTION, ', ',
                    'Tech Support: ', c.TECH_SUPPORT, ', ',
                    'Streaming TV: ', c.STREAMING_TV, ', ',
                    'Streaming Movies: ', c.STREAMING_MOVIES, ', ',
                    'Contract: ', c.CONTRACT_TYPE, ', ',
                    'Paperless Billing: ', c.PAPERLESS_BILLING, ', ',
                    'Payment Method: ', c.PAYMENT_METHOD, ', ',
                    'Date Joined: ', d.DATE_JOINED, ', ',
                    'Quarter: ', CONCAT('Q', f.quarter, '-', f.year), ', ',
                    'Tenure (months): ', d.TENURE_MONTHS, ', ',
                    'Monthly Charges: ', d.MONTHLY_CHARGES, ', ',
                    'Total Charges: ', d.TOTAL_CHARGES, ', ',
                    'CLTV: ', d.CLTV, ', ',
                    'Birth Date: ', b.BIRTH_DATE, ', ',
                    'Predicted churn status: ', a.PREDICTION_RESULTS, '.'
                )
            )
        END AS CHURN_REASON,
        a.PREDICTION_DATE
    FROM TELCO.DATAMART.TB_R_CHURN_PREDICTION a
    -- Profile attributes come from the datamart tables, keyed by customer_id;
    -- the date table is reached through the revenue row's join date.
    LEFT JOIN TELCO.DATAMART.TB_R_CUSTOMER b ON a.customer_id = b.customer_id
    LEFT JOIN TELCO.DATAMART.TB_F_SERVICE_USAGE c ON a.customer_id = c.customer_id
    LEFT JOIN TELCO.DATAMART.TB_F_REVENUE d ON a.customer_id = d.customer_id
    LEFT JOIN TELCO.DATAMART.TB_R_DATE f ON d.date_joined = f.date_id;

    RETURN '✅ Churn prediction table created successfully with DISTINCT records.';
END;
$$;

In [None]:
-- SELECT 
--     a.CUSTOMER_ID
--     ,a.PREDICTION_RESULTS
--     ,CASE 
--         WHEN a.PREDICTION_RESULTS = 'No' THEN NULL
--         ELSE SNOWFLAKE.CORTEX.COMPLETE(
--             'snowflake-arctic',
--             CONCAT(
--                 'You are a churn prediction analyst. ',
--                 'Given this customer profile, explain briefly (in 5–10 words) why this customer might churn: ',
--                 'Gender: ', b.GENDER, ', ',
--                 'Senior Citizen: ', b.SENIOR_CITIZEN, ', ',
--                 'Partner: ', b.PARTNER, ', ',
--                 'Dependents: ', b.DEPENDENTS, ', ',
--                 'Country: ', b.COUNTRY_CODE, ', ',
--                 'State: ', b.STATE, ', ',
--                 'City: ', b.CITY, ', ',
--                 'Latitude: ', b.LATITUDE, ', Longitude: ', b.LONGITUDE, ', ',
--                 'Phone Service: ', c.PHONE_SERVICE, ', ',
--                 'Multiple Lines: ', c.MULTIPLE_LINES, ', ',
--                 'Internet Service: ', c.INTERNET_SERVICE, ', ',
--                 'Online Security: ', c.ONLINE_SECURITY, ', ',
--                 'Online Backup: ', c.ONLINE_BACKUP, ', ',
--                 'Device Protection: ', c.DEVICE_PROTECTION, ', ',
--                 'Tech Support: ', c.TECH_SUPPORT, ', ',
--                 'Streaming TV: ', c.STREAMING_TV, ', ',
--                 'Streaming Movies: ', c.STREAMING_MOVIES, ', ',
--                 'Contract: ', c.CONTRACT_TYPE, ', ',
--                 'Paperless Billing: ', c.PAPERLESS_BILLING, ', ',
--                 'Payment Method: ', c.PAYMENT_METHOD, ', ',
--                 'Date Joined: ', d.DATE_JOINED, ', ',
--                 'Quarter: ', CONCAT('Q', f.quarter, '-', f.year), ', ',
--                 'Tenure (months): ', d.TENURE_MONTHS, ', ',
--                 'Monthly Charges: ', d.MONTHLY_CHARGES, ', ',
--                 'Total Charges: ', d.TOTAL_CHARGES, ', ',
--                 'CLTV: ', d.CLTV, ', ',
--                 'Birth Date: ', b.BIRTH_DATE, ', ',
--                 'Predicted churn status: ', a.PREDICTION_RESULTS, '.'
--             )
--         )
--     END AS CHURN_REASON
--     ,a.PREDICTION_DATE
-- FROM TELCO.DATAMART.TB_R_CHURN_PREDICTION a
-- LEFT JOIN TELCO.DATAMART.TB_R_CUSTOMER b ON a.customer_id = b.customer_id
-- LEFT JOIN TELCO.DATAMART.TB_F_SERVICE_USAGE c ON a.customer_id = c.customer_id
-- LEFT JOIN TELCO.DATAMART.TB_F_REVENUE d ON a.customer_id = d.customer_id
-- LEFT JOIN TELCO.DATAMART.TB_R_DATE f ON d.date_joined = f.date_id