# Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Feature Engineering
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# import category_encoders as ce

# Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, auc, precision_score, recall_score, confusion_matrix

# Data

In [2]:
df_telco = pd.read_csv('Telco_customer_churn_clean.csv')

In [3]:
df_telco.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Additional Internet Service t,streamer,p1,p2,p3,p4,p5,p6,p7,problem
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,1,control,0,1,0,1,1,0,0,identified
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,0,control,1,1,0,1,1,1,0,identified
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,1,control,0,1,0,1,1,1,0,identified
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,1,control,0,0,0,1,1,1,0,identified
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,1,control,0,0,0,1,1,0,0,identified


# Preprocessing Pipeline

In [4]:
# features
cat = [
    'Gender', 
    'Senior Citizen',
    'Partner', 
    'Dependents',
    'Phone Service',
    'Multiple Lines', 
    'Internet Service', 
    'Online Security',
    'Online Backup', 
    'Device Protection', 
    'Tech Support', 
    'Streaming TV',
    'Streaming Movies', 
    'Contract', 
    'Paperless Billing', 
    'Payment Method'
]
num = [
    'Latitude', 
    'Longitude',
    'Monthly Charges', 
    'Total Charges',
    'Tenure Months',
    'Tenure Years',
    'newly onboard',
    'Avg Charges',
    'Charge drop',
    'Charge diff'
]

# ML preprocessing Pipeline
transformer = ColumnTransformer([
                                 ('one hot encoder',OneHotEncoder(drop = 'first',handle_unknown = 'error'),cat),
                                 # ('binary encoder',binary_encoder_pipeline,['workclass','marital.status','occupation','native.country'])
                                ], remainder = 'passthrough')

# data for splitting
X = df_telco[cat + num]
y = df_telco['Churn Value']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify = y, train_size = .8,random_state = 2020)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, stratify = y_train, train_size = .8,random_state = 2020)

# Model Fitting

In [5]:
# Model Selection
model = LogisticRegression(random_state = 2025)
estimator = Pipeline([
                      ('preprocess',transformer),
                      ('clf',model)
])

In [6]:
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluate : Data Mining Succes Criteria

## Criteria

highest f1 score when both recall and precision above 0.6

## ML Performance

In [7]:
def metrics_by_threshold(th, estimator, X_val, y_val):
    y_prob = estimator.predict_proba(X_val)
    y_pred = np.where(y_prob[:,1] > th,1,0)

    f1_result = f1_score(y_val, y_pred)
    precision_result = precision_score(y_val, y_pred)
    recall_result = recall_score(y_val, y_pred)
    
    performance = {
        'f1':f1_result,
        'precision':precision_result,
        'recall':recall_result
    }
    return performance

In [8]:
metrics_by_threshold(0.5, estimator, X_val, y_val)

{'f1': 0.5838264299802761,
 'precision': 0.7115384615384616,
 'recall': 0.49498327759197325}

In [9]:
metrics_by_threshold(0.5, estimator, X_train, y_train)

{'f1': 0.5883449883449884,
 'precision': 0.6649104320337197,
 'recall': 0.5275919732441472}

## Threshold Optimization

In [10]:
def performance_th(range_th):

    performance_recap = []
    
    for i in range_th:
        performance = metrics_by_threshold(i, estimator, X_val, y_val)
        performance['threshold'] = i
        performance_recap.append(performance)

    return performance_recap

def get_best_th(performance_recap):
    
    # data mining criteria
    # highest f1 score when both recall and precision above 60%
    
    performance_recap = pd.DataFrame(performance_recap)
    performance_recap_filtered = performance_recap[(performance_recap['recall']>0.6)&(performance_recap['precision']>0.6)] 
    performance_recap_filtered = performance_recap_filtered.sort_values('f1', ascending = False)
    best_th = performance_recap_filtered['threshold'][0:1].values[0]

    return best_th

def view_performance_th_recap(performance_recap):
    performance_recap = pd.DataFrame(performance_recap)
    performance_recap = performance_recap[['threshold','f1','recall','precision']]
    return performance_recap

In [11]:
performance_recap = performance_th([i/100 for i in range(20,60,5)])
best_th = get_best_th(performance_recap)

In [12]:
performance_recap = view_performance_th_recap(performance_recap)

In [13]:
print(best_th)

0.35


In [14]:
performance_recap.to_excel('performance_recap.xlsx',index = False)

# Evaluate : Business Succes Criteria

## Criteria

decrease loss up to 50 %

*with certain assumption

In [15]:
def eval_monetary_impact(estimator, X_val, y_val, best_th, scale, retention_cost, campaign_effectivity):
    y_prob = estimator.predict_proba(X_val)
    y_pred = np.where(y_prob[:,1] > best_th,1,0)

    potential_monthly_charge_loss = round(scale*sum((y_val)*X_val['Monthly Charges']))
    undetected_monthly_charge_loss = round(scale*sum(((y_val) & (1-y_pred))*X_val['Monthly Charges']))
    wasted_retention_cost = round(scale*sum(((1-y_val) & (y_pred))*X_val['Monthly Charges']*retention_cost))
    potentialy_saved_monthly_charge = round(campaign_effectivity*scale*sum(((y_val) & (y_pred))*X_val['Monthly Charges'])-(sum(X_val['Monthly Charges']*retention_cost*y_val*y_pred)))
    potentialy_saved_monthly_charge_pct = round(100*(potentialy_saved_monthly_charge - wasted_retention_cost)/ potential_monthly_charge_loss,2)

    business_success_criteria = {
        'threshold':best_th,
        'potential revenue loss':potential_monthly_charge_loss,
        'undetected revenue loss':undetected_monthly_charge_loss,
        'wasted retention cost':wasted_retention_cost,
        'potentially retained revenue':potentialy_saved_monthly_charge,
        'potentially retained revenue(%)':potentialy_saved_monthly_charge_pct
    }
    return business_success_criteria

In [16]:
scale = df_telco.shape[0]/X_val.shape[0]
retention_cost = 0.15
campaign_effectivity = 0.8
business_success_criteria = eval_monetary_impact(estimator, X_val, y_val, best_th, scale, retention_cost, campaign_effectivity)
business_success_criteria

{'threshold': 0.35,
 'potential revenue loss': 136616,
 'undetected revenue loss': 41372,
 'wasted retention cost': 8754,
 'potentially retained revenue': 73910,
 'potentially retained revenue(%)': 47.69}