In [138]:
# Library

import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import missingno

# stats test
from scipy.stats import normaltest

# train test split
from sklearn.model_selection import train_test_split

# menyatukan beberapa tahap preprocessing atau fungsi lainnya
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline

# encoder dan scaler
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder, OrdinalEncoder
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

# resampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE

# algoritma ML
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

# cross validation untuk menentukan algoritma terbaik
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

# metrics untuk regresi
from sklearn.metrics import f1_score, recall_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [139]:
df =  pd.read_csv('data_ecommerce_customer_churn.csv')

In [140]:
df.dropna(inplace=True)

In [141]:
duplicates = df.duplicated(keep=False)

duplicated_rows = df[duplicates]
duplicated_rows.sort_values('CashbackAmount')\

df = df.drop_duplicates(keep='last')
df

Unnamed: 0,Tenure,WarehouseToHome,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,DaySinceLastOrder,CashbackAmount,Churn
0,15.0,29.0,4,Laptop & Accessory,3,Single,2,0,7.0,143.32,0
1,7.0,25.0,4,Mobile,1,Married,2,0,7.0,129.29,0
2,27.0,13.0,3,Laptop & Accessory,1,Married,5,0,7.0,168.54,0
5,7.0,16.0,4,Mobile Phone,2,Divorced,2,0,11.0,152.81,0
6,1.0,15.0,6,Mobile Phone,5,Divorced,3,0,2.0,149.51,0
...,...,...,...,...,...,...,...,...,...,...,...
3935,26.0,7.0,4,Grocery,1,Married,4,0,9.0,295.99,0
3936,28.0,9.0,5,Fashion,3,Married,8,0,1.0,231.86,0
3937,8.0,7.0,2,Mobile Phone,2,Single,4,0,4.0,157.80,0
3938,30.0,6.0,5,Laptop & Accessory,3,Married,3,1,2.0,156.60,0


In [142]:
df['Churn'].value_counts()

Churn
0    2383
1     411
Name: count, dtype: int64

In [143]:
# Assuming df is your DataFrame and 'MaritalStatus' is the column of interest
df['MaritalStatus'] = df['MaritalStatus'].replace('Divorced', 'Single')

In [144]:
def calculate_outlier_bounds(data, factor=1.5):
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    
    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)
    
    return lower_bound, upper_bound

# Assuming df is your DataFrame and 'CashbackAmount' is the column of interest
cashback_data = df['CashbackAmount'].values

lower_bound, upper_bound = calculate_outlier_bounds(cashback_data)

print("Lower bound:", lower_bound)
print("Upper bound:", upper_bound)


Lower bound: 81.82499999999995
Upper bound: 259.58500000000004


In [145]:
outliers = df[(df['CashbackAmount'] < lower_bound) | (df['CashbackAmount'] > upper_bound)]

display(outliers)


Unnamed: 0,Tenure,WarehouseToHome,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,DaySinceLastOrder,CashbackAmount,Churn
18,19.0,27.0,4,Grocery,1,Married,2,1,11.0,259.96,0
36,24.0,31.0,4,Grocery,3,Married,5,1,6.0,264.73,0
43,17.0,21.0,4,Grocery,5,Married,8,0,1.0,296.59,0
51,61.0,28.0,4,Grocery,1,Married,6,1,1.0,303.75,0
53,25.0,10.0,3,Grocery,2,Single,3,0,4.0,265.55,0
...,...,...,...,...,...,...,...,...,...,...,...
3869,7.0,24.0,4,Grocery,3,Single,3,0,1.0,299.44,0
3875,18.0,6.0,1,Others,1,Married,1,0,7.0,292.02,0
3889,31.0,36.0,5,Grocery,2,Married,5,0,1.0,266.18,0
3914,17.0,10.0,1,Others,5,Married,2,0,13.0,291.90,0


In [146]:
transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['MaritalStatus']),
    ('binary', BinaryEncoder(), ['PreferedOrderCat']),
], remainder='passthrough')

In [147]:
transformer

In [148]:
x = df.drop(columns=['Churn'])
y = df['Churn']

In [149]:
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.2,random_state=2021)

In [150]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = lgb.LGBMClassifier()

# Sebelum Imbalance Treatment

In [151]:
models = [logreg,knn,dt,rf,xgb,lgbm]
score=[]
rata=[]
std=[]

for i in models:
    estimator=Pipeline([
        ('preprocess',transformer),
        ('model',i)])
    model_cv=cross_val_score(estimator,x_train,y_train,cv=5,scoring='precision')
    score.append(model_cv)
    rata.append(model_cv.mean())
    std.append(model_cv.std())
    
pd.DataFrame({'model':['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM'],'Mean Precision':rata,'sdev':std}).set_index('model').sort_values(by='Mean Precision',ascending=False)

Unnamed: 0_level_0,Mean Precision,sdev
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Random Forest,0.814596,0.070506
LightGBM,0.782811,0.058571
XGBoost,0.772246,0.06714
Logistic Regression,0.679761,0.073779
Decision Tree,0.613234,0.046279
KNN,0.562287,0.049176


# Setelah Imbalance Treatment

In [152]:
models = [logreg,knn,dt,rf,xgb,lgbm]
score=[]
rata=[]
std=[]

for i in models:
    estimator=Pipeline([
        ('preprocess',transformer),
        ('imbalance',SMOTE(random_state=2021)),
        ('model',i)])
    model_cv=cross_val_score(estimator,x_train,y_train,cv=5,scoring='precision')
    score.append(model_cv)
    rata.append(model_cv.mean())
    std.append(model_cv.std())
    
pd.DataFrame({'model':['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM'],'Mean Precision':rata,'sdev':std}).set_index('model').sort_values(by='Mean Precision',ascending=False)

Unnamed: 0_level_0,Mean Precision,sdev
model,Unnamed: 1_level_1,Unnamed: 2_level_1
LightGBM,0.773405,0.089829
XGBoost,0.75378,0.055297
Random Forest,0.723976,0.049069
Decision Tree,0.577035,0.070124
Logistic Regression,0.394465,0.013627
KNN,0.348413,0.021409


In [153]:
from sklearn.metrics import precision_score, recall_score

models = [logreg, knn, dt, rf, xgb, lgbm]
score_precision = []

def y_pred_func(i):
    estimator = Pipeline([
        ('preprocess', transformer),
        ('model', i)])
    
    estimator.fit(x_train, y_train)
    return (estimator, estimator.predict(x_test), x_test, y_test)

for i, j in zip(models, ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM']):
    estimator, y_pred, x_test, y_test = y_pred_func(i)
    y_predict_proba = estimator.predict_proba(x_test)[:, 1]
    
    score_precision.append(precision_score(y_test, y_pred))
    
    print(j, '\n', classification_report(y_test, y_pred))
    
pd.DataFrame({'model': ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM'],
               'precision_score': score_precision}).set_index('model').sort_values(by='precision_score', ascending=False)


Logistic Regression 
               precision    recall  f1-score   support

           0       0.91      0.97      0.94       477
           1       0.71      0.45      0.55        82

    accuracy                           0.89       559
   macro avg       0.81      0.71      0.75       559
weighted avg       0.88      0.89      0.88       559

KNN 
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       477
           1       0.63      0.33      0.43        82

    accuracy                           0.87       559
   macro avg       0.76      0.65      0.68       559
weighted avg       0.85      0.87      0.86       559

Decision Tree 
               precision    recall  f1-score   support

           0       0.94      0.93      0.93       477
           1       0.61      0.66      0.64        82

    accuracy                           0.89       559
   macro avg       0.78      0.79      0.78       559
weighted avg       0.89      0

Unnamed: 0_level_0,precision_score
model,Unnamed: 1_level_1
Random Forest,0.872727
XGBoost,0.842857
LightGBM,0.835616
Logistic Regression,0.711538
KNN,0.627907
Decision Tree,0.613636


In [154]:
from imblearn.over_sampling import SMOTE

models = [logreg, knn, dt, rf, xgb, lgbm]
score_precision = []

def y_pred_func(i):
    estimator = Pipeline([
        ('preprocess', transformer),
        ('smote',SMOTE(random_state=2021)),
        ('model', i)])
    
    estimator.fit(x_train, y_train)
    return (estimator, estimator.predict(x_test), x_test, y_test)

for i, j in zip(models, ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM']):
    estimator, y_pred, x_test, y_test = y_pred_func(i)
    y_predict_proba = estimator.predict_proba(x_test)[:, 1]
    
    score_roc_auc.append(roc_auc_score(y_test, y_predict_proba))
    score_precision.append(precision_score(y_test, y_pred))
    
    print(j, '\n', classification_report(y_test, y_pred))
    
pd.DataFrame({'model': ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM'],
               'precision_score': score_precision}).set_index('model').sort_values(by='precision_score', ascending=False)

Logistic Regression 
               precision    recall  f1-score   support

           0       0.97      0.80      0.88       477
           1       0.43      0.87      0.57        82

    accuracy                           0.81       559
   macro avg       0.70      0.83      0.73       559
weighted avg       0.89      0.81      0.83       559

KNN 
               precision    recall  f1-score   support

           0       0.96      0.81      0.88       477
           1       0.42      0.78      0.54        82

    accuracy                           0.81       559
   macro avg       0.69      0.80      0.71       559
weighted avg       0.88      0.81      0.83       559

Decision Tree 
               precision    recall  f1-score   support

           0       0.93      0.93      0.93       477
           1       0.60      0.62      0.61        82

    accuracy                           0.88       559
   macro avg       0.77      0.78      0.77       559
weighted avg       0.89      0

Unnamed: 0_level_0,precision_score
model,Unnamed: 1_level_1
XGBoost,0.828947
Random Forest,0.796875
LightGBM,0.792208
Decision Tree,0.6
Logistic Regression,0.427711
KNN,0.415584


### LIGHTGBM

In [155]:
from lightgbm import LGBMClassifier

# Define pipeline
pipeline = Pipeline([
    ('preprocess', transformer),
    ('smote', SMOTE(random_state=2021)),
    ('model', LGBMClassifier(random_state=42))
])

# Define more complex hyperparameters
hyperparameters = {
    'model__boosting_type': ['gbdt', 'dart', 'goss'],
    'model__num_leaves': [20, 30, 40],
    'model__learning_rate': [0.01, 0.1, 0.5],
    'model__n_estimators': [100, 200, 300]
}

# Create GridSearchCV object
gridLGBM = GridSearchCV(pipeline, hyperparameters, cv=5, scoring='precision')

# Fit the model on the training data
gridLGBM.fit(x_train, y_train)

# Get the best parameters
best_paramsLGBM = gridLGBM.best_params_
best_modelLGBM = gridLGBM.best_estimator_

print(f"Best parameters: {best_paramsLGBM}")

Best parameters: {'model__boosting_type': 'gbdt', 'model__learning_rate': 0.1, 'model__n_estimators': 300, 'model__num_leaves': 20}


In [156]:
gridLGBM.best_score_

0.7953528274322436

### XGBOOST

In [157]:
from xgboost import XGBClassifier

# Define pipeline
pipeline = Pipeline([
    ('preprocess', transformer),
    ('smote', SMOTE(random_state=2021)),
    ('model', XGBClassifier(random_state=42))
])

# Define more complex hyperparameters
hyperparameters = {
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01, 0.1, 0.5],
    'model__n_estimators': [100, 200, 300],
    'model__gamma': [0, 0.1, 0.2]
}

# Create GridSearchCV object
gridXGB = GridSearchCV(pipeline, hyperparameters, cv=5, scoring='precision')

# Fit the model on the training data
gridXGB.fit(x_train, y_train)

# Get the best parameters
best_paramsXGB = gridXGB.best_params_
best_modelXGBOOST = gridXGB.best_estimator_

print(f"Best parameters: {best_paramsXGB}")


Best parameters: {'model__gamma': 0.2, 'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}


In [158]:
gridXGB.best_score_

0.7681626673002298

### HT RF

In [159]:
from sklearn.ensemble import RandomForestClassifier

# Define pipeline
pipeline = Pipeline([
    ('preprocess', transformer),
    ('smote', SMOTE(random_state=2021)),
    ('model', RandomForestClassifier(random_state=42))
])

# Define more complex hyperparameters
hyperparameters = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV object
gridRF = GridSearchCV(pipeline, hyperparameters, cv=5, scoring='precision')

# Fit the model on the training data
gridRF.fit(x_train, y_train)

# Get the best parameters
best_paramsRF = gridRF.best_params_
best_modelRF = gridRF.best_estimator_

print(f"Best parameters: {best_paramsRF}")


Best parameters: {'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}


In [160]:
gridRF.best_score_

0.5986988207311607

In [161]:
display(best_paramsLGBM,
        best_paramsXGB,
        best_paramsRF)

{'model__boosting_type': 'gbdt',
 'model__learning_rate': 0.1,
 'model__n_estimators': 300,
 'model__num_leaves': 20}

{'model__gamma': 0.2,
 'model__learning_rate': 0.1,
 'model__max_depth': 5,
 'model__n_estimators': 200}

{'model__max_depth': 5,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__n_estimators': 100}

In [162]:
LGBM_AT = LGBMClassifier(boosting_type = 'gbdt',
                         learning_rate = 0.1,
                         n_estimators= 300,
                         num_leaves = 20)
XGB_AT = XGBClassifier(gamma = 0.1,
                       learning_rate = 0.5,
                       max_depth = 5,
                       n_estimators=200)
RF_AT = RandomForestClassifier(max_depth = 3,
                               min_samples_leaf = 1,
                               min_samples_split = 10,
                               n_estimators=100)

modelAT = [LGBM_AT,XGB_AT,RF_AT]
train_precisiondummy = [(gridLGBM.best_score_),(gridXGB.best_score_),(gridRF.best_score_)]
test_precisiondummy = []

# Construct the pipeline
for i in modelAT:
    estimator= Pipeline([
    ('preprocess',transformer),
    ('smote', SMOTE(random_state=2021)),
    ('model', i)])
    
    estimator.fit(x_train, y_train)

    # Compute precision on the test set
    test_pred = estimator.predict(x_test)
    test_precision = precision_score(y_test, test_pred)
    test_precisiondummy.append(test_precision)

    cm = confusion_matrix(y_train, train_pred)
    print(cm)

# Display The Results
pd.DataFrame({'model': ['LightGBM', 'XGBoost', 'Random Forest'],
               'Train Precision': train_precisiondummy,
               'Test Precision': test_precisiondummy}).set_index('model').sort_values(by='Test Precision', ascending=False)

[[1659  247]
 [  94  235]]
[[1659  247]
 [  94  235]]
[[1659  247]
 [  94  235]]


Unnamed: 0_level_0,Train Precision,Test Precision
model,Unnamed: 1_level_1,Unnamed: 2_level_1
LightGBM,0.795353,0.853333
XGBoost,0.768163,0.810811
Random Forest,0.598699,0.59434
