In [1]:
import joblib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split

from sklearn.metrics import classification_report, roc_auc_score

!pip install lightgbm
!pip install xgboost

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)



In [2]:
data=pd.read_csv("customer_data.csv")

In [3]:
data.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Transaction_Count_per_Contact,Loyal_Customer,Creditworthiness,Cr_Util_Rate,Attrition_Flag
0,0.0,-1.0,0.5,-1.0,1.0,0.5,0.0,0.0,0.5,-1.0,1.0,0.956476,-0.350175,0.988927,2.627193,-1.065558,-0.694444,3.911017,-0.239583,-0.5,0.0,-0.284692,-0.238398,0
1,0.0,0.0,1.5,0.0,0.0,-0.5,0.0,0.0,1.0,-1.0,0.0,0.435477,-0.289123,0.459078,3.530702,-1.008702,-0.944444,12.762712,-0.147917,-0.666667,0.0,-0.618989,-0.147921,0
2,0.0,-1.0,0.5,0.0,1.0,1.0,0.0,0.0,0.0,-1.0,-2.0,-0.132863,-0.895439,-0.006562,8.149123,-0.778186,-1.305556,6.911017,-0.366667,-0.907407,0.0,2.281101,-0.365955,0
3,-1.0,0.0,1.0,-1.0,0.0,-0.5,0.0,-1.0,-0.5,2.0,-1.0,-0.145198,0.870877,-0.313785,2.934211,-1.055115,-1.305556,6.911017,1.216667,-0.537037,-1.0,-0.228374,1.2169,0
4,-1.0,-1.0,0.5,-1.0,1.0,0.5,0.0,-2.0,0.5,-1.0,-2.0,0.019618,-0.895439,0.145527,6.311404,-1.192419,-1.083333,7.618644,-0.366667,-1.277778,-1.0,0.723351,-0.365955,0


In [4]:
y = data["Attrition_Flag"]
X = data.drop(["Attrition_Flag"], axis=1)

In [26]:
def models(X, y, scoring="roc_auc"):
    print("Models....")
    classifiers = [("RF", RandomForestClassifier()),
                   ('XGBoost', XGBClassifier()),
                   ('LightGBM', LGBMClassifier()),
                   ]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")

In [27]:
models(X, y, scoring="accuracy")

Models....
accuracy: 0.8599 (RF) 
accuracy: 0.8857 (XGBoost) 
accuracy: 0.8875 (LightGBM) 


In [32]:
rf_params = {"max_depth": [5, 8,15,20, None],
             "max_features": [3, 5, 7,9, "auto"],
             "min_samples_split": [2, 5, 8, 15, 20],
             "n_estimators": [100, 200,300, 500]}

xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8],
                  "n_estimators": [100, 200, 500, 1000],
                  "colsample_bytree": [0.7, 1]}

lightgbm_params = {"learning_rate": [0.01, 0.1],
               "n_estimators": [100, 300, 500, 1000],
               "colsample_bytree": [0.5, 0.7, 1]}

In [33]:
classifiers = [("RF", RandomForestClassifier(), rf_params),
               ('XGBoost', XGBClassifier(), xgboost_params),
               ('LightGBM', LGBMClassifier(), lightgbm_params)]


In [34]:
def hyperparameter_optimization(X, y, cv=3, scoring="roc_auc"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, classifier, params in classifiers:
        print(f"########## {name} ##########")
        cv_results = cross_validate(classifier, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (Before): {round(cv_results['test_score'].mean(), 4)}")

        gs_best = GridSearchCV(classifier, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = classifier.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_model, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (After): {round(cv_results['test_score'].mean(), 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models


In [35]:
best_models = hyperparameter_optimization(X, y)

Hyperparameter Optimization....
########## RF ##########
roc_auc (Before): 0.8658
roc_auc (After): 0.8731
RF best params: {'max_depth': 15, 'max_features': 9, 'min_samples_split': 2, 'n_estimators': 100}

########## XGBoost ##########
roc_auc (Before): 0.8955
roc_auc (After): 0.8974
XGBoost best params: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}

########## LightGBM ##########
roc_auc (Before): 0.9051
roc_auc (After): 0.8989
LightGBM best params: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'n_estimators': 500}



## Ensemble Learning

In [36]:
def voting_classifier(best_models, X, y):
    print("Voting Classifier...")

    voting_clf = VotingClassifier(estimators=[('XGBoost', best_models['XGBoost']),
                                              ('RF', best_models["RF"]),
                                              ('LightGBM', best_models["LightGBM"])],
                                  voting='soft').fit(X, y)

    cv_results = cross_validate(voting_clf, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
    print(f"Accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"F1Score: {cv_results['test_f1'].mean()}")
    print(f"ROC_AUC: {cv_results['test_roc_auc'].mean()}")
    return voting_clf

In [37]:
voting_clf = voting_classifier(best_models, X, y)

Voting Classifier...
Accuracy: 0.8901960973611841
F1Score: 0.6913498287708686
ROC_AUC: 0.8824864435836633


In order to increase success, hyperparameter optimization can be done again at wider intervals.

In order to improve the f1 score, the unbalanced data set can be balanced and the model can be established again.

### SMOTE

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [39]:
sm = SMOTE(random_state = 1, sampling_strategy = 1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [40]:
 classifiers = [("RF", RandomForestClassifier()),
                ('XGBoost', XGBClassifier()),
                ('LightGBM', LGBMClassifier()),
                ]

In [47]:
from sklearn.metrics import accuracy_score

predictions_df = pd.DataFrame()
predictions_df['actual_labels'] = y_test

for name,classifier in classifiers:
    classifier = classifier
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_df[name.strip(" :")] = predictions
    print(name, accuracy_score(y_test, predictions))
    

RF 0.9619940769990128
XGBoost 0.9674234945705824
LightGBM 0.9718657453109576


In [46]:
clf1 = RandomForestClassifier()
clf2 = XGBClassifier()
clf3 = LGBMClassifier()
eclf1 = VotingClassifier(estimators=[('RF', clf1), ('XGBoost', clf2), ('LightGBM', clf3)], voting='soft')
eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)
print(accuracy_score(y_test, predictions))
##REF: https://www.kaggle.com/code/nilanml/telecom-customer-churn-voting-80-1-accuracy

0.9679170779861797


In [49]:
from sklearn.metrics import f1_score
print(f1_score(y_test, predictions))

0.9147982062780268


After the data is balanced, our f1 score value has also increased.

Estimation was carried out by asking 3 models. As a result of these 3 models, the f1 score is 0.914.