In [None]:
# Standard library
from collections import Counter

# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# processing libraries 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, chi2

# model libraries 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import metrics 

In [None]:
# Read the raw churn dataset once and keep an independent working copy,
# so the frame exactly as loaded from disk stays available.
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy(deep=True)

def test_transform(x_test):
    ''' 전처리 함수 정의'''
    
    # 불필요 컬럼 제거(고객번호)
    # -------------------------
    # x_test = x_test.drop('cstno', axis=1)
    
    
    # 성별 변환('F':0, 'M':1)
    # -------------------------
    #x_test['sex']=x_test['sex'].replace({'F':0,'M':1})
    
    
    # # 다중공선성 컬럼 제거
    # # -------------------------
    # x_test = x_test.drop('mon_on_book', axis = 1)
    # x_test = x_test.drop('mean_open_to_buy', axis = 1)
    # x_test = x_test.drop('tot_trans_cnt_for_12m', axis = 1)
    
    
    # Null 처리
    x_test.drop(columns = ['mean_util_pct'], inplace=True)
    x_test.dropna(axis=0, inplace=True)
    
    return x_test

# Clean the working copy, then branch off the frame used by the analysis cells below.
bank_churner_df_org = test_transform(bank_churner_df_org)
df = bank_churner_df_org.copy(deep=True)

In [None]:
# Preview the first four rows using the notebook's table theme.
# NOTE(review): 'width': 200 carries no CSS unit - confirm whether '200px' was intended.
preview_css = {
    "background-color": "#b2c4cc",
    "color": "black",
    "border-color": "black",
    "font-size": "8pt",
    'width': 200,
}
df.head(4).style.set_properties(**preview_css)

In [None]:
# Summary statistics of the numeric columns, rendered with the same table theme.
summary_css = {
    "background-color": "#b2c4cc",
    "color": "black",
    "border-color": "black",
    "font-size": "8pt",
    'width': 200,
}
df.describe().style.set_properties(**summary_css)

In [None]:
# Class balance: percentage of rows labelled 1 among the rows labelled 0 or 1
counts = df['is_churned'].value_counts()
retained, churned = counts[0], counts[1]
perc_churn = (churned / (retained + churned)) * 100

# Rows that exactly repeat an earlier row
duplicates = int(df.duplicated().sum())

# Total number of missing cells across the whole frame
missing_values = df.isna().sum().sum()

# How many columns there are of each dtype
types = df.dtypes.value_counts()

In [None]:
# Report the headline dataset statistics computed in the previous cell.
summary_lines = [
    ("Churn Rate = %.1f %%", perc_churn),
    ('Number of Duplicate Entries: %d', duplicates),
    ('Number of Missing Values: %d', missing_values),
    ('Number of Features: %d', df.shape[1]),
    ('Number of Customers: %d', df.shape[0]),
]
for template, value in summary_lines:
    print(template % value)

print('Data Types and Frequency in Dataset:')
print(types)

In [None]:
#Code adapted from https://www.kaggle.com/code/winternguyen/churning-customers-98-95-detected#Step-2.-Exploratory-data-analysis

# At this point df still contains text columns. DataFrame.corr() on such a
# frame raises on pandas >= 2.0 (older versions silently dropped them), so
# restrict the correlation to numeric/bool columns explicitly.
heat = df.select_dtypes(include = ['number', 'bool']).corr()
plt.figure(figsize = [16,8])
plt.title("Correlation between numerical features", size = 25, pad = 20, color = '#8cabb6')
sns.heatmap(heat, cmap = sns.diverging_palette(20, 220, n = 200), annot = False)
plt.show()

Key Point: 'Avg_open_to_buy' and 'Credit_Limit' are highly correlated.
Key Point: 'Total_Trans_Amt' and 'Total_Trans_Ct' are closely correlated.
Key Point: 'Total_Revolving_Bal', 'Credit_Limit' and 'Avg_open_to_buy' all seem to be connected to the 'Avg_Utilization_Ratio'.

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# make gender numerical (M -> 1, F -> 0)
df['sex'] = df['sex'].map({'M': 1, 'F': 0})

# drop client id
df = df.drop('cstno', axis = 1)

# Code adapted from: https://www.kaggle.com/code/andreshg/churn-prediction-0-99-auc-h2o-sklearn-smote#2.-Feature-Engeneering
catcols = df.select_dtypes(exclude = ['int64','float64']).columns
intcols = df.select_dtypes(include = ['int64']).columns
# 'float64' (lower case) is the NumPy dtype produced by read_csv; the
# capitalised 'Float64' names pandas' nullable extension dtype instead and
# would leave ordinary float columns unscaled.
floatcols = df.select_dtypes(include = ['float64']).columns

# one-hot encoding on categorical columns
df = pd.get_dummies(df, columns = catcols)

# minmax scaling numeric features; the target is excluded so that it stays
# an untouched class label instead of being turned into a scaled float
numeric_features = [col for col in list(floatcols) + list(intcols) if col != 'is_churned']
for col in numeric_features:
    df[col] = MinMaxScaler().fit_transform(df[[col]])

print('New Number of Features: %d'%(df.shape[1]))

In [None]:
# Separate the churn label from the predictor columns.
y = df['is_churned']
X = df.drop(columns = ['is_churned'])

## Correlation Coefficients Ranking

In [None]:
# Correlation heatmap of the encoded and scaled frame, drawn through the
# explicit figure/axes interface.
heat = df.corr()
fig, ax = plt.subplots(figsize = (16, 8))
ax.set_title("Correlation between numerical features", size = 25, pad = 20, color = '#8cabb6')
palette = sns.diverging_palette(20, 220, n = 200)
sns.heatmap(heat, cmap = palette, annot = False, ax = ax)
plt.show()

In [None]:
# Code adapted from: https://www.kaggle.com/code/winternguyen/churning-customers-98-95-detected#Step-4:-Feature-Selection

print("Correlation Coefficient of all the Features")
# Rank every column by its correlation with the target, strongest positive first.
corr = df.corr().sort_values("is_churned", ascending = False)
correlations = corr.is_churned

# Keep only the columns whose correlation with the target exceeds 0.1 in magnitude.
a = correlations[correlations > 0.1]
b = correlations[correlations < -0.1]

# Series.append was removed in pandas 2.0; pd.concat yields the same result
# (positives first, then negatives) on every pandas version.
top_corr_features = pd.concat([a, b])
top_corr_features

In [None]:
# Code adapted from: https://www.kaggle.com/code/andreshg/churn-prediction-0-99-auc-h2o-sklearn-smote#4.-Feature-Selection

def plot_importances(model, model_name, features_to_plot, feature_names):
    """Fit ``model`` and draw a horizontal bar chart of its top feature importances.

    The model is fitted on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    model : estimator
        Object providing ``fit`` and, once fitted, ``feature_importances_``.
    model_name : str
        Name shown in the plot title.
    features_to_plot : int
        Number of highest-importance features to show.
    feature_names : sequence of str
        Feature names, in the same order as the columns of ``X``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # fit model on the notebook-level data and read its importances
    model.fit(X, y)
    importances = model.feature_importances_

    # argsort is ascending, so the tail holds the highest-ranked features
    top_idx = np.argsort(importances)[-features_to_plot:]
    top_names = np.array(feature_names)[top_idx]
    top_values = importances[top_idx]

    # One bar per selected feature. Sizing from the selection (rather than
    # from features_to_plot) keeps the plot valid when fewer features exist.
    y_ticks = np.arange(len(top_values))
    fig, ax = plt.subplots()
    ax.barh(y_ticks, top_values, color = '#b2c4cc')
    # Tick positions must be fixed before their labels are assigned;
    # the reverse order triggers a matplotlib FixedFormatter warning.
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(top_names)
    ax.set_title("%s Feature Importances"%(model_name))
    fig.tight_layout()
    plt.show()
    
def best_features(model, features_to_plot, feature_names):
    """Return the names of the ``features_to_plot`` most important features.

    The model is fitted on the notebook-level ``X`` and ``y``; the returned
    NumPy array is ordered from least to most important.
    """
    model.fit(X, y)
    ranking = np.argsort(model.feature_importances_)
    ranked_names = np.array(feature_names)[ranking]
    return ranked_names[-features_to_plot:]

In [None]:
feature_names = list(X.columns)

# Random forest
model1 = RandomForestClassifier(random_state = 1234)
plot_importances(model1, 'Random Forest', 10, feature_names)

# scikit-learn's gradient boosting. This is not the XGBoost library, so the
# plot is titled with the estimator that is actually being fitted.
model2 = GradientBoostingClassifier(n_estimators = 100, learning_rate = 1.0, max_depth = 1, random_state = 0)
plot_importances(model2, 'Gradient Boosting', 10, feature_names)

# AdaBoost
model3 = AdaBoostClassifier(n_estimators = 100, learning_rate = 1.0, random_state = 0)
plot_importances(model3, 'AdaBoost', 10, feature_names)

## Using SelectKBest to find the best features

In [None]:
# Score each feature by its F-value against the label and keep the ten best.
f_selector = SelectKBest(score_func = f_classif, k = 10)
f_selector.fit(X, y)  # only the fitted state is needed, not the transformed matrix
f_selector_best = f_selector.get_feature_names_out()
print(f_selector_best)

## Choose best components

In [None]:
# Top-10 picks of each tree-based model (each call refits the model on X, y).
forest_best = list(best_features(model1, 10, feature_names))
XG_best = list(best_features(model2, 10, feature_names))
ada_best = list(best_features(model3, 10, feature_names))

# Features whose correlation with the target exceeds 0.1 in magnitude.
# Built from `correlations` rather than by rebinding the Series of the same
# name, so re-running this cell keeps working; the target is dropped by name
# instead of by assuming it sits in first position.
strong_correlations = correlations[correlations.abs() > 0.1]
top_corr_features = [name for name in strong_correlations.index if name != 'is_churned']

f_selector_best = list(f_selector_best)

In [None]:
# Pool the picks of all five selection methods (duplicates are intentional:
# each occurrence is one vote for that feature).
best_features_overall = forest_best + XG_best + ada_best + top_corr_features + f_selector_best

# create a dictionary with the number of times features appear
# (Counter is imported in the notebook's import cell)
count_best_features = dict(Counter(best_features_overall))

# list of the features without any repetitions, in first-seen order
features_no_repeats = list(dict.fromkeys(best_features_overall))

display(count_best_features)

In [None]:
def get_features(threshold):
    """Return the features selected by more than ``threshold`` methods.

    Reads the notebook-level ``features_no_repeats`` and
    ``count_best_features``; the order of ``features_no_repeats`` is kept.
    """
    return [
        feature
        for feature in features_no_repeats
        if count_best_features[feature] > threshold
    ]

In [None]:
# Features picked by more than two selection methods, without 'mean_open_to_buy'
# (presumably excluded for its collinearity with the credit limit - confirm).
# Filtering instead of list.remove() avoids a ValueError when that feature
# did not make the cut in the first place.
# NOTE(review): the evaluation cells below call get_features(2) directly, so
# they do not inherit this exclusion.
chosen_features = [name for name in get_features(2) if name != 'mean_open_to_buy']
chosen_features

## Find the best threshold for features

In [None]:
def eval_model(model, model_name, X, y, threshold):
    """Fit ``model`` on a feature subset and print its hold-out metrics.

    Parameters
    ----------
    model : estimator
        Classifier providing ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Label used in the printed report.
    X : pandas.DataFrame
        Full feature matrix; only the columns returned by
        ``get_features(threshold)`` are used.
    y : array-like
        Target labels aligned with ``X``.
    threshold : int
        Passed to ``get_features`` to choose the feature subset.

    Returns
    -------
    None
        The confusion matrix, F1-score and accuracy are printed.
    """
    # make X the chosen subset
    X = X[get_features(threshold)]

    # fixed seed so every model is scored on the same 75/25 split
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.25, random_state = 42)

    # fit model and predict once; the former model.score() call repeated the
    # prediction pass and threw its result away
    model.fit(train_x, train_y)
    pred_test = model.predict(test_x)

    # get metrics
    f1 = metrics.f1_score(test_y, pred_test)
    test_acc = metrics.accuracy_score(test_y, pred_test)
    con = metrics.confusion_matrix(test_y, pred_test)

    print(con,'%s model with %s threshold: %.4f F1-score and %.4f accuracy'%(model_name, threshold, f1, test_acc))

In [None]:
# run ranges of possible thresholds for every candidate model
# (model2 is scikit-learn's GradientBoostingClassifier, not the XGBoost
# library, so it is reported under its real name)
candidates = (
    (model1, 'forest'),
    (model2, 'Gradient Boosting'),
    (model3, 'AdaBoost'),
)
for clf, clf_name in candidates:
    for threshold in range(0, 5):
        eval_model(clf, clf_name, X, y, threshold)

- The best threshold is 2 for the top two classifiers but 1 for AdaBoost, so we are going with 2. The threshold of 2 also removes 'credit_limit', which is highly correlated with 'Avg_Open_To_Buy', averting that issue.

## Compare with full dataset

In [None]:
# full dataset: baseline random forest on every feature, same 75/25 split as eval_model
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.25, random_state = 42)

# fit and predict once (the discarded model1.score() call only repeated the prediction)
model1.fit(train_x, train_y)
pred_test = model1.predict(test_x)

f1 = metrics.f1_score(test_y, pred_test)
test_acc = metrics.accuracy_score(test_y, pred_test)
con = metrics.confusion_matrix(test_y, pred_test)

print(con,f1,test_acc)

In [None]:
### reduced dataset: same random forest, restricted to features picked by more than two methods
chosen_features = get_features(2)
Xnew = X[chosen_features]

train_x, test_x, train_y, test_y = train_test_split(Xnew, y, test_size = 0.25, random_state = 42)

# fit and predict once (the discarded model1.score() call only repeated the prediction)
model1.fit(train_x, train_y)
pred_test = model1.predict(test_x)

f1 = metrics.f1_score(test_y, pred_test)
test_acc = metrics.accuracy_score(test_y, pred_test)
con = metrics.confusion_matrix(test_y, pred_test)

print(con,f1,test_acc)

## Find best model

In [None]:
# Bagged k-nearest-neighbours (BaggingClassifier and KNeighborsClassifier are
# imported in the notebook's import cell). Bagging draws random row/feature
# subsets, so the seed is pinned to make the reported scores reproducible.
model4 = BaggingClassifier(
    KNeighborsClassifier(n_neighbors = 7),
    max_samples = 0.8,
    max_features = 0.8,
    random_state = 0,
)

eval_model(model4, 'KNN', X, y, 2)

In [None]:
# Logistic-regression baseline on the threshold-2 feature subset
# (LogisticRegression is imported in the notebook's import cell).
model5 = LogisticRegression(random_state=1)
eval_model(model5, 'Logistic', X, y, 2)

- The two models above have very low F1-scores, suggesting issues with these models.

From the tests run above, the random forest is the best-performing model, so we will tune its hyperparameters.

## Hypertune parameters

In [None]:
# Candidate values for each sweep below
n = list(range(400, 1001, 50))      # n_estimators: 400, 450, ..., 1000
depth = list(range(8, 21))          # max_depth: 8, 9, ..., 20
rand = list(range(600, 1251, 50))   # random_state: 600, 650, ..., 1250

# Reference random forest evaluated before the sweeps
forest = RandomForestClassifier(
    n_estimators = 100,
    max_depth = 15,
    random_state = 750,
)
eval_model(forest, 'forest', X, y, 2)

In [None]:
def eval_forest(model, model_name, X, y, threshold, n, depth, rand):
    """Fit ``model`` on a feature subset and print one line of hold-out metrics.

    Parameters
    ----------
    model : estimator
        Classifier providing ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Label used in the printed report.
    X : pandas.DataFrame
        Full feature matrix; only the columns returned by
        ``get_features(threshold)`` are used.
    y : array-like
        Target labels aligned with ``X``.
    threshold : int
        Passed to ``get_features`` to choose the feature subset.
    n, depth, rand
        Echoed in the printed line only; they do not configure ``model``,
        so the caller must build the model with matching settings.

    Returns
    -------
    None
        The F1-score and accuracy are printed together with the settings.
    """
    # create subset from feature selection
    X = X[get_features(threshold)]

    # fixed seed so every configuration is scored on the same 75/25 split
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.25, random_state = 42)

    # fit and predict once; the discarded model.score() call and the unused
    # confusion matrix of the earlier version were pure overhead
    model.fit(train_x, train_y)
    pred_test = model.predict(test_x)

    f1 = metrics.f1_score(test_y, pred_test)
    test_acc = metrics.accuracy_score(test_y, pred_test)

    print('Model: %s Threshold: %s F1-Score %.4f Accuracy: %.4f n_estimators: %s depth: %s rand: %s'%(model_name, threshold, f1, test_acc,n,depth,rand))

In [None]:
# Sweep n_estimators with depth (10) and seed (750) held fixed; the best
# value found here is carried into the next sweep.
for n_trees in n:
    forest = RandomForestClassifier(n_estimators = n_trees, max_depth = 10, random_state = 750)
    eval_forest(forest, 'forest', X, y, 2, n_trees, 10, 750)

In [None]:
# Sweep max_depth with n_estimators fixed at 850 and the seed at 750.
for tree_depth in depth:
    forest = RandomForestClassifier(n_estimators = 850, max_depth = tree_depth, random_state = 750)
    eval_forest(forest, 'forest', X, y, 2, 850, tree_depth, 750)

In [None]:
# Sweep random_state with n_estimators fixed at 850 and max_depth at 19.
# NOTE(review): the seed is not a model hyperparameter; choosing it by test
# score fits noise in this particular split - confirm this is intended.
for seed in rand:
    forest = RandomForestClassifier(n_estimators = 850, max_depth = 19, random_state = seed)
    eval_forest(forest, 'forest', X, y, 2, 850, 19, seed)

## Apply final model

In [None]:
# Final random forest, built with the settings chosen in the sweeps above.
forest = RandomForestClassifier(
    n_estimators = 850,
    max_depth = 19,
    random_state = 1200,
)

In [None]:
# Final feature subset: features picked by more than two selection methods
chosen_features = get_features(2)
X_new = X[chosen_features]

train_x, test_x, train_y, test_y = train_test_split(X_new, y, test_size = 0.25, random_state = 42)

forest.fit(train_x, train_y)
pred_test = forest.predict(test_x)
# Predicted probability of the positive class (second entry of forest.classes_)
churn_proba = forest.predict_proba(test_x)[:, 1]

f1 = metrics.f1_score(test_y, pred_test)
test_acc = metrics.accuracy_score(test_y, pred_test)
con = metrics.confusion_matrix(test_y, pred_test)
precision = metrics.precision_score(test_y, pred_test)
recall = metrics.recall_score(test_y, pred_test)
# ROC AUC is a ranking metric and needs continuous scores. Feeding it the
# hard 0/1 predictions collapses the curve to a single operating point and
# under-reports the model's real AUC.
roc = metrics.roc_auc_score(test_y, churn_proba)

In [None]:
# Print the hold-out metrics of the final model, followed by its confusion matrix.
final_report = (
    ('Accuracy Score', test_acc),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
    ('ROC Score', roc),
)
for metric_label, metric_value in final_report:
    print(metric_label, metric_value)

print(con)

In [None]:
# cross validation: 8-fold check of the final model on the reduced feature set
# (cross_validate is imported in the notebook's import cell)
cv_results = cross_validate(forest, X_new, y, scoring = ('f1', 'accuracy', 'roc_auc'), cv = 8)
sorted(cv_results.keys())

In [None]:
# Per-fold ROC AUC from the 8-fold cross-validation above.
cv_results['test_roc_auc'] 

In [None]:
# Per-fold F1-score from the 8-fold cross-validation above.
cv_results['test_f1'] 

In [None]:
# Per-fold accuracy from the 8-fold cross-validation above.
cv_results['test_accuracy'] 