In [112]:
import pandas as pd
import numpy as np

# Bank Telemarketing Campaign - Opening Deposit Classification

### Context 

Term Deposit adalah investasi deposito keuangan dengan jangka waktu tertentu pada sebuah institusi keuangan/bank. Term Deposit Investment berkisar antara 1 bulan hingga beberapa tahun dan memiliki nilai minimum yang berbeda - beda. Dengan bantuan telemarketing, bank dapat memasarkan berbagai macam produk dan jasanya secara langsung kepada customer. 

### Problem Statement

Telemarketing merupakan cara pemasaran produk yang banyak digunakan oleh perusahaan di berbagai macam industri saat ini. Strategi telemarketing dinilai mampu meningkatkan pendapatan dan pencapaian target bank karena lebih efisien dari segi waktu dan jarak.
Bank ingin melakukan telemarketing ke klien yang kemungkinan besar akan membuka deposito.

### Goals 

Berdasarkan permasalahan tersebut, bank ingin memprediksi kemungkinan seorang klien akan membuka deposito atau tidak. Selain itu pihak marketing bank juga ingin dapat mengoptimalkan potensi klien yang akan membuka rekening deposito. 

### Analytic Approach

Jadi yang akan dilakukan adalah menganalisis data untuk menemukan pola yang membedakan klien yang akan membuka deposito dan yang tidak.
Kemudian akan dibangun model klasifikasi yang akan membantu tim marketing bank untuk dapat memprediksi probabilitas seorang klien akan membuka deposito atau tidak.

### Metric Evaluation

![../input/images/metric_bank.png](attachment:../input/images/metric_bank.png)

* Type I Error (False Positive) : Kerugian waktu dan sumber daya dalam melakukan telemarketing
* Type II Error (False Negative) : Kehilangan calon customer potensial



In [167]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, plot_roc_curve, f1_score, roc_auc_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

In [113]:
# Semicolon-delimited export of the bank telemarketing campaign data.
DATA_PATH = '../input/bank-marketing-campaigns-dataset/bank-additional-full.csv'
df = pd.read_csv(DATA_PATH, sep=';')
df

## Data Visualization

In [114]:
import seaborn as sns
import matplotlib.pyplot as plt

# One grid row per feature: histogram split by target class on the left,
# boxplot per target class on the right.
fig, axes = plt.subplots(4, 2, figsize=(18, 22))

for (hist_ax, box_ax), feature in zip(axes, ['age', 'duration', 'campaign', 'pdays']):
    label = feature.capitalize()

    sns.histplot(data=df, x=feature, hue='y', kde=True, bins=20, ax=hist_ax)
    hist_ax.set_title(f'{label} Histogram')

    sns.boxplot(data=df, y=feature, x='y', ax=box_ax)
    box_ax.set_title(f'{label} Boxplot')

In [115]:
# One grid row per feature: histogram split by target class on the left,
# boxplot per target class on the right.
fig, axes = plt.subplots(4, 2, figsize=(18, 22))

macro_features = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m']
for (hist_ax, box_ax), feature in zip(axes, macro_features):
    sns.histplot(data=df, x=feature, hue='y', kde=True, bins=20, ax=hist_ax)
    hist_ax.set_title(f'{feature} Histogram')

    sns.boxplot(data=df, y=feature, x='y', ax=box_ax)
    box_ax.set_title(f'{feature} Boxplot')

In [116]:
# Per-column overview: dtype, missing-value count and percentage, cardinality,
# and up to two example values.
listItem = []
for col in df.columns:
    n_null = df[col].isna().sum()
    unique_values = df[col].drop_duplicates()
    # Cap the sample size so a column with fewer than two distinct values does not
    # raise, and seed the draw so the table is identical on every run.
    examples = unique_values.sample(min(2, len(unique_values)), random_state=42)
    listItem.append([col, df[col].dtype, n_null, round((n_null / len(df[col])) * 100, 2),
                     df[col].nunique(), list(examples.values)])

dfDesc = pd.DataFrame(columns=['dataFeatures', 'dataType', 'null', 'nullPct', 'unique', 'uniqueSample'],
                      data=listItem)
dfDesc

Dalam dataset ini missing data ditulis sebagai unknown. Untuk EDA dan modeling nilai unknown akan dikategorikan sebagai kategori tersendiri. Hal tersebut diasumsikan klien tidak ingin memberikan informasi tertentu.

In [117]:
# Encode the target column numerically: 'no' -> 0, anything else -> 1.
# The dtype guard keeps the cell idempotent: without it, re-running the cell on the
# already-encoded column would turn every value into 1 (no value equals 'no' any more).
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = np.where(df['y'] == 'no', 0, 1)

## Correlation Matrix dan Hypothesis Testing

In [118]:
# Correlation of the numeric columns with the target column.
def corr_to_target(dataframe, target, title=None, file=None):
    """Draw a one-column heatmap of each numeric column's correlation with `target`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing `target`; non-numeric columns are left out of the correlation.
    target : str
        Name of the column the other columns are correlated against.
    title : str, optional
        Figure title; omitted when falsy.
    file : str, optional
        When given, the figure is also saved to this path.
    """
    plt.figure(figsize=(6,8))
    sns.set(font_scale=1)

    # Select numeric columns explicitly: DataFrame.corr() raises on string columns
    # in pandas >= 2.0, where earlier releases dropped them silently.
    numeric = dataframe.select_dtypes(include='number')
    target_corr = numeric.corr()[[target]].sort_values(target, ascending=False)
    sns.heatmap(target_corr, annot=True, cmap='coolwarm')

    if title: plt.title(f'\n{title}\n', fontsize=18)
    plt.xlabel('')
    plt.ylabel('')
    if file: plt.savefig(file, bbox_inches='tight')
    plt.show();

In [119]:
# Heatmap of the column correlations with the encoded target `y`.
corr_to_target(df, 'y', 'Correlation to Target')

Selain dengan visualisasi kita dapat melakukan normality test untuk melihat apakah fitur-fitur yang digunakan terdistribusi normal atau tidak.

In [120]:
from scipy.stats import normaltest

# Run scipy's normality test on every numeric column and print only the p-value.
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    _, p_value = normaltest(df[col])
    print(f"{col} : {p_value}")

tidak ada kolom yang terdistribusi normal.

Selanjutnya akan kita lakukan chi squared test untuk melihat apakah masing - masing fitur berhubungan secara dependen atau independen terhadap kolom target.

In [121]:
from scipy.stats import chi2_contingency

# Chi-squared independence test of every object-typed column against the target.
alpha = 0.05
for col in df.select_dtypes(include=['object']).columns:
    print(f"Independence test untuk kolom {col} dan y(target) : ")
    contingency = pd.crosstab(df[col], df['y'])
    stat, p, dof, expected = chi2_contingency(contingency)
    print("p value is", p)
    print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true) ==> X')
    print('-------------------------')

kolom `housing` dan `loan` independen terhadap kolom target.

In [122]:
# Cross-tabulate `housing` against the target, scaled to a percentage of all rows.
pd.crosstab(df['housing'], df['y'])*100/len(df)

In [123]:
# Cross-tabulate `loan` against the target, scaled to a percentage of all rows.
pd.crosstab(df['loan'], df['y'])*100/len(df)

In [124]:
# Cross-tabulate `housing` against the target, scaled to a percentage of all rows.
# NOTE(review): this is the same expression as the first housing crosstab cell.
pd.crosstab(df['housing'], df['y'])*100/len(df)

## Mengubah kolom numerikal ke bentuk kategorikal (Binning)

In [125]:
def _campaign_bucket(n_calls):
    """Return the bucket label for one numeric `campaign` value.

    The labels state the bins exactly: fewer than 4 calls, 4 to 7 calls, 8 or more.
    Comparing numerically also means a re-run on the already-labelled column raises
    a TypeError instead of silently putting every row in the last bucket.
    """
    if n_calls < 4:
        return '<4 calls'
    if n_calls < 8:
        return '4-7 calls'
    return '>=8 calls'


# One label per row, in row order.
campaign_cat = [_campaign_bucket(n_calls) for n_calls in df['campaign']]

In [126]:
# Overwrite the `campaign` column with the labels collected in `campaign_cat`.
df['campaign'] = campaign_cat
df

kolom `campaign` telah diubah ke bentuk kategorikal...

Selanjutnya akan kita lihat korelasinya dengan kolom target.

In [127]:
# Chi-squared independence test between the `campaign` column and the target.
print("Independence test untuk kolom campaign dan y(target) : ")
alpha = 0.05
campaign_vs_target = pd.crosstab(df['campaign'], df['y'])
stat, p, dof, expected = chi2_contingency(campaign_vs_target)
print("p value is", p)
print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true)')

untuk kolom `emp.var.rate`, `cons.price.idx`,`cons.conf.idx`, `euribor3m`, `nr.employed` dapat diubah kedalam bentuk kategorikal

In [128]:
# Distinct-value count per macro-economic column, next to the total row count.
# Each line names its column; previously the five counts could not be told apart.
for col in ['emp.var.rate', 'cons.price.idx','cons.conf.idx', 'euribor3m', 'nr.employed']:
    print(f"jumlah nilai unik kolom {col} sebanyak : {df[col].nunique()}")
print(f"jumlah data sebanyak : {len(df)}")

Dapat kita lihat bahwa kelima kolom numerik tersebut tidak memiliki nilai yang terlalu beragam dibandingkan dengan jumlah datanya. Maka dari itu, masing-masing kolom dapat kita ubah ke bentuk kategorikal dengan 4 interval.

In [129]:
# Replace each macro-economic column with the integer index (labels=False) of its
# bin, using four equal-width bins per column.
for macro_col in ['emp.var.rate', 'cons.conf.idx', 'cons.price.idx', 'nr.employed', 'euribor3m']:
    df[macro_col] = pd.cut(df[macro_col], bins=4, labels=False)

Selanjutnya akan kita lihat korelasinya dengan kolom target.

In [130]:
# Chi-squared independence test of each binned column against the target.
binning_cols = ['emp.var.rate', 'cons.conf.idx', 'cons.price.idx', 'nr.employed', 'euribor3m']
alpha = 0.05
for col in binning_cols:
    print(f"Independence test untuk kolom {col} dan y(target) : ")
    contingency = pd.crosstab(df[col], df['y'])
    stat, p, dof, expected = chi2_contingency(contingency)
    print("p value is", p)
    print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true) ==> X')
    print('-------------------------')

Berikut dapat kita lihat proporsi dari 2 kelas kolom target terhadap masing - masing fitur kategorikal. 

In [131]:
# For every object-typed feature, show the share of each target class within each
# category, ordered by the share of class 1.
categorical_features = df.select_dtypes(include=['object']).columns
for feature in categorical_features:
    class_share = df.groupby(feature)['y'].value_counts(normalize=True).unstack()
    display(class_share.sort_values(by=[1.0], ascending=False))

Untuk kolom `pdays` dan `previous` dapat kita ubah ke kategori 1 dan 0.

In [132]:
# Share of rows whose pdays equals the 999 marker versus every other value.
n_clients = len(df)
n_marker = len(df[df['pdays'] == 999])
n_other = len(df[df['pdays'] != 999])
print(f"Persentase klien yang tidak pernah dihubungi di campaign sebelumnya: {n_marker*100/n_clients}%")
print(f"Persentase klien pernah dihubungi di campaign sebelumnya: {n_other*100/n_clients}%")
print(f"Jumlah data klien seluruhnya: {n_clients}")

In [133]:
# Collapse pdays to a binary flag: 0 where the value is 999, 1 otherwise.
# NOTE(review): not idempotent — once the column holds 0/1, re-running this cell
# maps every row to 1 because no value equals 999 any more.
df['pdays'] = np.where(df['pdays'] == 999, 0, 1)

In [134]:
# Binary flag: 1 when `previous` is at least 1, otherwise 0.
had_previous_contact = df['previous'] >= 1
df['previous'] = had_previous_contact.astype(int)
df['previous'].value_counts()

Berdasarkan independence test, kolom `housing` dan `loan` tidak perlu diikutsertakan dalam modeling. Kolom `duration` juga tidak relevan untuk dijadikan fitur karena lama waktu telemarketing baru dapat diketahui setelah telemarketing dilakukan, sedangkan predictive model harus dapat digunakan sebelum klien dihubungi.

In [136]:
# Remove the columns excluded from modeling. Rebinding `df` (instead of inplace=True)
# together with errors='ignore' keeps the cell re-runnable: the original in-place
# drop raised a KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['duration', 'housing', 'loan'], errors='ignore')
df

## Data Preprocessing and Data Splitting

* Kolom Target : `y`
* Jumlah kolom fitur : 17
* Kolom numerikal : `age`
* Kolom biner (passthrough) : `pdays`
* Kolom fitur ordinal : `emp.var.rate`, `cons.price.idx`,`cons.conf.idx`, `euribor3m`, `nr.employed`
* kolom fitur one hot encoding : `default`, `previous`, `marital`, `contact`, `day_of_week`, `education`, `job`, `poutcome`, `campaign`, `month`

In [137]:
# Columns that are one-hot encoded (first level dropped); every other column is
# passed through unchanged.
onehot_columns = [
    'default', 'previous', 'marital', 'contact', 'day_of_week',
    'education', 'job', 'poutcome', 'campaign', 'month',
]
transformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first'), onehot_columns)],
    remainder='passthrough',
)

In [138]:
# Separate the target from the features, then hold out 20% of the rows,
# stratified on the target.
y = df['y']
X = df.drop(columns=['y'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Modeling

In [139]:
# Candidate classifiers for the benchmark. The tree-based and boosting models are
# seeded so that the comparison tables below are reproducible across runs.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lgbm = lgb.LGBMClassifier(random_state=42)

In [140]:
# 5-fold stratified cross-validation of every candidate on the training split,
# scored by ROC AUC; per-fold scores, means and standard deviations are collected.
models = [logreg,knn,dt,rf,xgb,lgbm]
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost',
               'LightGBM']
score=[]
rata=[]
std=[]

skfold = StratifiedKFold(n_splits=5)
for candidate in models:
    estimator = Pipeline(steps=[('preprocess', transformer), ('model', candidate)])
    model_cv = cross_val_score(estimator, X_train, y_train, cv=skfold, scoring='roc_auc')
    score.append(model_cv)
    rata.append(model_cv.mean())
    std.append(model_cv.std())

cv_summary = pd.DataFrame({'model': model_names, 'mean roc_auc': rata, 'sdev': std})
cv_summary.set_index('model').sort_values(by='mean roc_auc', ascending=False)

In [141]:
models = [logreg,knn,dt,rf,xgb,lgbm]
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost','LightGBM']
score_roc_auc = []

def y_pred_func(i):
    """Fit `i` behind the shared preprocessing step and predict the test split.

    Parameters
    ----------
    i : estimator
        Unfitted classifier placed as the final pipeline step.

    Returns
    -------
    tuple
        (fitted pipeline, predicted labels for X_test, X_test). X_test is returned
        unchanged and is kept in the tuple only for backward compatibility.
    """
    estimator=Pipeline([
        ('preprocess',transformer),
        ('model',i)])

    estimator.fit(X_train,y_train)
    return(estimator,estimator.predict(X_test),X_test)

for model, name in zip(models, model_names):
    # The third tuple element is the global X_test itself; discard it rather than
    # rebinding the global name inside the loop.
    estimator, y_pred, _ = y_pred_func(model)
    y_predict_proba = estimator.predict_proba(X_test)[:,1]
    score_roc_auc.append(roc_auc_score(y_test,y_predict_proba))
    print(name,'\n', classification_report(y_test,y_pred))

pd.DataFrame({'model': model_names,
             'roc_auc score':score_roc_auc}).set_index('model').sort_values(by='roc_auc score',ascending=False)

### Logistic Regression

In [142]:
# Baseline: default logistic regression behind the shared preprocessing step.
logreg = LogisticRegression()

estimator = Pipeline(steps=[
    ('transformer', transformer),
    ('model', logreg),
])

In [143]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
estimator.fit(X_train, y_train)

In [144]:
# Predicted labels for the held-out split, summarised per class.
y_pred  = estimator.predict(X_test)
print(classification_report(y_test, y_pred))

In [145]:
# ROC curve of the baseline pipeline on the held-out split.
# NOTE(review): plot_roc_curve was removed from newer scikit-learn releases in favour
# of RocCurveDisplay.from_estimator — confirm the pinned scikit-learn version.
plot_roc_curve(estimator, X_test, y_test);

### Balanced Logistic Regression

In [146]:
# Same pipeline as the baseline, with class_weight='balanced' on the classifier.
logreg_balanced = LogisticRegression(class_weight='balanced')

estimator_balanced = Pipeline(steps=[
    ('transformer', transformer),
    ('model', logreg_balanced),
])

In [147]:
# Fit the class-weighted logistic-regression pipeline on the training split.
estimator_balanced.fit(X_train, y_train)

In [148]:
# Predicted labels of the class-weighted pipeline for the held-out split, summarised per class.
y_pred  = estimator_balanced.predict(X_test)
print(classification_report(y_test, y_pred))

In [149]:
# ROC curve of the class-weighted pipeline on the held-out split.
plot_roc_curve(estimator_balanced, X_test, y_test);

function untuk melihat nama kolom yang telah diproses dalam transformer

In [150]:
import sklearn
import warnings  # used by the fallback branch below; it was referenced but never imported


def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Adapted from ColumnTransformer.get_feature_names so that it also works when a
    step is a Pipeline and when a transformer exposes no get_feature_names method
    (the input column names are returned instead, with a warning).

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer, or a Pipeline
        Object whose (name, transformer, columns) entries are walked.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """

    def get_names(name, trans, column):
        """Return the output names contributed by one (name, trans, column) entry."""
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: return input column names if no method is available,
            # turning the original error into a warning.
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the input
            # names to the column transformer.
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so
    # they are normalised to the 4-tuples that ColumnTransformer._iter yields.
    if isinstance(column_transformer, sklearn.pipeline.Pipeline):
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, sklearn.pipeline.Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If the pipeline has no transformer that returns names, fall back to
            # the prefixed input column names.
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            # Pass the entry explicitly instead of letting the helper read the loop
            # variables through its closure.
            feature_names.extend(get_names(name, trans, column))

    return feature_names

In [151]:
# get_feature_names(transformer)

In [152]:
# Compare the raw feature count with the number of names the transformer produces.
# NOTE(review): get_feature_names iterates the fitted state of `transformer`, which is
# fitted only as a side effect of the pipeline fits in earlier cells — confirm the run order.
print(f"Jumlah fitur X_train : {len(X_train.columns)}")
print(f"Jumlah fitur setelah di pre-processing : {len(get_feature_names(transformer))}")

In [153]:
# Transformed feature names next to the coefficients of the class-weighted
# logistic regression, largest coefficient first.
balanced_model = estimator_balanced.steps[1][1]
coef_table = pd.DataFrame(get_feature_names(transformer))
coef_table["nilai koefisien fitur"] = balanced_model.coef_[0]
coef_table.sort_values(by="nilai koefisien fitur", ascending=False)

## Random Forest Classifier

In [154]:
# Seeded random forest behind the shared preprocessing step.
rfc = RandomForestClassifier(random_state=42)

estimator_rfc = Pipeline(steps=[
    ('transformer', transformer),
    ('model rfc', rfc),
])

In [155]:
# Fit the preprocessing + random-forest pipeline on the training split.
estimator_rfc.fit(X_train, y_train)

In [156]:
# Predicted labels of the random-forest pipeline for the held-out split, summarised per class.
y_pred  = estimator_rfc.predict(X_test)
print(classification_report(y_test, y_pred))

In [157]:
# ROC curve of the random-forest pipeline on the held-out split.
plot_roc_curve(estimator_rfc, X_test, y_test);

In [158]:
# Feature-importance array of the fitted random forest (the second pipeline step).
estimator_rfc.steps[1][1].feature_importances_

In [159]:
plt.figure(figsize=(14,12))

# Pair every transformed feature name with its importance, ordered ascending by
# importance, and draw the pairs as horizontal bars.
forest = estimator_rfc.steps[1][1]
ranked = sorted(zip(get_feature_names(transformer), forest.feature_importances_),
                key=lambda pair: pair[1])
bar_labels = [pair[0] for pair in ranked]
bar_widths = [pair[1] for pair in ranked]
plt.barh(bar_labels, bar_widths)
plt.title("Random Forest Feature Importance")

plt.show()

### Hyperparameter Tuning

In [160]:
# logreg_ht = LogisticRegression(class_weight='balanced')

# estimator_logreg_ht = Pipeline([
#     ('preprocess',transformer),
#     ('model',logreg_ht)
# ])

In [161]:
# hyperparam_space = [{
#     'model__solver': ['newton-cg', 'lbfgs', 'liblinear'], 
#     'model__penalty': ['l2'], 
#     'model__C': [100, 10, 1.0, 0.1, 0.01]
# }]

In [162]:
# grid = GridSearchCV(estimator_logreg_ht, n_jobs=-1, param_grid=hyperparam_space, 
#                     scoring='roc_auc', cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42))

In [163]:
# grid.fit(X_train,y_train)
# print(grid.best_score_)
# print(grid.best_params_)

In [164]:
# best_model = grid.best_estimator_
# best_model.fit(X_train, y_train)

In [165]:
# estimator_logreg_ht.fit(X_train, y_train)

In [166]:
# y_pred_default = estimator_logreg_ht.predict(X_test)
# y_pred_proba_default = estimator_logreg_ht.predict_proba(X_test)
# y_pred_tuned = best_model.predict(X_test)
# y_pred_proba_tuned = best_model.predict_proba(X_test)

# roc_auc_default = roc_auc_score(y_test, y_pred_proba_default[:,1])
# roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned[:,1])

# print('ROC AUC Score Default Logistic Regression : ', roc_auc_default)
# print('ROC AUC Score Tuned Logistic Regression : ', roc_auc_tuned)