In [51]:
import pandas as pd
import numpy as np

# Bank Telemarketing Campaign - Opening Deposit Classification

### Context 

Term Deposit adalah investasi deposito keuangan dengan jangka waktu tertentu pada sebuah institusi keuangan/bank. Jangka waktu Term Deposit berkisar antara 1 bulan hingga beberapa tahun dan memiliki nilai minimum yang berbeda-beda. Dengan bantuan telemarketing, bank dapat memasarkan berbagai macam produk dan jasanya secara langsung kepada customer. 

### Problem Statement

Telemarketing merupakan cara pemasaran produk yang saat ini banyak digunakan oleh perusahaan di berbagai macam industri. Strategi telemarketing dinilai mampu meningkatkan pendapatan dan pencapaian target bank karena lebih efisien dari segi waktu dan jarak. 
Bank ingin melakukan telemarketing ke klien yang kemungkinan besar akan membuka deposito.

### Goals 

Berdasarkan permasalahan tersebut, bank ingin memprediksi kemungkinan seorang klien akan membuka deposito atau tidak. Selain itu, pihak marketing bank juga ingin dapat mengidentifikasi klien potensial menggunakan model yang sudah dibangun, berdasarkan fitur-fitur penting yang memengaruhi keputusan klien. 

### Analytic Approach

Jadi yang akan dilakukan adalah menganalisis data untuk menemukan pola yang membedakan klien yang akan membuka deposito dan yang tidak.
Kemudian akan dibangun model klasifikasi yang akan membantu tim marketing bank untuk dapat memprediksi probabilitas seorang klien akan membuka deposito atau tidak.

### Metric Evaluation

![../input/images/metric_bank.png](../input/images/metric_bank.png)

* Type I Error (False Positive) : Kerugian waktu dan sumber daya dalam melakukan telemarketing
* Type II Error (False Negative) : Kehilangan calon customer potensial



In [52]:
# Load the raw campaign records; the CSV uses ';' as its field separator.
DATA_PATH = '../input/bank-marketing-campaigns-dataset/bank-additional-full.csv'
df = pd.read_csv(DATA_PATH, sep=';')
df

In [53]:
import seaborn as sns
import matplotlib.pyplot as plt

# One grid row per feature: a histogram coloured by target class on the left,
# a per-class boxplot on the right (4 rows x 2 columns).
plt.figure(figsize=(18, 22))

for row, feature in enumerate(['age', 'duration', 'campaign', 'pdays']):
    label = feature.capitalize()

    plt.subplot(4, 2, 2 * row + 1)
    sns.histplot(data=df, x=feature, hue='y', kde=True, bins=20)
    plt.title(f'{label} Histogram')

    plt.subplot(4, 2, 2 * row + 2)
    sns.boxplot(data=df, y=feature, x='y')
    plt.title(f'{label} Boxplot')

In [54]:
# Same layout as the previous figure, applied to the economic-indicator
# columns: histogram by target class on the left, boxplot on the right.
plt.figure(figsize=(18, 22))

for row, feature in enumerate(['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m']):
    plt.subplot(4, 2, 2 * row + 1)
    sns.histplot(data=df, x=feature, hue='y', kde=True, bins=20)
    plt.title(f'{feature} Histogram')

    plt.subplot(4, 2, 2 * row + 2)
    sns.boxplot(data=df, y=feature, x='y')
    plt.title(f'{feature} Boxplot')

In [55]:
# Per-column summary: dtype, missing-value count and percentage, number of
# distinct values, and up to two example values.
listItem = []
for col in df.columns:
    n_missing = df[col].isna().sum()
    distinct_values = df[col].drop_duplicates()
    # Cap the sample size so a column with fewer than two distinct values does
    # not raise, and seed the draw so re-running the cell shows the same examples.
    examples = distinct_values.sample(min(2, len(distinct_values)), random_state=42)
    listItem.append([
        col,
        df[col].dtype,
        n_missing,
        round((n_missing / len(df[col])) * 100, 2),
        df[col].nunique(),
        list(examples.values),
    ])

dfDesc = pd.DataFrame(columns=['dataFeatures', 'dataType', 'null', 'nullPct', 'unique', 'uniqueSample'],
                     data=listItem)
dfDesc

Dalam dataset ini missing data ditulis sebagai unknown. Untuk EDA dan modeling nilai unknown akan dikategorikan sebagai kategori tersendiri. Hal tersebut diasumsikan klien tidak ingin memberikan informasi tertentu.

In [56]:
# Encode the target as an integer flag: 'no' -> 0, every other value -> 1.
df['y'] = (df['y'] != 'no').astype(int)

In [57]:
# Correlation of the numeric columns with the target column
def corr_to_target(dataframe, target, title=None, file=None):
    """Draw a one-column heatmap of each numeric column's correlation with ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to correlate. Non-numeric columns are ignored.
    target : str
        Name of the column to correlate against; it must be numeric (or bool).
    title : str, optional
        Figure title. No title is drawn when omitted or empty.
    file : str, optional
        When given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(6, 8))
    sns.set(font_scale=1)

    # Select numeric/bool columns explicitly: calling corr() on a frame that
    # still holds string columns raises on pandas >= 2.0, while older versions
    # silently dropped them. Selecting first gives the same result on both.
    numeric_part = dataframe.select_dtypes(include=['number', 'bool'])
    ranked = numeric_part.corr()[[target]].sort_values(target, ascending=False)
    sns.heatmap(ranked, annot=True, cmap='coolwarm')

    if title:
        plt.title(f'\n{title}\n', fontsize=18)
    plt.xlabel('')
    plt.ylabel('')
    if file:
        plt.savefig(file, bbox_inches='tight')
    plt.show()

In [58]:
# Rank the numeric features by their correlation with the encoded target.
corr_to_target(df, 'y', title='Correlation to Target')

In [59]:
from scipy.stats import normaltest
# Print the normaltest p-value for every numeric column.
for col in df.select_dtypes(include=[np.number]).columns:
    p_value = normaltest(df[col]).pvalue
    print(f"{col} : {p_value}")

Tidak ada kolom numerik yang terdistribusi normal.

In [60]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between each string-typed column and the
# target; columns whose p-value exceeds alpha are flagged with "==> X".
alpha = 0.05
for col in df.select_dtypes(include=['object']).columns:
    print(f"Independence test untuk kolom {col} dan y(target) : ")
    contingency = pd.crosstab(df[col], df['y'])
    stat, p, dof, expected = chi2_contingency(contingency)
    print(f"p value is {p}")
    print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true) ==> X')
    print('-------------------------')

kolom `housing` dan `loan` independen terhadap kolom target.

In [61]:
# Share of all rows (in percent) for each housing / target combination.
pd.crosstab(df['housing'], df['y']).mul(100).div(len(df))

In [62]:
# Share of all rows (in percent) for each loan / target combination.
pd.crosstab(df['loan'], df['y']).mul(100).div(len(df))

In [63]:
# Bucket the number of contacts into three bands, one label per row of df.
# Values 1-3 map to the first label, 4-7 to the second, everything else to
# the third.
# NOTE(review): the label text does not match the bin edges exactly (4 falls
# in '4-8 calls', 8 falls in '>8 calls'); the labels are kept unchanged here
# because they become category names downstream.
contact_counts = df['campaign']
campaign_cat = np.select(
    [contact_counts.isin([1, 2, 3]), contact_counts.isin([4, 5, 6, 7])],
    ['=<4 calls', '4-8 calls'],
    default='>8 calls',
).tolist()

In [64]:
# Replace the raw contact count with its categorical band.
df = df.assign(campaign=campaign_cat)
df

In [65]:
# Chi-square test of independence between the banded campaign column and the target.
print("Independence test untuk kolom campaign dan y(target) : ")
alpha = 0.05
campaign_vs_target = pd.crosstab(df['campaign'], df['y'])
stat, p, dof, expected = chi2_contingency(campaign_vs_target)
print(f"p value is {p}")
print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true)')

In [66]:
# Discretise each economic indicator into four equal-width bins; with
# labels=False every value is replaced by its integer bin index.
for col in ('emp.var.rate', 'cons.conf.idx', 'cons.price.idx', 'nr.employed', 'euribor3m'):
    df[col] = pd.cut(df[col], bins=4, labels=False)

In [67]:
# Chi-square test of independence between each binned indicator and the target.
binning_cols = ['emp.var.rate', 'cons.conf.idx', 'cons.price.idx', 'nr.employed', 'euribor3m']
alpha = 0.05
for col in binning_cols:
    print(f"Independence test untuk kolom {col} dan y(target) : ")
    contingency = pd.crosstab(df[col], df['y'])
    stat, p, dof, expected = chi2_contingency(contingency)
    print(f"p value is {p}")
    print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 true) ==> X')
    print('-------------------------')

In [68]:
# For every string-typed column, show the share of each target class per
# category, ordered by the share of class 1 (highest first).
for i in df.select_dtypes(include=['object']).columns:
    class_share = df.groupby(i)['y'].value_counts(normalize=True)
    relevent_experience_df = class_share.unstack()
    display(relevent_experience_df.sort_values(by=1, ascending=False))

In [69]:
# Collapse pdays to a flag: 0 where the value is the 999 sentinel, 1 otherwise.
df['pdays'] = (df['pdays'] != 999).astype(int)

In [70]:
# Collapse previous to a flag: 1 when the value is at least 1, else 0.
df['previous'] = (df['previous'] >= 1).astype(int)
df['previous'].value_counts()

In [71]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, plot_roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier

In [72]:
# Remove the columns that are excluded from modelling.
df = df.drop(columns=['duration', 'housing', 'loan'])
df

In [73]:
# One-hot encode the listed columns (dropping the first level of each);
# every other column is passed through unchanged.
onehot_columns = ['default', 'previous', 'marital', 'contact', 'day_of_week',
                  'education', 'job', 'poutcome', 'campaign', 'month']

transformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first'), onehot_columns)],
    remainder='passthrough',
)

In [74]:
# Separate features from the target, then hold out 20% of the rows as a test
# set, stratified on the target so both parts keep the same class ratio.
y = df['y']
X = df.drop(columns='y')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Logistic Regression

In [75]:
# Baseline: preprocessing followed by an unweighted logistic regression.
logreg = LogisticRegression(solver='liblinear')
estimator = Pipeline(steps=[('transformer', transformer), ('model', logreg)])

In [76]:
# Fit the preprocessing and the baseline model on the training split.
estimator.fit(X_train, y=y_train)

In [77]:
# Per-class precision / recall / F1 of the baseline on the held-out split.
y_pred = estimator.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [78]:
# ROC curve of the baseline on the held-out split.
plot_roc_curve(estimator=estimator, X=X_test, y=y_test);

## Balanced Logistic Regression

In [79]:
# Same pipeline as the baseline, but with class weights set to 'balanced'.
logreg_balanced = LogisticRegression(class_weight='balanced', solver='liblinear')
estimator_balanced = Pipeline(
    steps=[('transformer', transformer), ('model', logreg_balanced)]
)

In [80]:
# Fit the class-weighted pipeline on the training split.
estimator_balanced.fit(X_train, y=y_train)

In [81]:
# Per-class precision / recall / F1 of the class-weighted model on the held-out split.
y_pred = estimator_balanced.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [82]:
# ROC curve of the class-weighted model on the held-out split.
plot_roc_curve(estimator=estimator_balanced, X=X_test, y=y_test);

Fungsi berikut digunakan untuk melihat nama kolom yang dihasilkan setelah data diproses oleh transformer.

In [83]:
import sklearn
def get_feature_names(column_transformer):
    """Get feature names from all transformers.

    Walks the steps of a fitted column transformer (or of a pipeline) and
    collects the output feature names of each step, prefixing names that come
    from a step with ``"<step name>__"``. Pipelines nested inside are handled
    recursively.

    Relies on private scikit-learn members (``_iter``, ``_df_columns``,
    ``_n_features``) and on the transformers' ``get_feature_names`` method, so
    it is tied to the scikit-learn version the notebook was written for.

    Parameters
    ----------
    column_transformer : ColumnTransformer or Pipeline
        The object whose output feature names are wanted.

    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Imported locally so the function is self-contained: the fallback branch
    # in get_names() calls warnings.warn, which otherwise raises NameError
    # because the notebook never imports `warnings`.
    import warnings

    # Turn lookup into a function for better handling with pipelines later.
    # It reads `name` and `column` from the enclosing loop below.
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
            # >>> Change: return input column names if no method is available,
            # and emit a warning instead of raising an error.
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_feature_names method, use the
            # input names to the column transformer.
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named
    # differently, so they are reshaped into the same 4-tuple layout first.
    if isinstance(column_transformer, sklearn.pipeline.Pipeline):
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))

    for name, trans, column, _ in l_transformers:
        if isinstance(trans, sklearn.pipeline.Pipeline):
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # If the pipeline has no transformer that returns names, fall
            # back to the prefixed input column names.
            if len(_names) == 0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))

    return feature_names

In [84]:
# List the output feature names of the fitted preprocessing step.
get_feature_names(column_transformer=transformer)

In [85]:
# Pair every output feature name with its coefficient in the class-weighted
# logistic regression and show them from largest to smallest.
feature_labels = get_feature_names(transformer)
balanced_coefs = estimator_balanced.named_steps['model'].coef_[0]

coef_table = pd.DataFrame({0: feature_labels, 'Coefs': balanced_coefs})
coef_table.sort_values(by='Coefs', ascending=False)

## Random Forest Classifier

In [86]:
# Same preprocessing, followed by a random forest with a fixed seed.
rfc = RandomForestClassifier(random_state=42)
estimator_rfc = Pipeline(steps=[('transformer', transformer), ('model rfc', rfc)])

In [87]:
# Fit the random-forest pipeline on the training split.
estimator_rfc.fit(X_train, y=y_train)

In [88]:
# Per-class precision / recall / F1 of the random forest on the held-out split.
y_pred = estimator_rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [89]:
# ROC curve of the random forest on the held-out split.
plot_roc_curve(estimator=estimator_rfc, X=X_test, y=y_test);