# Classification

- Dataset: Bank Marketing Data Set https://archive.ics.uci.edu/ml/datasets/bank+marketing

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from typing import Union
import calendar


# Load the raw dataset; the CSV fields are separated by semicolons.
DF_RAW = pd.read_csv("bank-full.csv", sep=";")


def expand_to_one_hot_vector(dataframe, column_name: str):
    """Replace a categorical column with its one-hot indicator columns.

    Args:
        dataframe: Source DataFrame. It is not modified.
        column_name: Name of the categorical column to expand.

    Returns:
        A new DataFrame in which ``column_name`` has been dropped and the
        indicator columns, named ``<column_name>_<value>``, have been
        appended on the right. Indicators are stored as ``uint8`` 0/1.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    indicator_columns = pd.get_dummies(
        dataframe[column_name],
        prefix=column_name,
        # pandas >= 2.0 defaults to bool indicators; pin the former uint8
        # default so the encoded columns are numeric 0/1 on every version.
        dtype=np.uint8,
    )
    remaining_columns = dataframe.drop(columns=[column_name])
    return pd.concat([remaining_columns, indicator_columns], axis=1)


def numerate_binary_labels(
    dataframe,
    column_name: str,
    label_0: Union[str, bool] = 'no',
    label_1: Union[str, bool] = 'yes'
) -> None:
    """Convert a two-valued label column to integers 0/1, in place.

    Args:
        dataframe: DataFrame whose column is rewritten in place.
        column_name: Name of the column to convert.
        label_0: Label that is mapped to 0.
        label_1: Label that is mapped to 1.

    Returns:
        None. ``dataframe[column_name]`` is updated as a side effect.

    Notes:
        * A boolean column is cast to integers directly.
        * A column that already contains nothing but 0 and/or 1 is left
          untouched, so calling the function again is a no-op.
        * Any value equal to neither ``label_0`` nor ``label_1`` becomes
          NaN, because that is how ``Series.map`` treats unmapped keys.
    """
    column = dataframe[column_name]

    if column.dtype == np.bool_:
        dataframe[column_name] = column.astype(np.int_)
        return

    # Subset test rather than equality: a column holding only 0s (or only
    # 1s) is already numerated and must not be pushed through the label
    # map, which would turn every value into NaN.
    if set(column.values) <= {0, 1}:
        return

    dataframe[column_name] = column.map({label_1: 1, label_0: 0})


In [2]:
# Preview the first five rows of the raw data.
display(DF_RAW.head(n=5))

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
print(f"shape:{DF_RAW.shape}\n")
print(f"y:\n{DF_RAW['y'].value_counts()}\n")
print(
    f"Sales success rate = {DF_RAW['y'].value_counts()['yes']/len(DF_RAW)}"
)

shape:(45211, 17)

y:
no     39922
yes     5289
Name: y, dtype: int64

Sales success rate = 0.11698480458295547


In [4]:
# Count the missing values in each column.
missing_per_column = DF_RAW.isnull().sum()
print(missing_per_column)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [5]:
# One-hot encode each multi-valued categorical column. Work on a copy so
# that DF_RAW itself is never modified.
df = DF_RAW.copy()

for categorical_column in (
    'job',
    'marital',
    'education',
    'contact',
    'poutcome',
):
    df = expand_to_one_hot_vector(df, categorical_column)

display(df.head())

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,no,2143,yes,no,5,may,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,no,29,yes,no,5,may,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,no,2,yes,yes,5,may,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,no,1506,yes,no,5,may,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,no,1,no,no,5,may,198,1,-1,...,0,0,1,0,0,1,0,0,0,1


In [6]:
# Encode the yes/no columns as integers 1/0; df is updated in place.
for binary_column in ('default', 'housing', 'loan', 'y'):
    numerate_binary_labels(df, binary_column)

display(df.head())

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,0,2143,1,0,5,may,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,may,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,may,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,may,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,may,198,1,-1,...,0,0,1,0,0,1,0,0,0,1


In [7]:
# Inspect the distinct month names (jan, feb, ...) that are about to be
# replaced with the numbers 1-12.
df.loc[:, 'month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
       'mar', 'apr', 'sep'], dtype=object)

In [8]:
# Replace the month abbreviations in df['month'] with the numbers 1-12.
# The abbreviations are spelled out instead of being derived from
# calendar.month_name: that array follows the current locale, so under a
# non-English locale its entries would no longer match the English
# abbreviations stored in the dataset and every month would map to NaN.
month_names = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]
month_numbers = {
    name: number for number, name in enumerate(month_names, start=1)
}

# Skip the mapping when the column is already numeric: re-running this
# cell would otherwise look up numbers in a name-keyed dict and turn the
# whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['month']):
    df['month'] = df['month'].map(month_numbers)

display(df)

Unnamed: 0,age,default,balance,housing,loan,day,month,duration,campaign,pdays,...,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,0,2143,1,0,5,5,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,5,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,5,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,5,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,5,198,1,-1,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,17,11,977,3,-1,...,0,1,0,1,0,0,0,0,0,1
45207,71,0,1729,0,0,17,11,456,2,-1,...,0,0,0,1,0,0,0,0,0,1
45208,72,0,5715,0,0,17,11,1127,5,184,...,1,0,0,1,0,0,0,0,1,0
45209,57,0,668,0,0,17,11,508,4,-1,...,1,0,0,0,1,0,0,0,0,1
