# 0.0 IMPORTS

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from IPython.core.display  import HTML
from IPython.display import display

## 0.1 Helper Function

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the notebook-wide plotting and display configuration defined above.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Loading Data

In [4]:
# Load the raw marketing dataset (semicolon-separated CSV).
df_marketing_raw = pd.read_csv('data/bank-full.csv', sep=";", low_memory=False)

# Preview a few random rows; the fixed seed keeps the preview identical across re-runs.
df_marketing_raw.sample(5, random_state=42)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
40737,45,admin.,divorced,university.degree,no,no,yes,telephone,sep,wed,173,1,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,no
35901,50,admin.,divorced,university.degree,no,yes,yes,cellular,may,mon,644,1,999,0,nonexistent,-1.8,92.893,-46.2,1.264,5099.1,no
11210,50,blue-collar,married,basic.4y,unknown,yes,no,telephone,jun,thu,56,2,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,no
2028,36,entrepreneur,married,university.degree,no,yes,no,telephone,may,mon,59,4,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5550,33,blue-collar,single,basic.9y,unknown,no,yes,telephone,may,mon,788,11,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes


# 1.0 - DESCRIPTION OF DATA

In [5]:
# Work on an independent (deep) copy so the raw frame is left untouched by later steps.
df1 = df_marketing_raw.copy(deep=True)

## 1.1 Rename Columns

In [6]:
# Column labels as loaded, before renaming.
df1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [7]:
# Original column labels, in file order.
cols_old = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 
            'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']

# Replacement labels, index-aligned with cols_old.
cols_new = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'passed_days', 'previous', 'previous_outcome', 
            'employment_var_rate', 'consumer_price_idx', 'consumer_conf_idx', 'euribor_3m', 'nr_employed', 'y']

# zip() truncates silently, so make sure the two lists really pair up one-to-one.
assert len(cols_old) == len(cols_new), 'cols_old and cols_new must have the same length'

# rename
# Rename by label rather than by position, so a frame whose columns arrive in a
# different order cannot be silently mislabeled. Labels that are not present are
# ignored by rename(), which keeps this cell safe to re-run.
df1 = df1.rename( columns=dict( zip( cols_old, cols_new ) ) )

# Fail loudly if the frame does not end up with exactly the expected labels.
assert set( df1.columns ) == set( cols_new ), 'unexpected columns after rename: {}'.format( list( df1.columns ) )

In [8]:
# Confirm the new column labels are in place.
df1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign',
       'passed_days', 'previous', 'previous_outcome', 'employment_var_rate',
       'consumer_price_idx', 'consumer_conf_idx', 'euribor_3m', 'nr_employed',
       'y'],
      dtype='object')

## 1.2 Data Dimensions

In [9]:
# Use these dimensions to judge whether the dataset is small enough to work with on a local machine.
print( f'Number of Rows: {len( df1 )}' )
print( f'Number of Cols: {len( df1.columns )}' )

Number of Rows: 41188
Number of Cols: 21


## 1.3 Data Types

In [10]:
# Show the dtype of every column in df1.
df1.dtypes

age                      int64
job                     object
marital                 object
education               object
default                 object
housing                 object
loan                    object
contact                 object
month                   object
day_of_week             object
duration                 int64
campaign                 int64
passed_days              int64
previous                 int64
previous_outcome        object
employment_var_rate    float64
consumer_price_idx     float64
consumer_conf_idx      float64
euribor_3m             float64
nr_employed            float64
y                       object
dtype: object

## 1.4 Check NA

In [11]:
# Count NaN values per column.
df1.isna().sum()

age                    0
job                    0
marital                0
education              0
default                0
housing                0
loan                   0
contact                0
month                  0
day_of_week            0
duration               0
campaign               0
passed_days            0
previous               0
previous_outcome       0
employment_var_rate    0
consumer_price_idx     0
consumer_conf_idx      0
euribor_3m             0
nr_employed            0
y                      0
dtype: int64

In [12]:
# Several categorical attributes encode missing values with the literal label "unknown".
# These can either be kept as their own class label or handled with deletion or imputation techniques.

# As a first step, convert every "unknown" to NaN so that pandas' missing-data tools can see them.
# Reassign instead of using inplace=True: it brings no benefit and prevents method chaining.
df1 = df1.replace( to_replace='unknown', value=np.nan )

In [13]:
# Recount NaN values per column now that "unknown" has been replaced with NaN.
df1.isna().sum()

age                       0
job                     330
marital                  80
education              1731
default                8597
housing                 990
loan                    990
contact                   0
month                     0
day_of_week               0
duration                  0
campaign                  0
passed_days               0
previous                  0
previous_outcome          0
employment_var_rate       0
consumer_price_idx        0
consumer_conf_idx         0
euribor_3m                0
nr_employed               0
y                         0
dtype: int64

## 1.5 Fill Out NA

In [None]:
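# TODO: choose a fill strategy for each column below (keep "unknown" as a category, impute, or drop rows).
# job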


# marital


# education


# default


# housing


# loan



In [15]:
# Inspect the rows whose job is missing; isna() already yields a boolean mask.
df1.loc[df1['job'].isna()]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,passed_days,previous,previous_outcome,employment_var_rate,consumer_price_idx,consumer_conf_idx,euribor_3m,nr_employed,y
29,55,,married,university.degree,,,,telephone,may,mon,362,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
35,55,,married,basic.4y,,yes,no,telephone,may,mon,336,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
73,57,,married,,,no,no,telephone,may,mon,211,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
91,57,,married,,,yes,no,telephone,may,mon,48,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
144,38,,divorced,high.school,,yes,no,telephone,may,mon,73,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
299,38,,married,,,no,no,telephone,may,mon,362,4,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
303,43,,married,,no,yes,no,telephone,may,mon,267,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
343,57,,married,,,yes,no,telephone,may,mon,325,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
388,28,,single,,,yes,yes,telephone,may,tue,1201,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
428,50,,married,,,yes,no,telephone,may,tue,185,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
