# 0.0 IMPORTS

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import HTML

## 0.1 Helper Function

In [3]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [12]:
# Apply the plotting and pandas display settings defined in jupyter_settings().
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Loading Data

In [15]:
# Read the semicolon-separated raw file. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks, which avoids mixed-type inference.
# NOTE(review): the columns in the output below (emp.var.rate, euribor3m, ...) look like the
# "bank-additional-full" layout rather than "bank-full" - confirm the file name.
df_marketing_raw = pd.read_csv( 'data/bank-full.csv', sep=';', low_memory=False )

# Preview a few random rows; the fixed seed keeps the preview identical across runs.
df_marketing_raw.sample( 5, random_state=42 )

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
36,34,services,married,high.school,no,no,no,telephone,may,mon,365,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6210,52,management,married,professional.course,no,yes,yes,telephone,may,tue,280,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
40342,31,technician,single,unknown,no,yes,no,cellular,aug,mon,428,2,9,3,success,-1.7,94.027,-38.3,0.898,4991.6,yes
31878,29,entrepreneur,married,high.school,no,no,no,telephone,may,thu,325,1,999,0,nonexistent,-1.8,92.893,-46.2,1.327,5099.1,no
13446,56,blue-collar,married,basic.4y,unknown,yes,no,telephone,jul,wed,300,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no


# 1.0 - DESCRIPTION OF DATA

In [16]:
# Work on a copy so that df_marketing_raw keeps the data exactly as loaded.
df1 = df_marketing_raw.copy()

## 1.1 Rename Columns

In [17]:
# List the column labels as they come from the raw file.
df1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [18]:
# Column labels as they appear in the raw file.
cols_old = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 
            'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']

# Replacement labels, position-aligned with cols_old (snake_case, no dots, abbreviations spelled out).
cols_new = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'passed_days', 'previous', 'previous_outcome', 
            'employment_var_rate', 'consumer_price_idx', 'consumer_conf_idx', 'euribor_3m', 'nr_employed', 'y']

# Rename by label rather than by position: each old name is mapped to its new name, so a
# different column order in the file cannot silently attach a label to the wrong column.
# Labels that are not present are ignored, which keeps this cell safe to re-run.
df1 = df1.rename( columns=dict( zip( cols_old, cols_new ) ) )

# Fail loudly if the frame does not end up with exactly the expected set of columns.
assert sorted( df1.columns ) == sorted( cols_new ), 'unexpected column labels after rename'

In [19]:
# Confirm the column labels after the rename.
df1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign',
       'passed_days', 'previous', 'previous_outcome', 'employment_var_rate',
       'consumer_price_idx', 'consumer_conf_idx', 'euribor_3m', 'nr_employed',
       'y'],
      dtype='object')

## 1.2 Data Dimensions

In [20]:
# Report the size of the frame to judge whether this project can run comfortably on your computer.
n_rows, n_cols = df1.shape
print( 'Number of Rows: {}'.format( n_rows ) )
print( 'Number of Cols: {}'.format( n_cols ) )

Number of Rows: 41188
Number of Cols: 21


## 1.3 Data Types

In [21]:
# Show the dtype pandas assigned to each column.
df1.dtypes

age                      int64
job                     object
marital                 object
education               object
default                 object
housing                 object
loan                    object
contact                 object
month                   object
day_of_week             object
duration                 int64
campaign                 int64
passed_days              int64
previous                 int64
previous_outcome        object
employment_var_rate    float64
consumer_price_idx     float64
consumer_conf_idx      float64
euribor_3m             float64
nr_employed            float64
y                       object
dtype: object

## 1.4 Check NA

In [22]:
# Count the NaN entries in each column.
df1.isna().sum()

age                    0
job                    0
marital                0
education              0
default                0
housing                0
loan                   0
contact                0
month                  0
day_of_week            0
duration               0
campaign               0
passed_days            0
previous               0
previous_outcome       0
employment_var_rate    0
consumer_price_idx     0
consumer_conf_idx      0
euribor_3m             0
nr_employed            0
y                      0
dtype: int64

In [23]:
# Several categorical attributes encode missing values with the literal label "unknown".
# Those entries can be kept as a class label of their own, or handled with deletion or imputation.

# As a first step, convert every "unknown" entry to NaN so that isna() counts it.
# The result is assigned back instead of mutating the frame with inplace=True.
df1 = df1.replace( {'unknown': np.nan} )

In [24]:
# Recount the NaN entries per column now that "unknown" has been replaced with NaN.
df1.isna().sum()

age                       0
job                     330
marital                  80
education              1731
default                8597
housing                 990
loan                    990
contact                   0
month                     0
day_of_week               0
duration                  0
campaign                  0
passed_days               0
previous                  0
previous_outcome          0
employment_var_rate       0
consumer_price_idx        0
consumer_conf_idx         0
euribor_3m                0
nr_employed               0
y                         0
dtype: int64

## 1.5 Fill Out NA

In [None]:
# job


# marital


# education


# default


# housing


# loan

