### Importing the data 

In [1]:
import pandas as pd
# Read the CSV, whose fields are separated by ';' rather than ','.
bank_additional_full = pd.read_csv("bank-additional-full.csv", sep = ';')
# Bare last expression so the notebook renders its preview of the frame.
bank_additional_full

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Replace the 'unknown' label with NA and see the total missing values of each column

In [2]:
import numpy as np
# Build the working frame with every 'unknown' label turned into NaN.
# replace() without inplace returns a new DataFrame, so the raw
# bank_additional_full keeps its original 'unknown' labels and can still be
# inspected or re-processed after this cell has run.
df = bank_additional_full.replace('unknown', np.nan)
# Number of missing values in each column.
df.isnull().sum()

age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

### View the labels of each column that contains missing values

In [3]:
# Frequency of each 'housing' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['housing'].value_counts(dropna = True)

yes    21576
no     18622
Name: housing, dtype: int64

In [4]:
# Frequency of each 'loan' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['loan'].value_counts(dropna = True)

no     33950
yes     6248
Name: loan, dtype: int64

In [5]:
# Frequency of each 'job' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['job'].value_counts(dropna = True)

admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
Name: job, dtype: int64

In [6]:
# Frequency of each 'marital' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['marital'].value_counts(dropna = True)

married     24928
single      11568
divorced     4612
Name: marital, dtype: int64

In [7]:
# Frequency of each 'education' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['education'].value_counts(dropna = True)

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: education, dtype: int64

In [8]:
# Frequency of each 'default' label; missing (NaN) entries are left out,
# which is value_counts' default behaviour (dropna=True).
df['default'].value_counts(dropna = True)

no     32588
yes        3
Name: default, dtype: int64

### Fill the missing values with the appropriate method in each case

In [9]:
# Impute missing 'loan' entries with the most frequent label
# (value_counts sorts by count, so index[0] is the top label).
# The filled column is assigned back to df: calling fillna(inplace=True) on the
# df['loan'] selection is chained assignment, which pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['loan'] = df['loan'].fillna(df['loan'].value_counts().index[0])

In [10]:
# Impute missing 'marital' entries with the most frequent label
# (value_counts sorts by count, so index[0] is the top label).
# The filled column is assigned back to df: calling fillna(inplace=True) on the
# df['marital'] selection is chained assignment, which pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['marital'] = df['marital'].fillna(df['marital'].value_counts().index[0])

In [11]:
# Impute missing 'default' entries with the most frequent label
# (value_counts sorts by count, so index[0] is the top label).
# The filled column is assigned back to df: calling fillna(inplace=True) on the
# df['default'] selection is chained assignment, which pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['default'] = df['default'].fillna(df['default'].value_counts().index[0])

In [12]:
def na_randomfill(function, random_state = 0):
    """Fill the missing entries of a Series with values drawn from its own observed entries.

    Each missing position receives a value sampled (with replacement) from the
    non-missing values of the same Series, so the filled values follow the
    empirical distribution of the observed data.

    Parameters
    ----------
    function : pandas.Series
        The column to impute. Despite its name it is a Series, e.g. one column
        handed over by ``DataFrame.apply``. It is not modified.
    random_state : int, optional
        Seed forwarded to ``Series.sample``. Defaults to 0, so repeated calls
        on the same data give the same result.

    Returns
    -------
    pandas.Series
        ``function`` itself when there is nothing to fill (no missing values)
        or nothing to sample from (every value is missing); otherwise a new
        Series with the missing entries filled.
    """
    na = function.isna()
    number_null = int(na.sum())
    if number_null == 0:
        return function

    observed = function[~na]
    if observed.empty:
        # An all-missing column has no values to draw from; sampling from an
        # empty Series would raise, so hand the column back untouched.
        return function

    fill_values = observed.sample(n = number_null, replace = True, random_state = random_state)
    # Relabel the sampled values with the labels of the missing positions so
    # that fillna aligns each sampled value with one gap.
    fill_values.index = function.index[na]
    return function.fillna(fill_values)

In [13]:
# Run the random imputer over every column (axis=0 hands each column to
# na_randomfill as a Series), then recount the missing values per column.
df = df.apply(na_randomfill, axis = 0)
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

### Finally, we have zero missing values!