## Import required libraries

In [None]:
## Session-wide options, set in a single call:
##  - tidyverse.quiet = TRUE silences the tidyverse attach message
##  - warn = -1 makes R ignore all warnings for the rest of the session
options(
  tidyverse.quiet = TRUE,
  warn = -1
)

# tidyverse includes dplyr, tidyr, readr, ggplot2
library(tidyverse)
library(gmodels)
library(ggmosaic)

## A few graphics settings

In [None]:
## Make the black-and-white theme the default for every ggplot figure
theme_set(theme_bw())

## Theme adjustments shared by the mosaic plots: rotate the x-axis labels by
## 90 degrees and hide the y-axis text and tick marks
mosaic_theme <- theme(
  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
  axis.text.y = element_blank(),
  axis.ticks.y = element_blank()
)

## Load the data

In [None]:
## Location of the input CSV, relative to the current working directory
## (despite its name, `data_dir` holds a file path, not a directory)
data_dir <- "../data/bank-full.csv"

## Read the file into a data frame using the read.csv() defaults
bank_data <- read.csv(file = data_dir)

## Textual exploration

In [None]:
## Report the size of the data frame, then preview its first 10 rows
sprintf("# rows: %d - # columns: %d", NROW(bank_data), NCOL(bank_data))
head(bank_data, n = 10)

In [None]:
## Per-column summary of every variable in the data frame
summary(object = bank_data)

Let's check the proportion of 'yes' and 'no' entries in the target column `y`:

In [None]:
## One-way frequency table of the target column `y`
gmodels::CrossTable(bank_data$y)

We see that the data is highly unbalanced. Almost 89% of the entries have value 'no'.

Let's check how many unknown values we have for the different attributes.

In [None]:
## Count, for every column, how many entries equal the string "unknown",
## reshape to one row per variable, and list the largest counts first.
## NOTE(review): summarise_all() and gather() are superseded by across() and
## pivot_longer(); they are kept so the required package versions do not change.
bank_data %>%
  ## na.rm = TRUE: a column containing NA would otherwise yield NA, not a count
  summarise_all(list(~ sum(. == "unknown", na.rm = TRUE))) %>%
  gather(key = "variable", value = "nr_unknown") %>%
  arrange(desc(nr_unknown))

## Visualization

### A simple histogram of the `age` distribution

In [None]:
## Base-graphics histogram of the `age` column
hist(
  x = bank_data$age,
  main = "Age Distribution of Customers",
  xlab = "age"
)

### Distribution of `age` for the different classification groups (i.e. `y` column)

In [None]:
# Bar chart of the number of rows per `age` value, drawn as one panel per
# value of `y`; each panel gets its own y-axis range ("free_y"), and the
# x-axis is labelled every 5 years from 0 to 100
ggplot(data = bank_data, mapping = aes(x = age)) +
  geom_bar() +
  facet_grid(y ~ ., scales = "free_y") +
  scale_x_continuous(breaks = seq(from = 0, to = 100, by = 5))

It seems that the probability of success is lower for customers aged between 40 and 60, whereas customers older than 60 seem more likely to respond positively.

### Distribution of the categorical feature `job` for the different classification groups

In [None]:
## Mosaic plot of the outcome `y` within each `job` category, filled by `y`;
## uses the shared mosaic theme and drops the y-axis title
ggplot(data = bank_data) +
  geom_mosaic(aes(x = product(y, job), fill = y)) +
  mosaic_theme +
  theme(aspect.ratio = 0.7) +
  labs(x = "Job", y = NULL)

Success rates are higher for the group of 'retired' people and the - quite small - group of students.

### Monthly Distribution
Let's see how the number of contacts and the success rates vary over the months.

In [None]:
## Cross-tabulate month against outcome `y`, with the months in calendar order.
## table() on the raw column would sort the months alphabetically (apr, aug,
## dec, ...), which makes the x-axis unreadable as a timeline.
month_table <- local({
  observed <- sort(unique(as.character(bank_data$month)))
  ## Values matching a three-letter English month abbreviation
  ## (case-insensitive) come first, in calendar order; any other value keeps
  ## its alphabetical position after them, so no row is dropped.
  ## NOTE(review): assumes months are coded like "jan", "feb", ... - confirm.
  calendar_pos <- match(tolower(observed), tolower(month.abb))
  month_levels <- observed[order(calendar_pos)]
  table(factor(bank_data$month, levels = month_levels), bank_data$y)
})

## Normalise each `y` column to sum to 1: `perc` is the share (0-1) of rows
## with a given outcome that fall into each month, not a per-month rate.
month_tab <- as.data.frame(prop.table(month_table, 2))
colnames(month_tab) <- c("month", "y", "perc")

## Side-by-side bars per month, one bar per outcome
ggplot(data = month_tab, aes(x = month, y = perc, fill = y)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 2/3) +
  theme(aspect.ratio = 0.7) +
  xlab("Month") +
  ylab("Percent")

Obviously, most of the contacts were made in the middle of the year. The success rates were higher towards the end of the year, though. 
That's an observation that might be discussed with the customer service representative. 