# Classification Methods in R

We're using the dataset from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

## Set up environment and required packages

The package *tidyverse* includes *dplyr, tidyr, readr, ggplot2*

In [None]:
# Silence warnings for the rest of the session: a negative `warn` value makes
# R ignore all warnings (presumably to keep the notebook output uncluttered).
options(warn = -1)

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ROCR))

## Load the data
We have 3 data sets
1. The full bank data set with more than 41,000 entries, quite unbalanced
2. A smaller subset - still unbalanced
3. A balanced sample of the full data set with ~ 9200 entries

In [None]:
# Directory holding the CSV files (relative path, trailing slash included).
data_dir <- "../data/"
# Candidate file names; the index used below selects which one is loaded.
data_sets <- c("bank-full.csv", "bank-10percent.csv", "bank-balanced.csv")
# Index 3 picks "bank-balanced.csv"; the path is plain string concatenation.
data_set <- paste0(data_dir, data_sets[3])
bank_data <- read.csv(file = data_set)

Let's briefly explore it

In [None]:
# Print the number of rows and columns of the loaded data frame
# (the column count includes every column of `bank_data`).
cat(
  "# data rows: ", dim(bank_data)[1],
  "- # features: ", dim(bank_data)[2],
  "\n"
)
# Per-column summary statistics.
summary(bank_data)

### Some minor data manipulation
Convert the 'yes'/'no' values of the label to 1/0

In [None]:
# Recode the label column `y`: "yes" becomes 1, every other non-missing value
# becomes 0 (missing values stay NA).
# The guard skips the recoding when `y` contains no "yes", so the cell can be
# re-run after the conversion. `na.rm = TRUE` keeps the guard from evaluating
# to NA (which would make `if` fail) when `y` contains missing values.
if (any(bank_data$y == "yes", na.rm = TRUE)) {
  bank_data <- bank_data %>%
    mutate(y = ifelse(y == "yes", 1, 0))
}

## Partition the data in training and test set
We're using the simple 80/20 split

In [None]:
# Fix the RNG seed so the split is reproducible.
set.seed(4711)
# 80/20 split, stratified by class. For a numeric `y`, createDataPartition()
# groups the values by percentiles, which for a 0/1 label can collapse into a
# single group (i.e. no stratification). Passing the label as a factor makes
# caret sample within each class, so both sets keep the class ratio.
partition <- createDataPartition(factor(bank_data$y), p = 0.8, list = FALSE)

# Rows selected by the partition form the training set, the rest the test set.
train.df <- bank_data[partition, ]
test.df  <- bank_data[-partition, ]

cat("Number of training samples :", nrow(train.df), "\n")
cat("Number of test samples     :", nrow(test.df), "\n")

## Build the Model
We fit a logistic regression using the routine *glm* (generalized linear models) with the binomial family

In [None]:
# Fit a binomial GLM (logistic regression with the default logit link):
# the label `y` is modelled on all other columns of the training set.
mod.log <- glm(formula = y ~ ., family = binomial, data = train.df)
# Show the coefficient table and fit statistics.
summary(mod.log)

## Evaluate on the test set

In [None]:
# Probability cut-off: predictions below it are labelled 0, all others 1.
thresh.pred <- 0.5
# type = "response" yields predictions on the probability scale.
probs <- predict(mod.log, newdata = test.df, type = "response")

# Store the hard class prediction next to the true label in the test set.
test.df <- mutate(test.df, pred.log = ifelse(probs < thresh.pred, 0, 1))

### Confusion Matrix

In [None]:
# Cross-tabulate the true labels (rows) against the predicted labels (columns).
table(y = test.df$y, pred.log = test.df$pred.log)

### Classification Accuracy

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy <- mean(test.df$y == test.df$pred.log)
cat("Classification Accuracy : ", format(100 * accuracy, digits = 4), "%\n")

### ROC Curve

In [None]:
# Wrap the predicted probabilities and the true labels in a ROCR prediction
# object, then derive true-positive rate vs. false-positive rate from it.
pred <- prediction(predictions = probs, labels = test.df$y)
roc_perf <- performance(pred, measure = "tpr", x.measure = "fpr")
# colorize = TRUE colours the curve according to the cut-off value.
plot(roc_perf, colorize = TRUE)

### Area under Curve (AUC)

In [None]:
# Area under the ROC curve, computed by ROCR from the same prediction object.
auc_perf <- performance(pred, measure = "auc")
# `y.values` is a list; its first element holds the AUC value.
auc <- auc_perf@y.values[[1]]
cat("AUC : ", format(auc, digits = 4), "\n")