# Logistic Regression in R

We're using the dataset from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

## Set up environment and required packages

The package *tidyverse* includes *dplyr, tidyr, readr, ggplot2*

In [None]:
# Silence all warnings for the rest of the session to keep the notebook
# output clean.
# NOTE(review): this also hides warnings raised by later calls in this
# notebook; consider removing it while debugging.
options(warn=-1)

# Attach the packages used below without printing their startup messages:
# tidyverse (pipe and mutate), caret (data partitioning), ROCR (ROC / AUC).
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ROCR))

## Load the data
We have 3 data sets:
1. The full bank data set with more than 41,000 entries, quite unbalanced
2. A smaller subset - still unbalanced
3. A balanced sample of the full data set with ~ 9200 entries

In [None]:
# Directory that holds the CSV files, and the data sets available in it
# (file names without the ".csv" extension).
data_dir_default <- "../data/"
data_sets <- c("bank-full", "bank-10percent", "bank-balanced")

# Read one of the bank data sets into a data frame.
#
# Args:
#   data_set: name of the data set, i.e. the file name without ".csv".
#   data_dir: directory containing the file, with or without a trailing
#             path separator; "" means the current working directory.
#
# Returns the data frame produced by read.csv().
# Stops with an explicit message when the file does not exist.
read_data <- function(data_set, data_dir = data_dir_default) {
  # Only insert a separator when the directory does not already end in one,
  # so both "../data" and "../data/" resolve to the same file.
  has_sep <- endsWith(data_dir, "/") || endsWith(data_dir, "\\")
  separator <- if (nzchar(data_dir) && !has_sep) "/" else ""
  csv_path <- paste0(data_dir, separator, data_set, ".csv")

  if (!file.exists(csv_path)) {
    stop("Data file not found: ", csv_path, call. = FALSE)
  }
  read.csv(csv_path)
}

We're using the balanced data set first. Let's load and briefly explore it.

In [None]:
# Read the balanced sample (third entry of data_sets) and report its size
bank_data <- read_data(data_sets[[3]])
cat("# data rows: ", dim(bank_data)[1], "- # features: ", dim(bank_data)[2], "\n")
# Per-column summary, shown as the cell output
summary(bank_data)

### Some minor data manipulation
Convert the 'yes'/'no' values of the label to 1/0

In [None]:
# Recode the label only while it still contains "yes" values, so re-running
# this cell leaves an already numeric label untouched.
if (any(bank_data$y == "yes")) {
  bank_data$y <- ifelse(bank_data$y == "yes", 1, 0)
}

## Partition the data into training and test sets
We're using a simple 80/20 split.

In [None]:
# Fixed seed so the split is reproducible from run to run
set.seed(4711)

# Hand the label to caret as a factor: for a factor, createDataPartition
# samples within each class, so train and test keep the class proportions.
# A numeric 0/1 label would instead be binned by percentiles.
partition <- createDataPartition(factor(bank_data$y), p = 0.8, list = FALSE)

# Rows selected by the partition form the training set, the rest the test set
train.df <- bank_data[partition, ]
test.df  <- bank_data[-partition, ]

cat("Number of training samples :", nrow(train.df), "\n")
cat("Number of test samples     :", nrow(test.df), "\n")

## Build the Model
Using the routine *glm* (generalized linear models)

In [None]:
# Logistic regression of the label on all remaining columns of the training set
mod.log <- glm(formula = y ~ ., family = binomial, data = train.df)
# Coefficient table and fit statistics, shown as the cell output
summary(mod.log)

## Evaluate on the test set

In [None]:
# Decision threshold: probabilities at or above it are labelled 1, others 0
thresh.pred <- 0.5
# Predicted probabilities for the held-out rows
probs <- predict(mod.log, newdata = test.df, type = "response")

# Store the hard 0/1 prediction next to the true label
test.df <- mutate(test.df, pred.log = ifelse(probs >= thresh.pred, 1, 0))

### Confusion Matrix

In [None]:
# Cross-tabulate true labels (rows) against predicted labels (columns)
table(y = test.df$y, pred.log = test.df$pred.log)

### Classification Accuracy

In [None]:
# Share of test rows whose predicted class equals the true label
accuracy <- mean(test.df$y == test.df$pred.log)
cat("Classification Accuracy : ", format(100 * accuracy, digits = 4), "%\n")

### ROC Curve

In [None]:
# Pair the predicted probabilities with the true labels for ROCR
pred <- prediction(predictions = probs, labels = test.df$y)
# True positive rate (y axis) against false positive rate (x axis)
roc_perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc_perf, colorize = TRUE)

### Area under Curve (AUC)

In [None]:
# ROCR stores the AUC as the first element of the y.values slot
auc_perf <- performance(pred, measure = "auc")
auc <- slot(auc_perf, "y.values")[[1]]
cat("AUC : ", format(auc, digits = 4), "\n")

## Run the Logistic Regression on all data sets

In [None]:
# Fit a logistic regression on one data set and report test-set metrics.
#
# Reads the data set with read_data(), recodes a "yes"/"no" label to 1/0,
# splits the rows into a training and a test set, fits
# glm(y ~ ., family = binomial) on the training rows and prints the
# classification accuracy and the AUC measured on the test rows.
#
# Args:
#   data_set:   name of the data set (file name without ".csv").
#   train_frac: fraction of rows used for training (default 0.8).
#   threshold:  probability cut-off; predictions below it count as class 0,
#               all others as class 1 (default 0.5).
#   seed:       seed passed to set.seed() before the split (default 4711).
#
# Returns, invisibly, a list with elements data_set, accuracy and auc so the
# metrics can be collected by a caller in addition to being printed.
build_and_evaluate_model <- function(data_set,
                                     train_frac = 0.8,
                                     threshold = 0.5,
                                     seed = 4711) {
  data <- read_data(data_set)
  if (any(data$y == "yes")) {
    data <- data %>%
      mutate(y = ifelse(y == "yes", 1, 0))
  }

  # Seed the RNG so the same rows are selected on every run.
  set.seed(seed)
  # Pass the label as a factor so caret samples within each class and both
  # splits keep the class proportions, also for the unbalanced data sets.
  partition <- createDataPartition(factor(data$y), p = train_frac, list = FALSE)
  train_df <- data[partition, ]
  test_df <- data[-partition, ]

  model <- glm(y ~ ., data = train_df, family = binomial)

  probs <- predict(model, newdata = test_df, type = "response")
  predicted <- ifelse(probs < threshold, 0, 1)

  accuracy <- mean(test_df$y == predicted)
  pred <- prediction(probs, test_df$y)
  auc <- performance(pred, "auc")@y.values[[1]]

  cat("Data : ",
      data_set,
      "\t - accuracy : ", format(100 * accuracy, digits = 4),
      "\t - AUC : ", auc,
      "\n")
  # Push the line out immediately so progress is visible between data sets.
  flush.console()

  invisible(list(data_set = data_set, accuracy = accuracy, auc = auc))
}

# Fit and score the model once per data set; the function prints its own
# report line, so the collected return values are discarded.
invisible(lapply(data_sets, build_and_evaluate_model))