# Problem Statement

A Kenyan entrepreneur has created an online cryptography course and would want to advertise it on her blog. She currently targets audiences originating from various countries. In the past, she ran ads to advertise a related course on the same blog and collected data in the process. The project below presents a solution that would allow her to determine whether ads targeted to audiences of certain characteristics i.e. city, male country, ad topic, etc. would click on her ads. 

The solution involves creating a prediction model that will accurately predict whether a user will click an Ad.


# Metrics of Success

# Experimental Design

# Loading the Dataset

In [10]:
#code to suppress warnings
#setting the option warn 
options(warn=-1)

In [11]:
#installing the packages
# List of packages
package_list <- c("tidyverse", "lubridate", "readxl", "tidyr", "Hmisc", "skimr",
                  "ggcorrplot","caret", "caretEnsemble", "PerformanceAnalytics",
                  "kableExtra", "kernlab", "randomForest", "xgboost")

# Load/Install packages
suppressMessages(pacman::p_load(package_list, character.only = TRUE))

In [14]:
#install tibble to be able to convert the dataframe into tibble
install.packages("tibble")

In [105]:
#loading the dataset and coverting it into Tibble
adv <- read.csv("advertising.csv", stringsAsFactors=FALSE)

head(adv)


Daily.Time.Spent.on.Site,Age,Area.Income,Daily.Internet.Usage,Ad.Topic.Line,City,Male,Country,Timestamp,Clicked.on.Ad
68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
59.99,23,59761.56,226.74,Sharable client-driven software,Jamieberg,1,Norway,2016-05-19 14:30:17,0


In [None]:
#converting the adv dataframe to a tibble
as_tibble(adv)

In [30]:
#previewing the number of rows in the dataset
nrow(adv)

In [21]:
#previewing the number of columns in the dataset
ncol(adv)

# Data Cleaning and Exploration

In [107]:
#checking the column names
colnames(adv)

In [108]:
#converting the column names into lower case and stripping the '.' to ensure uniformity

colnames(adv) = tolower(str_replace_all(colnames(adv), c('[.]' = '_')))


In [109]:
#checking the column names have been changed
colnames(adv)

In [110]:
# Exploring the variables in the dataset using the glimpse function
glimpse(adv)

Observations: 1,000
Variables: 10
$ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, ...
$ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49...
$ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73...
$ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 22...
$ ad_topic_line            <chr> "Cloned 5thgeneration orchestration", "Mon...
$ city                     <chr> "Wrightburgh", "West Jodi", "Davidton", "W...
$ male                     <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, ...
$ country                  <chr> "Tunisia", "Nauru", "San Marino", "Italy",...
$ timestamp                <chr> "2016-03-27 00:53:11", "2016-04-04 01:39:0...
$ clicked_on_ad            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, ...


In [111]:
#converting the columns in factor datatype to character datatype i.e column 5,6 and 8

cols_to_change = c(5, 6, 8)
for(i in cols_to_change){
   class(adv[, i]) = "character"
}

In [116]:
glimpse(adv)

Observations: 1,000
Variables: 11
$ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, ...
$ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49...
$ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73...
$ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 22...
$ ad_topic_line            <chr> "Cloned 5thgeneration orchestration", "Mon...
$ city                     <chr> "Wrightburgh", "West Jodi", "Davidton", "W...
$ male                     <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, ...
$ country                  <chr> "Tunisia", "Nauru", "San Marino", "Italy",...
$ timestamp                <chr> "2016-03-27 00:53:11", "2016-04-04 01:39:0...
$ clicked_on_ad            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, ...
$ Date                     <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...


In [113]:
head(adv,10)

daily_time_spent_on_site,age,area_income,daily_internet_usage,ad_topic_line,city,male,country,timestamp,clicked_on_ad
68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
59.99,23,59761.56,226.74,Sharable client-driven software,Jamieberg,1,Norway,2016-05-19 14:30:17,0
88.91,33,53852.85,208.36,Enhanced dedicated support,Brandonstad,0,Myanmar,2016-01-28 20:59:32,0
66.0,48,24593.33,131.76,Reactive local challenge,Port Jefferybury,1,Australia,2016-03-07 01:40:15,1
74.53,30,68862.0,221.51,Configurable coherent function,West Colin,1,Grenada,2016-04-18 09:33:42,0
69.88,20,55642.32,183.82,Mandatory homogeneous architecture,Ramirezton,1,Ghana,2016-07-11 01:42:51,0


In [118]:


# adv$time <- format(as.POSIXct(strptime(adv$timestamp,format="%Y:%m:%d %H:%M:%S"),"%H:%M:%S"))


date <- format(as.POSIXct(strptime(adv$timestamp,"%Y:%m:%d %H:%M:%S",tz="")) ,format = "%Y:%m:%d")

time <- format(as.POSIXct(strptime(adv$timestamp,"%Y:%m:%d %H:%M:%S",tz="")) ,format = "%H:%M:%S") 


adv$date <- date
    adv$time <- time


In [119]:
head(adv,10)

daily_time_spent_on_site,age,area_income,daily_internet_usage,ad_topic_line,city,male,country,timestamp,clicked_on_ad,Date,date,time
68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0,,,
80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0,,,
69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0,,,
74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0,,,
68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0,,,
59.99,23,59761.56,226.74,Sharable client-driven software,Jamieberg,1,Norway,2016-05-19 14:30:17,0,,,
88.91,33,53852.85,208.36,Enhanced dedicated support,Brandonstad,0,Myanmar,2016-01-28 20:59:32,0,,,
66.0,48,24593.33,131.76,Reactive local challenge,Port Jefferybury,1,Australia,2016-03-07 01:40:15,1,,,
74.53,30,68862.0,221.51,Configurable coherent function,West Colin,1,Grenada,2016-04-18 09:33:42,0,,,
69.88,20,55642.32,183.82,Mandatory homogeneous architecture,Ramirezton,1,Ghana,2016-07-11 01:42:51,0,,,


Observations: 1,000
Variables: 10
$ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, ...
$ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49...
$ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73...
$ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 22...
$ ad_topic_line            <chr> "92", "465", "567", "904", "767", "806", "...
$ city                     <chr> "962", "904", "112", "940", "806", "283", ...
$ male                     <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, ...
$ country                  <chr> "216", "148", "185", "104", "97", "159", "...
$ timestamp                <fct> 2016-03-27 00:53:11, 2016-04-04 01:39:02, ...
$ clicked_on_ad            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, ...
