Skip to content
This repository has been archived by the owner on Oct 13, 2023. It is now read-only.

Commit

Permalink
updated code from Carl
Browse files Browse the repository at this point in the history
  • Loading branch information
sdgilley committed Dec 2, 2016
1 parent a99e9a0 commit 0221e86
Show file tree
Hide file tree
Showing 17 changed files with 935 additions and 381 deletions.
Binary file modified Campaign Optimization Dashboard.pbix
Binary file not shown.
500 changes: 346 additions & 154 deletions R/Campaign Optimization R Notebook.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions R/SQL_connection.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
## and create a New Database with the name you wish to use.
##########################################################################################################################################

connection_string <- "Driver=SQL Server; Server=.; Database=Campaign; UID=rdemo; PWD=D@tascience"
connection_string <- "Driver=SQL Server;Server=localhost;Database=Campaign;UID=rdemo;PWD=D@tascience"
sql <- RxInSqlServer(connectionString = connection_string)
local <- RxLocalSeq()
local <- RxLocalSeq()
46 changes: 33 additions & 13 deletions R/step1_data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@ source("sql_connection.R")
# Set the Compute Context to Local, to load files in-memory.
rxSetComputeContext(local)

##########################################################################################################################################

## Function to get the top n rows of a table stored on SQL Server.
## You can execute this function at any time during your progress by removing the comment "#", and inputting:
## - the table name.
## - the number of rows you want to display.

##########################################################################################################################################

display_head <- function(table_name, n_rows){
  # Print the first n_rows rows of a SQL Server table.
  #
  # Args:
  #   table_name: name of the table, inserted verbatim into the FROM clause.
  #   n_rows:     number of rows to fetch; must be coercible to a single
  #               non-negative whole number.
  #
  # Uses the global `connection_string` (not a parameter of this function).

  # Coerce to integer and format with %d. With %s a numeric is rendered via
  # as.character(), so 100000 would become "1e+05" and produce invalid SQL.
  n_rows <- as.integer(n_rows)
  if (length(n_rows) != 1L || is.na(n_rows) || n_rows < 0L) {
    stop("n_rows must be a single non-negative whole number", call. = FALSE)
  }

  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
  table_sql <- RxSqlServerData(sqlQuery = query, connectionString = connection_string)

  # Local is named head_rows to avoid shadowing base::table.
  head_rows <- rxImport(table_sql)
  print(head_rows)
}

# table_name <- "insert_table_name"
# n_rows <- 10
# display_head(table_name, n_rows)


##########################################################################################################################################

## Read the 4 data sets from file, and upload them to SQL
Expand Down Expand Up @@ -65,8 +85,8 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Campaign_Pro
, sep=""))

rxExecuteSQLDDL(outOdbcDS, sSQLString = paste(
"SELECT Campaign_Detail.*, Term , No_of_people_covered,
Payment_frequency, Net_Amt_Insured, Amt_on_Maturity_Bin,
"SELECT Campaign_Detail.*, Term , No_Of_People_Covered,
Payment_Frequency, Net_Amt_Insured, Amt_On_Maturity_Bin,
Product, Premium
INTO Campaign_Product
FROM Campaign_Detail JOIN Product
Expand Down Expand Up @@ -97,9 +117,9 @@ Merged_sql <- RxSqlServerData(
CAST(No_Of_Dependents AS char(1)) AS No_Of_Dependents, Highest_Education, Ethnicity,
CAST(No_Of_Children AS char(1)) AS No_Of_Children, CAST(Household_Size AS char(1)) AS Household_Size, Gender,
Marital_Status, Channel, Time_Of_Day, Conversion_Flag, Market_Lead.Campaign_Id, Day_Of_Week, Comm_Id, Time_Stamp,
Product, Category, Term, CAST(No_of_people_covered AS char(1)) AS No_of_people_covered,
CAST(Premium AS varchar(4)) AS Premium, Payment_frequency,
Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action,
Product, Category, Term, CAST(No_Of_People_Covered AS char(1)) AS No_Of_People_Covered,
CAST(Premium AS varchar(4)) AS Premium, Payment_Frequency,
Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action,
Focused_Geography, Tenure_Of_Campaign, CAST(Net_Amt_Insured AS varchar(7)) AS Net_Amt_Insured , Product_Id
FROM Campaign_Product JOIN Market_Lead
ON Campaign_Product.Campaign_Id = Market_Lead.Campaign_Id "
Expand Down Expand Up @@ -132,8 +152,8 @@ for(n in var_number_with_NA ){
# Point again to the merged table without stringsAsFactors = TRUE and with correct variable types.
Merged_sql2 <- RxSqlServerData(
sqlQuery =
"SELECT Market_Lead.*, Product, Category, Term, No_of_people_covered, Premium, Payment_frequency,
Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action,
"SELECT Market_Lead.*, Product, Category, Term, No_Of_People_Covered, Premium, Payment_Frequency,
Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action,
Focused_Geography, Tenure_Of_Campaign, Net_Amt_Insured, Product_Id
FROM Campaign_Product JOIN Market_Lead
ON Campaign_Product.Campaign_Id = Market_Lead.Campaign_Id "
Expand All @@ -142,12 +162,12 @@ Merged_sql2 <- RxSqlServerData(
# Function to deal with NAs.
Mode_Replace <- function(data) {
data <- data.frame(data)
for(j in 1:length(var_with_NA)){
row_na <- which(is.na(data[,var_with_NA[j]]) == TRUE)
if (var_with_NA[j] %in% c("No_Of_Dependents", "No_Of_Children", "Household_Size", "No_of_people_covered", "Premium", "Net_Amt_Insured")){
data[row_na,var_with_NA[j]] <- as.integer(mode[j])
for(j in 1:length(var_with_NA_1)){
row_na <- which(is.na(data[,var_with_NA_1[j]]) == TRUE)
if (var_with_NA_1[j] %in% c("No_Of_Dependents", "No_Of_Children", "Household_Size", "No_Of_People_Covered", "Premium", "Net_Amt_Insured")){
data[row_na,var_with_NA_1[j]] <- as.integer(mode_1[j])
} else{
data[row_na,var_with_NA[j]] <- mode[j]
data[row_na,var_with_NA_1[j]] <- mode_1[j]
}
}
return(data)
Expand All @@ -156,7 +176,7 @@ Mode_Replace <- function(data) {
# Create the CM_AD0 table by dealing with NAs in Merged_sql and save it to a SQL table.
CM_AD0 <- RxSqlServerData(table = "CM_AD0", connectionString = connection_string)
rxDataStep(inData = Merged_sql2 , outFile = CM_AD0, overwrite = TRUE, transformFunc = Mode_Replace,
transformObjects = list(var_with_NA = var_with_NA, mode = mode))
transformObjects = list(var_with_NA_1 = var_with_NA, mode_1 = mode))

# Drop intermediate tables.
rxSqlServerDropTable(table = "Campaign_Product", connectionString = connection_string)
Expand Down
26 changes: 23 additions & 3 deletions R/step2_feature_engineering.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@ source("sql_connection.R")
rxSetComputeContext(sql)


##########################################################################################################################################

## Function to get the top n rows of a table stored on SQL Server.
## You can execute this function at any time during your progress by removing the comment "#", and inputting:
## - the table name.
## - the number of rows you want to display.

##########################################################################################################################################

display_head <- function(table_name, n_rows){
  # Print the first n_rows rows of a SQL Server table.
  #
  # Args:
  #   table_name: name of the table, inserted verbatim into the FROM clause.
  #   n_rows:     number of rows to fetch; must be coercible to a single
  #               non-negative whole number.
  #
  # Uses the global `connection_string` (not a parameter of this function).

  # Coerce to integer and format with %d. With %s a numeric is rendered via
  # as.character(), so 100000 would become "1e+05" and produce invalid SQL.
  n_rows <- as.integer(n_rows)
  if (length(n_rows) != 1L || is.na(n_rows) || n_rows < 0L) {
    stop("n_rows must be a single non-negative whole number", call. = FALSE)
  }

  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
  table_sql <- RxSqlServerData(sqlQuery = query, connectionString = connection_string)

  # Local is named head_rows to avoid shadowing base::table.
  head_rows <- rxImport(table_sql)
  print(head_rows)
}

# table_name <- "insert_table_name"
# n_rows <- 10
# display_head(table_name, n_rows)


##########################################################################################################################################

## Input: Point to the SQL table with the cleaned raw data set
Expand Down Expand Up @@ -118,10 +138,10 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("CREATE TABLE CM_AD
,Product varchar(50)
,Category varchar(15)
,Term char(2)
,No_of_people_covered int
,No_Of_People_Covered int
,Premium int
,Payment_frequency varchar(50)
,Amt_on_Maturity_Bin varchar(50)
,Payment_Frequency varchar(50)
,Amt_On_Maturity_Bin varchar(50)
,Sub_Category varchar(15)
,Campaign_Drivers varchar(50)
,Campaign_Name varchar(50)
Expand Down
64 changes: 45 additions & 19 deletions R/step3_training_evaluation.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,26 @@ source("sql_connection.R")
rxSetComputeContext(local)


##########################################################################################################################################

## Function to get the top n rows of a table stored on SQL Server.
## You can execute this function at any time during your progress by removing the comment "#", and inputting:
## - the table name.
## - the number of rows you want to display.

##########################################################################################################################################

display_head <- function(table_name, n_rows){
  # Print the first n_rows rows of a SQL Server table.
  #
  # Args:
  #   table_name: name of the table, inserted verbatim into the FROM clause.
  #   n_rows:     number of rows to fetch; must be coercible to a single
  #               non-negative whole number.
  #
  # Uses the global `connection_string` (not a parameter of this function).

  # Coerce to integer and format with %d. With %s a numeric is rendered via
  # as.character(), so 100000 would become "1e+05" and produce invalid SQL.
  n_rows <- as.integer(n_rows)
  if (length(n_rows) != 1L || is.na(n_rows) || n_rows < 0L) {
    stop("n_rows must be a single non-negative whole number", call. = FALSE)
  }

  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
  table_sql <- RxSqlServerData(sqlQuery = query, connectionString = connection_string)

  # Local is named head_rows to avoid shadowing base::table.
  head_rows <- rxImport(table_sql)
  print(head_rows)
}

# table_name <- "insert_table_name"
# n_rows <- 10
# display_head(table_name, n_rows)


##########################################################################################################################################

## Input: Point to the SQL table with the data set for modeling
Expand All @@ -48,31 +68,37 @@ column_info <- rxCreateColInfo(CM_AD)

##########################################################################################################################################

# Write a splitting function to randomly split the data into a training set and a testing set, with proportion p.
# It creates a random variable, Split_Vector. For each unique Lead_Id, it is equal to 1 with proportion p, and 0 otherwise.
# When Split_Vector = 1, the observation goes to the training set. When Split_Vector = 0, it goes to the testing set.
# Randomly split the data into a training set and a testing set, with a splitting % p.
# p % goes to the training set, and the rest goes to the testing set. Default is 70%.

# Writes the SQL table CM_AD1 from CM_AD, adding a 0/1 column Split_Vector.
#   p: success probability passed to rbinom() for Split_Vector (default 0.70).
# Reads CM_AD and connection_string from the enclosing environment; they are
# not parameters of this function.
Splitting <- function(p = 0.70){
# Destination data source: table CM_AD1, overwritten on each call (overwrite = TRUE below).
CM_AD1 <- RxSqlServerData(table = "CM_AD1", stringsAsFactors = T, connectionString = connection_string)
# Split_Vector is drawn with rbinom(.rxNumRows, 1, p); p is handed to the
# transform through transformObjects.
# NOTE(review): .rxNumRows is presumably the row count of the chunk being
# processed, giving one independent draw per row -- confirm against the
# RevoScaleR rxDataStep documentation.
rxDataStep(inData = CM_AD, outFile = CM_AD1, overwrite = TRUE,transforms = list(
Split_Vector = rbinom(.rxNumRows, 1, p)),
transformObjects = list(p = p))
}
p <- "70"

## Open a connection with SQL Server to be able to write queries with the rxExecuteSQLDDL function.
outOdbcDS <- RxOdbcData(table = "NewData", connectionString = connection_string, useFastRead=TRUE)
rxOpen(outOdbcDS, "w")

## Create the Train_Id table containing Lead_Id of training set.
rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Train_Id;", sep=""))

Splitting(p = 0.7)
rxExecuteSQLDDL(outOdbcDS, sSQLString = sprintf(
"SELECT Lead_Id
INTO Train_Id
FROM CM_AD
WHERE ABS(CAST(BINARY_CHECKSUM(Lead_ID, NEWID()) as int)) %s < %s ;"
,"% 100", p ))

# Point to the training set. It will be created on the fly when training models.
## Point to the training set. It will be created on the fly when training models.
CM_AD_Train <- RxSqlServerData(
sqlQuery = "SELECT *
FROM CM_AD1
WHERE Split_Vector = 1",
FROM CM_AD
WHERE Lead_Id IN (SELECT Lead_Id from Train_Id)",
connectionString = connection_string, colInfo = column_info)

# Point to the testing set. It will be created on the fly when testing models.
## Point to the testing set. It will be created on the fly when testing models.
CM_AD_Test <- RxSqlServerData(
sqlQuery = "SELECT *
FROM CM_AD1
WHERE Split_Vector = 0",
FROM CM_AD
WHERE Lead_Id NOT IN (SELECT Lead_Id from Train_Id)",
connectionString = connection_string, colInfo = column_info)


Expand All @@ -85,7 +111,7 @@ CM_AD_Test <- RxSqlServerData(
# Write the formula after removing variables not used in the modeling.
variables_all <- rxGetVarNames(CM_AD_Train)
variables_to_remove <- c("Lead_Id", "Phone_No", "Country", "Comm_Id", "Time_Stamp", "Category", "Launch_Date", "Focused_Geography",
"Split_Vector", "Call_For_Action", "Product", "Campaign_Name")
"Call_For_Action", "Product", "Campaign_Name")
traning_variables <- variables_all[!(variables_all %in% c("Conversion_Flag", variables_to_remove))]
formula <- as.formula(paste("Conversion_Flag ~", paste(traning_variables, collapse = "+")))

Expand Down Expand Up @@ -200,7 +226,7 @@ evaluate_model <- function(observed, predicted_probability, threshold, model_nam
##########################################################################################################################################

# Make Predictions, then import them into R. The observed Conversion_Flag is kept through the argument extraVarsToWrite.
Prediction_Table_RF <- RxSqlServerData(table = "Prediction_Table_RF", stringsAsFactors = T, connectionString = connection_string)
Prediction_Table_RF <- RxSqlServerData(table = "Forest_Prediction", stringsAsFactors = T, connectionString = connection_string)
rxPredict(forest_model, data = CM_AD_Test, outData = Prediction_Table_RF, overwrite = T, type = "prob",
extraVarsToWrite = c("Conversion_Flag"))

Expand All @@ -222,7 +248,7 @@ Metrics_RF <- evaluate_model(observed = observed, predicted_probability = Predic
##########################################################################################################################################

# Make Predictions, then import them into R. The observed Conversion_Flag is kept through the argument extraVarsToWrite.
Prediction_Table_GBT <- RxSqlServerData(table = "Prediction_Table_GBT", stringsAsFactors = T, connectionString = connection_string)
Prediction_Table_GBT <- RxSqlServerData(table = "Boosted_Prediction", stringsAsFactors = T, connectionString = connection_string)
rxPredict(btree_model,data = CM_AD_Test, outData = Prediction_Table_GBT, overwrite = T, type="prob",
extraVarsToWrite = c("Conversion_Flag"))

Expand Down
58 changes: 41 additions & 17 deletions R/step4_campaign_recommendations.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,26 @@ source("sql_connection.R")
rxSetComputeContext(local)


##########################################################################################################################################

## Function to get the top n rows of a table stored on SQL Server.
## You can execute this function at any time during your progress by removing the comment "#", and inputting:
## - the table name.
## - the number of rows you want to display.

##########################################################################################################################################

display_head <- function(table_name, n_rows){
  # Print the first n_rows rows of a SQL Server table.
  #
  # Args:
  #   table_name: name of the table, inserted verbatim into the FROM clause.
  #   n_rows:     number of rows to fetch; must be coercible to a single
  #               non-negative whole number.
  #
  # Uses the global `connection_string` (not a parameter of this function).

  # Coerce to integer and format with %d. With %s a numeric is rendered via
  # as.character(), so 100000 would become "1e+05" and produce invalid SQL.
  n_rows <- as.integer(n_rows)
  if (length(n_rows) != 1L || is.na(n_rows) || n_rows < 0L) {
    stop("n_rows must be a single non-negative whole number", call. = FALSE)
  }

  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
  table_sql <- RxSqlServerData(sqlQuery = query, connectionString = connection_string)

  # Local is named head_rows to avoid shadowing base::table.
  head_rows <- rxImport(table_sql)
  print(head_rows)
}

# table_name <- "insert_table_name"
# n_rows <- 10
# display_head(table_name, n_rows)


##########################################################################################################################################

## Input: - Point to the SQL table with the whole data set
Expand Down Expand Up @@ -57,6 +77,7 @@ if(best == "GBT"){

## Create a full data table with all the unique combinations of Day_of_Week, Channel, Time_Of_Day


##########################################################################################################################################

# Create a table with all the unique combinations of Day_of_Week, Channel, Time_Of_Day.
Expand All @@ -67,24 +88,27 @@ Unique_Combos <- merge(merge(Day_of_Week_unique, Channel_unique), Time_Of_Day_un
colnames(Unique_Combos) <- c("Day_Of_Week", "Channel", "Time_Of_Day")

# Export it to SQL
Unique_Combos_sql <- RxSqlServerData(table = "Unique_Combos_sql", connectionString = connection_string)
Unique_Combos_sql <- RxSqlServerData(table = "Unique_Combos", connectionString = connection_string)
rxDataStep(inData = Unique_Combos, outFile = Unique_Combos_sql, overwrite = T)


# We create a table that has, for each Lead_Id and its corresponding variables (except Day_of_Week, Channel, Time_Of_Day),
# One row for each possible combination of Day_of_Week, Channel and Time_Of_Day.
# This is a pointer. The table will be created on the fly while scoring.
# This is a pointer. The table will be created on the fly while scoring.

# For a faster implementation, we are selecting only the top 10K customers.
# For a full solution, you can remove TOP(10000) from the query below.

AD_full_merged_sql <- RxSqlServerData(
sqlQuery = "SELECT *
FROM (
SELECT Lead_Id, Age, Annual_Income_Bucket, Credit_Score, State, No_Of_Dependents, Highest_Education, Ethnicity,
No_Of_Children, Household_Size, Gender, Marital_Status, Campaign_Id, Product_Id, Term,
No_of_people_covered, Premium, Payment_frequency, Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers,
Tenure_Of_Campaign, Net_Amt_Insured, SMS_Count, Email_Count, Call_Count,
Previous_Channel, Conversion_Flag
SELECT TOP(10000) Lead_Id, Age, Annual_Income_Bucket, Credit_Score, State, No_Of_Dependents, Highest_Education,
Ethnicity, No_Of_Children, Household_Size, Gender, Marital_Status, Campaign_Id, Product_Id, Term,
No_Of_People_Covered, Premium, Payment_Frequency, Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers,
Tenure_Of_Campaign, Net_Amt_Insured, SMS_Count, Email_Count, Call_Count,
Previous_Channel, Conversion_Flag
FROM CM_AD) a,
(SELECT * FROM Unique_Combos_sql) b",
(SELECT * FROM Unique_Combos) b",
stringsAsFactors = T, connectionString = connection_string, colInfo = column_info)


Expand Down Expand Up @@ -115,16 +139,16 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Recommended_
, sep=""))

rxExecuteSQLDDL(outOdbcDS, sSQLString = paste(
"SELECT Lead_Id, Day_of_Week, Channel, Time_Of_Day, MaxProb
"SELECT Lead_Id, Day_of_Week, Channel, Time_Of_Day, Max_Prob
INTO Recommended_Combinations
FROM (
SELECT maxp.Lead_Id, Day_of_Week, Channel, Time_Of_Day, MaxProb,
SELECT maxp.Lead_Id, Day_of_Week, Channel, Time_Of_Day, Max_Prob,
ROW_NUMBER() OVER (partition by maxp.Lead_Id ORDER BY NEWID()) as RowNo
FROM ( SELECT Lead_Id, max([1_prob]) as MaxProb
FROM ( SELECT Lead_Id, max([1_prob]) as Max_Prob
FROM Prob_Id
GROUP BY Lead_Id) maxp
JOIN Prob_Id
ON (maxp.Lead_Id = Prob_Id.Lead_Id AND maxp.MaxProb = Prob_Id.[1_prob])
ON (maxp.Lead_Id = Prob_Id.Lead_Id AND maxp.Max_Prob = Prob_Id.[1_prob])
) candidates
WHERE RowNo = 1;"
, sep=""))
Expand All @@ -140,11 +164,11 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Recommendati
, sep=""))

rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("
SELECT Age, Annual_Income_Bucket, Credit_Score, Product, Campaign_Name as [Campaign Name], State,
Conversion_Flag as Converts, CM_AD.Day_Of_Week as [Day of Week], CM_AD.Time_Of_Day as [Time of Day],
CM_AD.Channel, CM_AD.Lead_Id as [Lead ID], Recommended_Combinations.Day_Of_Week as [Recommended Day],
Recommended_Combinations.Time_Of_Day as [Recommended Time], Recommended_Combinations.MaxProb,
Recommended_Combinations.Channel as [Recommended Channel]
SELECT Age, Annual_Income_Bucket, Credit_Score, Product, Campaign_Name, State,
Conversion_Flag, CM_AD.Day_Of_Week, CM_AD.Time_Of_Day,
CM_AD.Channel, CM_AD.Lead_Id, Recommended_Combinations.Day_Of_Week as [Recommended_Day],
Recommended_Combinations.Time_Of_Day as [Recommended_Time], Recommended_Combinations.Max_Prob,
Recommended_Combinations.Channel as [Recommended_Channel]
INTO Recommendations
FROM CM_AD JOIN Recommended_Combinations
ON CM_AD.Lead_Id = Recommended_Combinations.Lead_Id;"
Expand Down

0 comments on commit 0221e86

Please sign in to comment.