updated code from Carl

microsoft · Dec 2, 2016 · 0221e86 · 0221e86
1 parent a99e9a0
commit 0221e86
Show file tree

Hide file tree

Showing 17 changed files with 935 additions and 381 deletions.
diff --git a/Campaign Optimization Dashboard.pbix b/Campaign Optimization Dashboard.pbix
diff --git a/R/Campaign Optimization R Notebook.ipynb b/R/Campaign Optimization R Notebook.ipynb
diff --git a/R/SQL_connection.R b/R/SQL_connection.R
@@ -9,6 +9,6 @@
 ##       and create a New Database with the name you wish to use.
 ##########################################################################################################################################
 
-connection_string <- "Driver=SQL Server; Server=.; Database=Campaign; UID=rdemo; PWD=D@tascience"
+connection_string <- "Driver=SQL Server;Server=localhost;Database=Campaign;UID=rdemo;PWD=D@tascience"
 sql <- RxInSqlServer(connectionString = connection_string)
-local <- RxLocalSeq()
+local <- RxLocalSeq()
diff --git a/R/step1_data_processing.R b/R/step1_data_processing.R
@@ -22,6 +22,26 @@ source("sql_connection.R")
 # Set the Compute Context to Local, to load files in-memory.
 rxSetComputeContext(local)
 
+##########################################################################################################################################
+
+## Function to get the top n rows of a table stored on SQL Server.
+## You can run it at any point while working through this script by uncommenting the example call below the function and setting:
+##  - the table name.
+##  - the number of rows you want to display.
+
+##########################################################################################################################################
+
+## Print the first n_rows rows of a SQL Server table.
+##  - table_name: name of the table, as a single string. It is inserted into the
+##    query text as-is, so only pass table names you trust.
+##  - n_rows: number of rows to display; a single non-negative whole number
+##    (a numeric string such as "10" is converted).
+## The rows are read with rxImport through connection_string and printed.
+display_head <- function(table_name, n_rows){
+  stopifnot(is.character(table_name), length(table_name) == 1L, length(n_rows) == 1L)
+  n_rows <- suppressWarnings(as.integer(n_rows))
+  if (is.na(n_rows) || n_rows < 0L) {
+    stop("n_rows must be a non-negative whole number.", call. = FALSE)
+  }
+  # Format with %d: %s would render 100000 as "1e+05" and would paste any
+  # non-numeric n_rows straight into the SQL text.
+  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
+  top_rows <- rxImport(RxSqlServerData(sqlQuery = query, connectionString = connection_string))
+  print(top_rows)
+}
+
+# table_name <- "insert_table_name"
+# n_rows <- 10
+# display_head(table_name, n_rows)
+
+
 ##########################################################################################################################################
 
 ## Read the 4 data sets from file, and upload them to SQL
@@ -65,8 +85,8 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Campaign_Pro
 , sep=""))
 
 rxExecuteSQLDDL(outOdbcDS, sSQLString = paste(
-  "SELECT Campaign_Detail.*, Term , No_of_people_covered, 
-  Payment_frequency, Net_Amt_Insured, Amt_on_Maturity_Bin,
+  "SELECT Campaign_Detail.*, Term , No_Of_People_Covered, 
+  Payment_Frequency, Net_Amt_Insured, Amt_On_Maturity_Bin,
   Product, Premium
   INTO Campaign_Product
   FROM Campaign_Detail JOIN Product
@@ -97,9 +117,9 @@ Merged_sql <- RxSqlServerData(
         CAST(No_Of_Dependents AS char(1)) AS No_Of_Dependents, Highest_Education, Ethnicity,
         CAST(No_Of_Children AS char(1)) AS No_Of_Children, CAST(Household_Size AS char(1)) AS Household_Size, Gender, 
         Marital_Status, Channel, Time_Of_Day, Conversion_Flag, Market_Lead.Campaign_Id, Day_Of_Week, Comm_Id, Time_Stamp,
-        Product, Category, Term, CAST(No_of_people_covered AS char(1)) AS No_of_people_covered,
-        CAST(Premium AS varchar(4)) AS Premium, Payment_frequency,
-        Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action, 
+        Product, Category, Term, CAST(No_Of_People_Covered AS char(1)) AS No_Of_People_Covered,
+        CAST(Premium AS varchar(4)) AS Premium, Payment_Frequency,
+        Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action, 
         Focused_Geography, Tenure_Of_Campaign, CAST(Net_Amt_Insured AS varchar(7)) AS Net_Amt_Insured , Product_Id
  FROM Campaign_Product JOIN Market_Lead 
  ON Campaign_Product.Campaign_Id = Market_Lead.Campaign_Id "
@@ -132,8 +152,8 @@ for(n in var_number_with_NA ){
 # Point again to the merged table without stringsAsFactors = TRUE and with correct variable types. 
 Merged_sql2 <- RxSqlServerData(  
   sqlQuery = 
-"SELECT Market_Lead.*, Product, Category, Term, No_of_people_covered, Premium, Payment_frequency,
-        Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action, 
+"SELECT Market_Lead.*, Product, Category, Term, No_Of_People_Covered, Premium, Payment_Frequency,
+        Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers, Campaign_Name, Launch_Date, Call_For_Action, 
         Focused_Geography, Tenure_Of_Campaign, Net_Amt_Insured, Product_Id
  FROM Campaign_Product JOIN Market_Lead
  ON Campaign_Product.Campaign_Id = Market_Lead.Campaign_Id "
@@ -142,12 +162,12 @@ Merged_sql2 <- RxSqlServerData(
 # Function to deal with NAs. 
 Mode_Replace <- function(data) {
   data <- data.frame(data)
-  for(j in 1:length(var_with_NA)){
-    row_na <- which(is.na(data[,var_with_NA[j]]) == TRUE) 
-        if (var_with_NA[j] %in% c("No_Of_Dependents", "No_Of_Children", "Household_Size", "No_of_people_covered", "Premium", "Net_Amt_Insured")){
-          data[row_na,var_with_NA[j]] <- as.integer(mode[j])
+  for(j in 1:length(var_with_NA_1)){
+    row_na <- which(is.na(data[,var_with_NA_1[j]]) == TRUE) 
+        if (var_with_NA_1[j] %in% c("No_Of_Dependents", "No_Of_Children", "Household_Size", "No_Of_People_Covered", "Premium", "Net_Amt_Insured")){
+          data[row_na,var_with_NA_1[j]] <- as.integer(mode_1[j])
         } else{
-          data[row_na,var_with_NA[j]] <- mode[j]
+          data[row_na,var_with_NA_1[j]] <- mode_1[j]
         }
   }
   return(data)
@@ -156,7 +176,7 @@ Mode_Replace <- function(data) {
 # Create the CM_AD0 table by dealing with NAs in Merged_sql and save it to a SQL table.
 CM_AD0 <- RxSqlServerData(table = "CM_AD0", connectionString = connection_string)
 rxDataStep(inData = Merged_sql2 , outFile = CM_AD0, overwrite = TRUE, transformFunc = Mode_Replace, 
-           transformObjects = list(var_with_NA = var_with_NA, mode = mode))
+           transformObjects = list(var_with_NA_1 = var_with_NA, mode_1 = mode))
 
 # Drop intermediate tables.
 rxSqlServerDropTable(table = "Campaign_Product", connectionString = connection_string)

diff --git a/R/step2_feature_engineering.R b/R/step2_feature_engineering.R
@@ -23,6 +23,26 @@ source("sql_connection.R")
 rxSetComputeContext(sql)
 
 
+##########################################################################################################################################
+
+## Function to get the top n rows of a table stored on SQL Server.
+## You can run it at any point while working through this script by uncommenting the example call below the function and setting:
+##  - the table name.
+##  - the number of rows you want to display.
+
+##########################################################################################################################################
+
+## Print the first n_rows rows of a SQL Server table.
+##  - table_name: name of the table, as a single string. It is inserted into the
+##    query text as-is, so only pass table names you trust.
+##  - n_rows: number of rows to display; a single non-negative whole number
+##    (a numeric string such as "10" is converted).
+## The rows are read with rxImport through connection_string and printed.
+display_head <- function(table_name, n_rows){
+  stopifnot(is.character(table_name), length(table_name) == 1L, length(n_rows) == 1L)
+  n_rows <- suppressWarnings(as.integer(n_rows))
+  if (is.na(n_rows) || n_rows < 0L) {
+    stop("n_rows must be a non-negative whole number.", call. = FALSE)
+  }
+  # Format with %d: %s would render 100000 as "1e+05" and would paste any
+  # non-numeric n_rows straight into the SQL text.
+  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
+  top_rows <- rxImport(RxSqlServerData(sqlQuery = query, connectionString = connection_string))
+  print(top_rows)
+}
+
+# table_name <- "insert_table_name"
+# n_rows <- 10
+# display_head(table_name, n_rows)
+
+
 ##########################################################################################################################################
 
 ## Input: Point to the SQL table with the cleaned raw data set
@@ -118,10 +138,10 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("CREATE TABLE CM_AD
   ,Product varchar(50)
   ,Category varchar(15)
   ,Term char(2)
-  ,No_of_people_covered int
+  ,No_Of_People_Covered int
   ,Premium int 
-  ,Payment_frequency varchar(50)
-  ,Amt_on_Maturity_Bin varchar(50)
+  ,Payment_Frequency varchar(50)
+  ,Amt_On_Maturity_Bin varchar(50)
   ,Sub_Category varchar(15)
   ,Campaign_Drivers varchar(50)
   ,Campaign_Name varchar(50)

diff --git a/R/step3_training_evaluation.R b/R/step3_training_evaluation.R
@@ -24,6 +24,26 @@ source("sql_connection.R")
 rxSetComputeContext(local)
 
 
+##########################################################################################################################################
+
+## Function to get the top n rows of a table stored on SQL Server.
+## You can run it at any point while working through this script by uncommenting the example call below the function and setting:
+##  - the table name.
+##  - the number of rows you want to display.
+
+##########################################################################################################################################
+
+## Print the first n_rows rows of a SQL Server table.
+##  - table_name: name of the table, as a single string. It is inserted into the
+##    query text as-is, so only pass table names you trust.
+##  - n_rows: number of rows to display; a single non-negative whole number
+##    (a numeric string such as "10" is converted).
+## The rows are read with rxImport through connection_string and printed.
+display_head <- function(table_name, n_rows){
+  stopifnot(is.character(table_name), length(table_name) == 1L, length(n_rows) == 1L)
+  n_rows <- suppressWarnings(as.integer(n_rows))
+  if (is.na(n_rows) || n_rows < 0L) {
+    stop("n_rows must be a non-negative whole number.", call. = FALSE)
+  }
+  # Format with %d: %s would render 100000 as "1e+05" and would paste any
+  # non-numeric n_rows straight into the SQL text.
+  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
+  top_rows <- rxImport(RxSqlServerData(sqlQuery = query, connectionString = connection_string))
+  print(top_rows)
+}
+
+# table_name <- "insert_table_name"
+# n_rows <- 10
+# display_head(table_name, n_rows)
+
+
 ##########################################################################################################################################
 
 ## Input: Point to the SQL table with the data set for modeling
@@ -48,31 +68,37 @@ column_info <- rxCreateColInfo(CM_AD)
 
 ##########################################################################################################################################
 
-# Write a splitting function to randomly split the data into a training set and a testing set, with proportion p.
-# It creates a random variable, Split_Vector. For each unique Lead_Id, it is equal to 1 with proportion p, and 0 otherwise. 
-# When p = 1, the observtion goes to the training set. When p = 0, it goes to the testing set.
+# Randomly split the data into a training set and a testing set using a split percentage p.
+# Roughly p% of the rows go to the training set and the rest go to the testing set. The value set below is 70.
 
-Splitting <- function(p = 0.70){
-  CM_AD1 <- RxSqlServerData(table = "CM_AD1", stringsAsFactors = T, connectionString = connection_string)
-  rxDataStep(inData = CM_AD, outFile = CM_AD1, overwrite = TRUE,transforms = list(
-    Split_Vector = rbinom(.rxNumRows, 1, p)),
-    transformObjects = list(p = p))
-} 
+p <- "70" 
+
+## Open a connection with SQL Server to be able to write queries with the rxExecuteSQLDDL function.
+outOdbcDS <- RxOdbcData(table = "NewData", connectionString = connection_string, useFastRead=TRUE)
+rxOpen(outOdbcDS, "w")
+
+## Create the Train_Id table containing Lead_Id of training set. 
+rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Train_Id;", sep=""))
 
-Splitting(p = 0.7)
+## Fill Train_Id with the Lead_Id of every CM_AD row whose randomized checksum,
+## taken modulo 100, is below p. In the sprintf template "%%" is a literal percent
+## sign (the T-SQL modulo operator), so p is the only value substituted.
+rxExecuteSQLDDL(outOdbcDS, sSQLString = sprintf(
+  "SELECT Lead_Id
+   INTO Train_Id
+   FROM CM_AD
+   WHERE ABS(CAST(BINARY_CHECKSUM(Lead_ID, NEWID()) as int)) %% 100 < %s ;"
+  , p))
 
-# Point to the training set. It will be created on the fly when training models. 
+## Point to the training set. It will be created on the fly when training models. 
 CM_AD_Train <- RxSqlServerData(  
   sqlQuery = "SELECT *   
-              FROM CM_AD1 
-              WHERE Split_Vector = 1",
+              FROM CM_AD 
+              WHERE Lead_Id IN (SELECT Lead_Id from Train_Id)",
   connectionString = connection_string, colInfo = column_info)
 
-# Point to the testing set. It will be created on the fly when testing models. 
+## Point to the testing set. It will be created on the fly when testing models. 
 CM_AD_Test <- RxSqlServerData(  
   sqlQuery = "SELECT *   
-              FROM CM_AD1 
-              WHERE Split_Vector = 0",
+              FROM CM_AD 
+              WHERE Lead_Id NOT IN (SELECT Lead_Id from Train_Id)",
   connectionString = connection_string, colInfo = column_info)
 
 
@@ -85,7 +111,7 @@ CM_AD_Test <- RxSqlServerData(
 # Write the formula after removing variables not used in the modeling.
 variables_all <- rxGetVarNames(CM_AD_Train)
 variables_to_remove <- c("Lead_Id", "Phone_No", "Country", "Comm_Id", "Time_Stamp", "Category", "Launch_Date", "Focused_Geography",
-                         "Split_Vector", "Call_For_Action", "Product", "Campaign_Name")
+                         "Call_For_Action", "Product", "Campaign_Name")
 traning_variables <- variables_all[!(variables_all %in% c("Conversion_Flag", variables_to_remove))]
 formula <- as.formula(paste("Conversion_Flag ~", paste(traning_variables, collapse = "+")))
 
@@ -200,7 +226,7 @@ evaluate_model <- function(observed, predicted_probability, threshold, model_nam
 ##########################################################################################################################################
 
 # Make Predictions, then import them into R. The observed Conversion_Flag is kept through the argument extraVarsToWrite.
-Prediction_Table_RF <- RxSqlServerData(table = "Prediction_Table_RF", stringsAsFactors = T, connectionString = connection_string)
+Prediction_Table_RF <- RxSqlServerData(table = "Forest_Prediction", stringsAsFactors = T, connectionString = connection_string)
 rxPredict(forest_model, data = CM_AD_Test, outData = Prediction_Table_RF, overwrite = T, type = "prob",
           extraVarsToWrite = c("Conversion_Flag"))
 
@@ -222,7 +248,7 @@ Metrics_RF <- evaluate_model(observed = observed, predicted_probability = Predic
 ##########################################################################################################################################
 
 # Make Predictions, then import them into R. The observed Conversion_Flag is kept through the argument extraVarsToWrite.
-Prediction_Table_GBT <- RxSqlServerData(table = "Prediction_Table_GBT", stringsAsFactors = T, connectionString = connection_string)
+Prediction_Table_GBT <- RxSqlServerData(table = "Boosted_Prediction", stringsAsFactors = T, connectionString = connection_string)
 rxPredict(btree_model,data = CM_AD_Test, outData = Prediction_Table_GBT, overwrite = T, type="prob",
           extraVarsToWrite = c("Conversion_Flag"))
 

diff --git a/R/step4_campaign_recommendations.R b/R/step4_campaign_recommendations.R
@@ -27,6 +27,26 @@ source("sql_connection.R")
 rxSetComputeContext(local)
 
 
+##########################################################################################################################################
+
+## Function to get the top n rows of a table stored on SQL Server.
+## You can run it at any point while working through this script by uncommenting the example call below the function and setting:
+##  - the table name.
+##  - the number of rows you want to display.
+
+##########################################################################################################################################
+
+## Print the first n_rows rows of a SQL Server table.
+##  - table_name: name of the table, as a single string. It is inserted into the
+##    query text as-is, so only pass table names you trust.
+##  - n_rows: number of rows to display; a single non-negative whole number
+##    (a numeric string such as "10" is converted).
+## The rows are read with rxImport through connection_string and printed.
+display_head <- function(table_name, n_rows){
+  stopifnot(is.character(table_name), length(table_name) == 1L, length(n_rows) == 1L)
+  n_rows <- suppressWarnings(as.integer(n_rows))
+  if (is.na(n_rows) || n_rows < 0L) {
+    stop("n_rows must be a non-negative whole number.", call. = FALSE)
+  }
+  # Format with %d: %s would render 100000 as "1e+05" and would paste any
+  # non-numeric n_rows straight into the SQL text.
+  query <- sprintf("SELECT TOP(%d) * FROM %s", n_rows, table_name)
+  top_rows <- rxImport(RxSqlServerData(sqlQuery = query, connectionString = connection_string))
+  print(top_rows)
+}
+
+# table_name <- "insert_table_name"
+# n_rows <- 10
+# display_head(table_name, n_rows)
+
+
 ##########################################################################################################################################
 
 ## Input: - Point to the SQL table with the whole data set 
@@ -57,6 +77,7 @@ if(best == "GBT"){
 
 ## Create a full data table with all the unique combinations of Day_of_Week, Channel, Time_Of_Day 
 
+
 ##########################################################################################################################################
 
 # Create a table with all the unique combinations of Day_of_Week, Channel, Time_Of_Day.
@@ -67,24 +88,27 @@ Unique_Combos <- merge(merge(Day_of_Week_unique, Channel_unique), Time_Of_Day_un
 colnames(Unique_Combos) <- c("Day_Of_Week", "Channel", "Time_Of_Day")
 
 # Export it to SQL
-Unique_Combos_sql <- RxSqlServerData(table = "Unique_Combos_sql", connectionString = connection_string)
+Unique_Combos_sql <- RxSqlServerData(table = "Unique_Combos", connectionString = connection_string)
 rxDataStep(inData = Unique_Combos, outFile = Unique_Combos_sql, overwrite = T)
 
 
 # We create a table that has, for each Lead_Id and its corresponding variables (except Day_of_Week, Channel, Time_Of_Day),
 # One row for each possible combination of Day_of_Week, Channel and Time_Of_Day.
-# This is a pointer. The table will be created on the fly while scoring. 
+# This is a pointer. The table will be created on the fly while scoring.
+
+# To keep scoring fast, the query below only takes 10,000 leads; because TOP(10000) is used without ORDER BY, which 10,000 rows are returned is not fixed.
+# For a full solution, remove TOP(10000) from the query below.
 
 AD_full_merged_sql <- RxSqlServerData(
   sqlQuery = "SELECT * 
               FROM (
-                    SELECT Lead_Id, Age, Annual_Income_Bucket, Credit_Score, State, No_Of_Dependents, Highest_Education, Ethnicity,
-                    No_Of_Children, Household_Size, Gender, Marital_Status, Campaign_Id, Product_Id, Term,
-                    No_of_people_covered, Premium, Payment_frequency, Amt_on_Maturity_Bin, Sub_Category, Campaign_Drivers,
-                    Tenure_Of_Campaign, Net_Amt_Insured, SMS_Count, Email_Count,  Call_Count, 
-                    Previous_Channel, Conversion_Flag
+                    SELECT TOP(10000) Lead_Id, Age, Annual_Income_Bucket, Credit_Score, State, No_Of_Dependents, Highest_Education,
+                           Ethnicity, No_Of_Children, Household_Size, Gender, Marital_Status, Campaign_Id, Product_Id, Term,
+                           No_Of_People_Covered, Premium, Payment_Frequency, Amt_On_Maturity_Bin, Sub_Category, Campaign_Drivers,
+                           Tenure_Of_Campaign, Net_Amt_Insured, SMS_Count, Email_Count,  Call_Count, 
+                           Previous_Channel, Conversion_Flag
                     FROM CM_AD) a,
-                    (SELECT * FROM Unique_Combos_sql) b", 
+                    (SELECT * FROM Unique_Combos) b", 
   stringsAsFactors = T, connectionString = connection_string, colInfo = column_info)
 
 
@@ -115,16 +139,16 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Recommended_
 , sep=""))
 
 rxExecuteSQLDDL(outOdbcDS, sSQLString = paste(
-"SELECT Lead_Id, Day_of_Week, Channel, Time_Of_Day, MaxProb
+"SELECT Lead_Id, Day_of_Week, Channel, Time_Of_Day, Max_Prob
  INTO Recommended_Combinations
  FROM (
-       SELECT maxp.Lead_Id, Day_of_Week, Channel, Time_Of_Day, MaxProb, 
+       SELECT maxp.Lead_Id, Day_of_Week, Channel, Time_Of_Day, Max_Prob, 
               ROW_NUMBER() OVER (partition by maxp.Lead_Id ORDER BY NEWID()) as RowNo
-       FROM ( SELECT Lead_Id, max([1_prob]) as MaxProb
+       FROM ( SELECT Lead_Id, max([1_prob]) as Max_Prob
               FROM Prob_Id
               GROUP BY Lead_Id) maxp
        JOIN Prob_Id 
-       ON (maxp.Lead_Id = Prob_Id.Lead_Id AND maxp.MaxProb = Prob_Id.[1_prob])
+       ON (maxp.Lead_Id = Prob_Id.Lead_Id AND maxp.Max_Prob = Prob_Id.[1_prob])
   ) candidates
   WHERE RowNo = 1;"
 , sep=""))
@@ -140,11 +164,11 @@ rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("DROP TABLE if exists Recommendati
 , sep=""))
 
 rxExecuteSQLDDL(outOdbcDS, sSQLString = paste("
-SELECT Age, Annual_Income_Bucket, Credit_Score, Product, Campaign_Name as [Campaign Name], State,  
-       Conversion_Flag as Converts, CM_AD.Day_Of_Week as [Day of Week], CM_AD.Time_Of_Day as [Time of Day],
-       CM_AD.Channel, CM_AD.Lead_Id as [Lead ID], Recommended_Combinations.Day_Of_Week as [Recommended Day],
-       Recommended_Combinations.Time_Of_Day as [Recommended Time], Recommended_Combinations.MaxProb,
-       Recommended_Combinations.Channel as [Recommended Channel]
+SELECT Age, Annual_Income_Bucket, Credit_Score, Product, Campaign_Name, State,  
+       Conversion_Flag, CM_AD.Day_Of_Week, CM_AD.Time_Of_Day,
+       CM_AD.Channel, CM_AD.Lead_Id, Recommended_Combinations.Day_Of_Week as [Recommended_Day],
+       Recommended_Combinations.Time_Of_Day as [Recommended_Time], Recommended_Combinations.Max_Prob,
+       Recommended_Combinations.Channel as [Recommended_Channel]
 INTO Recommendations
 FROM CM_AD JOIN Recommended_Combinations
 ON CM_AD.Lead_Id = Recommended_Combinations.Lead_Id;"