microsoft · uc-msft · Jan 27, 2017 · Jan 27, 2017 · Jan 27, 2017 · Jan 27, 2017
diff --git a/samples/features/r-services/Getting-Started/Customer-Clustering/Customer Clustering.R b/samples/features/r-services/Getting-Started/Customer-Clustering/Customer Clustering.R
@@ -0,0 +1,86 @@
+
+
+
+# Define the connection string. The database is tpcxbb_1gb, the one restored by Setup.sql and used by the SQL script.
+connStr <- paste0("Driver=SQL Server;Server=", "MyServer", ";Database=", "tpcxbb_1gb", ";Trusted_Connection=true;");
+
+# Input query: one row per customer with order/item/monetary return ratios and the return frequency. Return figures are cast to float before dividing so the ratios keep their fractional part.
+input_query <- "
+	SELECT
+  ss_customer_sk AS customer,
+  round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (cast(returns_count as float) / orders_count) END, 7) AS orderRatio,
+  round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (cast(returns_items as float) / orders_items) END, 7) AS itemsRatio,
+  round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (cast(returns_money as float) / orders_money) END, 7) AS monetaryRatio,
+  round(CASE WHEN ( returns_count IS NULL                                                                        ) THEN 0.0 ELSE  returns_count                 END, 0) AS frequency
+
+FROM
+  (
+    SELECT
+      ss_customer_sk, 
+      -- return order ratio
+      COUNT(distinct(ss_ticket_number)) AS orders_count,
+      -- return ss_item_sk ratio
+      COUNT(ss_item_sk) AS orders_items,
+      -- return monetary amount ratio
+      SUM( ss_net_paid ) AS orders_money
+    FROM store_sales s
+    GROUP BY ss_customer_sk
+  ) orders
+  LEFT OUTER JOIN
+  (
+    SELECT
+      sr_customer_sk,
+      -- return order ratio
+      count(distinct(sr_ticket_number)) as returns_count,
+      -- return ss_item_sk ratio
+      COUNT(sr_item_sk) as returns_items,
+      -- return monetary amount ratio
+      SUM( sr_return_amt ) AS returns_money
+    FROM store_returns
+    GROUP BY sr_customer_sk
+  ) returned ON ss_customer_sk=sr_customer_sk 
+"
+# SQL Server data source wrapping the input query; all five result columns are declared numeric
+customer_returns <- RxSqlServerData(
+    connectionString = connStr,
+    sqlQuery = input_query,
+    colClasses = c(customer = "numeric", orderRatio = "numeric", itemsRatio = "numeric", monetaryRatio = "numeric", frequency = "numeric"));
+
+# Run the data step over the data source and keep the result as customer_data
+customer_data <- rxDataStep(inData = customer_returns);
+# Preview the first five rows of the data we just loaded from SQL Server
+head(customer_data, 5);
+
+# Determine number of clusters
+# Plot the within-groups sum of squares against the number of clusters and look for the bend ("elbow"): that is the appropriate number of clusters.
+# Only the four model metrics are used (the customer key is an identifier, not a feature), and k-means is fitted once per candidate k.
+features <- customer_data[, c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")]
+wss <- (nrow(features) - 1) * sum(vapply(features, var, numeric(1)))
+for (i in 2:20) {
+    wss[i] <- sum(kmeans(features, centers = i)$withinss)
+}
+plot(1:20, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
+
+# SQL Server table that will receive the customer-to-cluster mapping
+return_cluster <- RxSqlServerData(connectionString = connStr, table = "return_cluster");
+
+# Fix the seed of the random number generator so repeated runs are predictable
+set.seed(10);
+
+# Cluster on the four metrics with rxKmeans (4 clusters). The cluster id goes to column "cluster" and is written,
+# together with the customer key, to the return_cluster table in SQL Server (overwrite = TRUE)
+clust <- rxKmeans(formula = ~ orderRatio + itemsRatio + monetaryRatio + frequency, data = customer_returns, numClusters = 4, outFile = return_cluster, outColName = "cluster", extraVarsToWrite = "customer", overwrite = TRUE);
+
+# Read the customer returns cluster table back from SQL Server
+customer_cluster <- rxDataStep(inData = return_cluster);
+
+# Plot the clusters with clusplot() from the "cluster" package
+# (run install.packages("cluster") first if the package is not installed)
+library(cluster);
+# NOTE(review): customer_data and customer_cluster rows are paired by position here - confirm both come back in the same order
+clusplot(customer_data, customer_cluster$cluster, color = TRUE, shade = TRUE, labels = 4, lines = 0, plotchar = TRUE);
+# Look at the clustering details and analyze the results
+clust
+
+
+
diff --git a/samples/features/r-services/Getting-Started/Customer-Clustering/Customer Clustering.sql b/samples/features/r-services/Getting-Started/Customer-Clustering/Customer Clustering.sql
@@ -0,0 +1,102 @@
+USE [tpcxbb_1gb]
+
+DROP PROC IF EXISTS generate_customer_return_clusters;
+GO
+CREATE procedure [dbo].[generate_customer_return_clusters]
+AS
+/* 
+  Runs an R script (rxKmeans, 4 clusters) that groups customers by their purchase & return
+  history and writes the customer/cluster mapping to the table customer_return_clusters.
+*/
+BEGIN
+	DECLARE @duration FLOAT  -- handed to the R script as an OUTPUT parameter; the script below never assigns it
+	, @instance_name NVARCHAR(100) = @@SERVERNAME
+	, @database_name NVARCHAR(128) = db_name()
+
+-- Input query to generate the purchase history & return metrics.
+-- Return counts/amounts are cast to float so the ratios are true fractions rather than integer quotients.
+	, @input_query NVARCHAR(MAX) = N'
+SELECT
+  ss_customer_sk AS customer,
+  round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (cast(returns_count as float) / orders_count) END, 7) AS orderRatio,
+  round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (cast(returns_items as float) / orders_items) END, 7) AS itemsRatio,
+  round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (cast(returns_money as float) / orders_money) END, 7) AS monetaryRatio,
+  round(CASE WHEN ( returns_count IS NULL                                                                        ) THEN 0.0 ELSE  returns_count                 END, 0) AS frequency
+
+FROM
+  (
+    SELECT
+      ss_customer_sk,
+      -- return order ratio
+      COUNT(distinct(ss_ticket_number)) AS orders_count,
+      -- return ss_item_sk ratio
+      COUNT(ss_item_sk) AS orders_items,
+      -- return monetary amount ratio
+      SUM( ss_net_paid ) AS orders_money
+    FROM store_sales s
+    GROUP BY ss_customer_sk
+  ) orders
+  LEFT OUTER JOIN
+  (
+    SELECT
+      sr_customer_sk,
+      -- return order ratio
+      count(distinct(sr_ticket_number)) as returns_count,
+      -- return ss_item_sk ratio
+      COUNT(sr_item_sk) as returns_items,
+      -- return monetary amount ratio
+      SUM( sr_return_amt ) AS returns_money
+    FROM store_returns
+    GROUP BY sr_customer_sk
+  ) returned ON ss_customer_sk=sr_customer_sk 
+ '
+
+EXEC sp_execute_external_script
+	  @language = N'R'
+	, @script = N'
+# Define the connection string
+connStr <- paste("Driver=SQL Server;Server=", instance_name, ";Database=", database_name, ";Trusted_Connection=true;", sep="");
+
+
+# Input customer data that needs to be classified. This is the result we get from our query
+customer_returns <- RxSqlServerData(sqlQuery = input_query,
+									colClasses = c(customer = "numeric", orderRatio = "numeric", itemsRatio = "numeric", monetaryRatio = "numeric", frequency = "numeric"),
+									connectionString = connStr);
+
+# Output table to hold the customer cluster mappings
+return_cluster = RxSqlServerData(table = "customer_return_clusters", connectionString = connStr);
+
+# set.seed for random number generator for predicatability
+set.seed(10);
+
+# generate clusters using rxKmeans and output clusters to a table called "customer_return_clusters". 
+clust <- rxKmeans( ~ orderRatio + itemsRatio + monetaryRatio + frequency, customer_returns, numClusters = 4
+                    , outFile = return_cluster, outColName = "cluster", writeModelVars = TRUE , extraVarsToWrite = c("customer"), overwrite = TRUE);
+'
+	, @input_data_1 = N''
+	, @params = N'@instance_name nvarchar(100), @database_name nvarchar(128), @input_query nvarchar(max), @duration float OUTPUT'
+	, @instance_name = @instance_name
+	, @database_name = @database_name
+	, @input_query = @input_query
+	, @duration = @duration OUTPUT;
+END;
+
+GO
+
+
+-- Empty the results table before running the stored procedure.
+-- The table is written by the procedure's rxKmeans step, so it may not exist before the first run: only truncate it when present.
+IF OBJECT_ID(N'dbo.customer_return_clusters', N'U') IS NOT NULL TRUNCATE TABLE dbo.customer_return_clusters;
+
+-- Execute the clustering. This will load the table customer_return_clusters with cluster mappings
+EXEC [dbo].[generate_customer_return_clusters];
+
+-- Now select data from table customer_return_clusters to verify that the clustering data was loaded
+SELECT * FROM dbo.customer_return_clusters;
+
+-- Select email addresses of customers in cluster 1
+SELECT c.c_email_address, c.c_customer_sk
+  FROM dbo.customer AS c
+  JOIN dbo.customer_return_clusters AS r
+    ON r.customer = c.c_customer_sk
+ WHERE r.cluster = 1;
diff --git a/samples/features/r-services/Getting-Started/Customer-Clustering/README.md b/samples/features/r-services/Getting-Started/Customer-Clustering/README.md
@@ -0,0 +1,70 @@
+# Perform customer clustering with SQL Server R Services
+
+In this sample, we will become familiar with clustering. 
+Clustering can be explained as organizing data into groups where members of a group are similar in some way.
+
+### Contents
+
+[About this sample](#about-this-sample)<br/>
+[Before you begin](#before-you-begin)<br/>
+[Sample details](#sample-details)<br/>
+[Related links](#related-links)<br/>
+
+
+<a name=about-this-sample></a>
+
+## About this sample
+
+We will be using the Kmeans algorithm to perform the clustering of customers. This can for example be used to target a specific group of customers for marketing efforts. 
+Kmeans clustering is an unsupervised learning algorithm that tries to group data based on similarities. Unsupervised learning means that there is no outcome to be predicted, and the algorithm just tries to find patterns in the data.
+
+In this sample, you will learn how to perform Kmeans clustering in R and deploy the solution in SQL Server 2016.
+
+Follow the step by step tutorial [here](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) to walk through this sample.
+
+<!-- Delete the ones that don't apply -->
+- **Applies to:** SQL Server 2016 (or higher)
+- **Key features:**
+- **Workload:** SQL Server R Services
+- **Programming Language:** T-SQL, R
+- **Authors:** Nellie Gustafsson
+- **Update history:** Getting started tutorial for R Services
+
+<a name=before-you-begin></a>
+
+## Before you begin
+
+To run this sample, you need the following prerequisites.
+Section 1 in the [tutorial](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) covers the prerequisites.
+After that, you can download a DB backup file and restore it using Setup.sql. [Download DB](https://deve2e.azureedge.net/sqlchoice/static/tpcxbb_1gb.bak)
+
+**Software prerequisites:**
+
+<!-- Examples -->
+1. SQL Server 2016 (or higher) with R Services installed
+2. SQL Server Management Studio
+3. An R IDE, such as Visual Studio with R Tools
+
+
+<a name=sample-details></a>
+## Sample Details
+
+### Customer Clustering.R
+
+The R script that performs clustering.
+
+### Customer Clustering.sql
+
+The SQL code to create the stored procedure that performs clustering, plus queries to verify the results and take further action.
+
+
+<a name=related-links></a>
+
+## Related Links
+<!-- Links to more articles. Remember to delete "en-us" from the link path. -->
+
+For additional content, see these articles:
+
+[SQL Server R Services - Upgrade and Installation FAQ](https://msdn.microsoft.com/en-us/library/mt653951.aspx)
+
+[Other SQL Server R Services Tutorials](https://msdn.microsoft.com/en-us/library/mt591993.aspx)
diff --git a/samples/features/r-services/Getting-Started/Customer-Clustering/Setup.sql b/samples/features/r-services/Getting-Started/Customer-Clustering/Setup.sql
@@ -0,0 +1,13 @@
+-- Before starting the tutorial, restore the database it uses.
+-- Step 1: Download the compressed backup file and save it in a location that SQL Server can access,
+-- for example: C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\
+-- Step 2: In a new query window in SSMS, execute the restore statement below, but REMEMBER TO CHANGE
+-- THE FILE PATHS to match the directories of your installation!
+USE master;
+GO
+RESTORE DATABASE tpcxbb_1gb
+FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\tpcxbb_1gb.bak'
+WITH
+    MOVE 'tpcxbb_1gb' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.mdf',
+    MOVE 'tpcxbb_1gb_log' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.ldf';
+GO
diff --git a/samples/features/r-services/Getting-Started/Predictive-Modeling/Predictive Model.R b/samples/features/r-services/Getting-Started/Predictive-Modeling/Predictive Model.R
@@ -0,0 +1,54 @@
+# Connection string used to connect to SQL Server
+connStr <- paste0("Driver=SQL Server; Server=", "MyServer",
+                  ";Database=", "tutorialdb", ";Trusted_Connection=true;");
+
+# Data source pointing at the dbo.rental_data table in SQL Server
+SQL_rentaldata <- RxSqlServerData(connectionString = connStr, returnDataFrame = TRUE,
+                                  table = "dbo.rental_data");
+
+# Import the table contents into rentaldata
+rentaldata <- rxImport(inData = SQL_rentaldata);
+
+# Show the top rows, then the structure of the data
+head(rentaldata);
+str(rentaldata);
+
+# Convert the three categorical columns to factor type in a single step.
+# This helps when building the model because it states explicitly that these values are categorical.
+rentaldata[c("Holiday", "Snow", "WeekDay")] <- lapply(
+    rentaldata[c("Holiday", "Snow", "WeekDay")],
+    factor);
+
+# Show the structure of the dataset again after the change
+str(rentaldata);
+
+# Split the dataset in two by year:
+# rows before 2015 train the models and the 2015 rows validate them
+train_data <- rentaldata[rentaldata[["Year"]] < 2015, ];
+test_data <- rentaldata[rentaldata[["Year"]] == 2015, ];
+
+# Keep the observed rental counts of the test set to check prediction quality against
+actual_counts <- test_data[["RentalCount"]];
+
+# Model 1: linear regression fitted with rxLinMod on the training data set
+model_linmod <- rxLinMod(formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday, data = train_data);
+
+# Model 2: decision tree fitted with rxDTree on the same formula and training data set
+model_dtree <- rxDTree(formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday, data = train_data);
+
+# Score the test data set with both models so that the predicted RentalCount of each model
+# can be compared with the actual values in the test data set
+predict_linmod <- rxPredict(modelObject = model_linmod, data = test_data, writeModelVars = TRUE);
+
+predict_dtree <- rxPredict(modelObject = model_dtree, data = test_data, writeModelVars = TRUE);
+
+# Preview the top rows of the two prediction data sets
+head(predict_linmod);
+head(predict_dtree);
+
+#Now we will use the plotting functionality in R to visualize the results from the predictions
+#Each panel plots predicted minus actual RentalCount for one model; the two panels are stacked (2 rows, 1 column) to compare accuracy
+par(mfrow = c(2, 1));
+plot(predict_linmod$RentalCount_Pred - predict_linmod$RentalCount, main = "Difference between actual and predicted. rxLinmod");
+plot(predict_dtree$RentalCount_Pred - predict_dtree$RentalCount, main = "Difference between actual and predicted. rxDTree");
+