Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@



# Define the connection string.
# "MyServer" is a placeholder for the SQL Server instance name. The database
# is tpcxbb_1gb, the database this sample's SQL scripts restore and use.
connStr <- paste0(
  "Driver=SQL Server;Server=", "MyServer",
  ";Database=", "tpcxbb_1gb",
  ";Trusted_Connection=true;"
)

# Input query: one row per customer found in store_sales, with the ratio of
# returns to purchases (by order count, item count and money) and the number
# of returned orders (frequency). Customers with no returns get 0 everywhere.
# The numerators are cast to FLOAT so the divisions are floating-point:
# a character cast is implicitly converted back to an integer when divided
# by an integer count, which truncates the ratio to a whole number.
input_query <- "
SELECT
ss_customer_sk AS customer,
round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (CAST(returns_count AS FLOAT) / orders_count) END, 7) AS orderRatio,
round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (CAST(returns_items AS FLOAT) / orders_items) END, 7) AS itemsRatio,
round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (CAST(returns_money AS FLOAT) / orders_money) END, 7) AS monetaryRatio,
round(CASE WHEN ( returns_count IS NULL ) THEN 0.0 ELSE returns_count END, 0) AS frequency

FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk
) returned ON ss_customer_sk=sr_customer_sk
"
# SQL Server data source for the customers to classify. It is backed by the
# input query, and all five result columns are read as numeric.
customer_returns <- RxSqlServerData(
  sqlQuery = input_query,
  colClasses = c(
    customer = "numeric",
    orderRatio = "numeric",
    itemsRatio = "numeric",
    monetaryRatio = "numeric",
    frequency = "numeric"
  ),
  connectionString = connStr
)

# Run the data source through rxDataStep and keep the result locally
customer_data <- rxDataStep(inData = customer_returns)

# Peek at the first five rows that came back from SQL Server
head(customer_data, n = 5)

# Determine number of clusters
# Plot the within-groups sum of squares (WSS) against the number of clusters
# and look for a bend: the "elbow" suggests an appropriate number of clusters.
# Only the four metrics that are clustered on are used here; the customer key
# is an identifier, not a feature, and would distort the sums of squares.
feature_cols <- c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")
cluster_features <- customer_data[, feature_cols, drop = FALSE]
max_clusters <- 20L

# kmeans() picks random starting centers; seed the generator so the plot is
# reproducible from run to run
set.seed(10)

wss <- numeric(max_clusters)
# One cluster: total sum of squares around the column means
wss[1] <- (nrow(cluster_features) - 1) *
  sum(vapply(cluster_features, var, numeric(1)))
# Two or more clusters: fit k-means once per k and keep its total
# within-cluster sum of squares (the sum of the per-cluster withinss values)
for (k in seq_len(max_clusters)[-1]) {
  wss[k] <- kmeans(cluster_features, centers = k)$tot.withinss
}
plot(seq_len(max_clusters), wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")

# Data source for the SQL Server table that receives the customer-to-cluster
# mappings
return_cluster <- RxSqlServerData(
  table = "return_cluster",
  connectionString = connStr
)

# Fix the random seed so repeated runs are predictable
set.seed(10)

# Cluster the customers into 4 groups with rxKmeans on the four metrics.
# The customer key and its cluster are written to the SQL Server table
# "return_cluster", replacing any previous contents.
clust <- rxKmeans(
  formula = ~ orderRatio + itemsRatio + monetaryRatio + frequency,
  data = customer_returns,
  numClusters = 4,
  outFile = return_cluster,
  outColName = "cluster",
  extraVarsToWrite = c("customer"),
  overwrite = TRUE
)

# Read the customer returns cluster table back
customer_cluster <- rxDataStep(inData = return_cluster)

# Plot the clusters (needs the "cluster" package)
# install.packages("cluster")
library("cluster")

# Plot only the four metrics the clusters were computed from; the customer
# key is an identifier, not a feature.
feature_cols <- c("orderRatio", "itemsRatio", "monetaryRatio", "frequency")

# customer_data and customer_cluster come from two separate reads, so their
# row order is not guaranteed to agree. Look up each customer's cluster by
# key instead of relying on position.
cluster_of_customer <- customer_cluster$cluster[
  match(customer_data$customer, customer_cluster$customer)
]

# main is set explicitly so the title does not become the subsetting expression
clusplot(customer_data[, feature_cols, drop = FALSE], cluster_of_customer,
         color = TRUE, shade = TRUE, labels = 4, lines = 0, plotchar = TRUE,
         main = "CLUSPLOT( customer_data )")

# Look at the clustering details and analyze results
clust



Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
USE [tpcxbb_1gb]

DROP PROC IF EXISTS generate_customer_return_clusters;
GO
CREATE procedure [dbo].[generate_customer_return_clusters]
AS
/*
This procedure uses R to classify customers into different groups based on their
purchase & return history.

It takes no parameters and returns no result set. The embedded R script runs
rxKmeans (4 clusters) over the result of @input_query and writes the customer
key, the model variables and the assigned cluster to the table
"customer_return_clusters" in the current database on the current instance.
*/
BEGIN
DECLARE @instance_name NVARCHAR(100) = @@SERVERNAME
, @database_name NVARCHAR(128) = db_name()

-- Input query to generate the purchase history & return metrics.
-- The numerators are cast to FLOAT so the divisions are floating-point:
-- a character cast is implicitly converted back to an integer when divided
-- by an integer count, which truncates the ratio to a whole number.
, @input_query NVARCHAR(MAX) = N'
SELECT
ss_customer_sk AS customer,
round(CASE WHEN ((orders_count = 0) OR (returns_count IS NULL) OR (orders_count IS NULL) OR ((returns_count / orders_count) IS NULL) ) THEN 0.0 ELSE (CAST(returns_count AS FLOAT) / orders_count) END, 7) AS orderRatio,
round(CASE WHEN ((orders_items = 0) OR(returns_items IS NULL) OR (orders_items IS NULL) OR ((returns_items / orders_items) IS NULL) ) THEN 0.0 ELSE (CAST(returns_items AS FLOAT) / orders_items) END, 7) AS itemsRatio,
round(CASE WHEN ((orders_money = 0) OR (returns_money IS NULL) OR (orders_money IS NULL) OR ((returns_money / orders_money) IS NULL) ) THEN 0.0 ELSE (CAST(returns_money AS FLOAT) / orders_money) END, 7) AS monetaryRatio,
round(CASE WHEN ( returns_count IS NULL ) THEN 0.0 ELSE returns_count END, 0) AS frequency

FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk
) returned ON ss_customer_sk=sr_customer_sk
'

-- Run the R script; instance_name, database_name and input_query are made
-- available to it as R variables through @params.
EXEC sp_execute_external_script
@language = N'R'
, @script = N'
# Define the connection string
connStr <- paste("Driver=SQL Server;Server=", instance_name, ";Database=", database_name, ";Trusted_Connection=true;", sep="");


# Input customer data that needs to be classified. This is the result we get from our query
customer_returns <- RxSqlServerData(sqlQuery = input_query,
colClasses = c(customer = "numeric", orderRatio = "numeric", itemsRatio = "numeric", monetaryRatio = "numeric", frequency = "numeric"),
connectionString = connStr);

# Output table to hold the customer cluster mappings
return_cluster = RxSqlServerData(table = "customer_return_clusters", connectionString = connStr);

# set.seed for random number generator for predicatability
set.seed(10);

# generate clusters using rxKmeans and output clusters to a table called "customer_return_clusters".
clust <- rxKmeans( ~ orderRatio + itemsRatio + monetaryRatio + frequency, customer_returns, numClusters = 4
, outFile = return_cluster, outColName = "cluster", writeModelVars = TRUE , extraVarsToWrite = c("customer"), overwrite = TRUE);
'
, @input_data_1 = N''
, @params = N'@instance_name nvarchar(100), @database_name nvarchar(128), @input_query nvarchar(max)'
, @instance_name = @instance_name
, @database_name = @database_name
, @input_query = @input_query;
END;

GO


--Empty the results table before running the stored procedure.
--The table may not exist yet (for example on a first run), so only truncate it when it is there.
IF OBJECT_ID(N'customer_return_clusters', N'U') IS NOT NULL
TRUNCATE TABLE customer_return_clusters;

--Execute the clustering. This will load the table customer_return_clusters with cluster mappings
EXEC [dbo].[generate_customer_return_clusters];

--Now select data from table customer_return_clusters to verify that the clustering data was loaded
SELECT * FROM customer_return_clusters;

--Select email addresses of customers in cluster 1
SELECT customer.[c_email_address], customer.c_customer_sk
FROM dbo.customer
JOIN
[dbo].[customer_return_clusters] as r
ON r.customer = customer.c_customer_sk
WHERE r.cluster = 1;
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Perform customer clustering with SQL Server R Services

In this sample, we will become familiar with clustering.
Clustering can be explained as organizing data into groups where members of a group are similar in some way.

### Contents

[About this sample](#about-this-sample)<br/>
[Before you begin](#before-you-begin)<br/>
[Sample details](#sample-details)<br/>
[Related links](#related-links)<br/>


<a name=about-this-sample></a>

## About this sample

We will be using the Kmeans algorithm to perform the clustering of customers. This can for example be used to target a specific group of customers for marketing efforts.
Kmeans clustering is an unsupervised learning algorithm that tries to group data based on similarities. Unsupervised learning means that there is no outcome to be predicted, and the algorithm just tries to find patterns in the data.

In this sample, you will learn how to perform Kmeans clustering in R and deploy the solution in SQL Server 2016.

Follow the step by step tutorial [here](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) to walk through this sample.

<!-- Delete the ones that don't apply -->
- **Applies to:** SQL Server 2016 (or higher)
- **Key features:**
- **Workload:** SQL Server R Services
- **Programming Language:** T-SQL, R
- **Authors:** Nellie Gustafsson
- **Update history:** Getting started tutorial for R Services

<a name=before-you-begin></a>

## Before you begin

To run this sample, you need the following prerequisites.
Section 1 in the [tutorial](https://www.microsoft.com/en-us/sql-server/developer-get-started/rclustering) covers the prerequisites.
After that, you can download a DB backup file and restore it using Setup.sql. [Download DB](https://deve2e.azureedge.net/sqlchoice/static/tpcxbb_1gb.bak)

**Software prerequisites:**

<!-- Examples -->
1. SQL Server 2016 (or higher) with R Services installed
2. SQL Server Management Studio
3. R IDE Tool like Visual Studio


<a name=sample-details></a>
## Sample Details

### Customer Clustering.R

The R script that performs clustering.

### Customer Clustering.SQL

The SQL code that creates the stored procedure that performs the clustering, plus queries to verify the results and take further action.


<a name=related-links></a>

## Related Links
<!-- Links to more articles. Remember to delete "en-us" from the link path. -->

For additional content, see these articles:

[SQL Server R Services - Upgrade and Installation FAQ](https://msdn.microsoft.com/en-us/library/mt653951.aspx)

[Other SQL Server R Services Tutorials](https://msdn.microsoft.com/en-us/library/mt591993.aspx)
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Restore the database used by this tutorial before running the sample.
--   1. Download the compressed backup file.
--   2. Save it to a location SQL Server can access, for example:
--      C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\
--   3. In a new query window in SSMS, run the RESTORE statement below.
--      REMEMBER TO CHANGE THE FILE PATHS to match the directories of your installation!
USE [master];
GO

RESTORE DATABASE [tpcxbb_1gb]
    FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\Backup\tpcxbb_1gb.bak'
    WITH
        MOVE 'tpcxbb_1gb' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.mdf',
        MOVE 'tpcxbb_1gb_log' TO 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\tpcxbb_1gb.ldf';
GO
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Build the connection string for the tutorial database.
# "MyServer" is a placeholder for the SQL Server instance name.
connStr <- paste0(
  "Driver=SQL Server; Server=", "MyServer",
  ";Database=", "tutorialdb",
  ";Trusted_Connection=true;"
)

# Data source pointing at the dbo.rental_data table in SQL Server
SQL_rentaldata <- RxSqlServerData(
  table = "dbo.rental_data",
  connectionString = connStr,
  returnDataFrame = TRUE
)

# Import the table contents
rentaldata <- rxImport(inData = SQL_rentaldata)

# Inspect the top rows and the structure of the data
head(rentaldata)
str(rentaldata)

# Convert the three categorical columns to factors so that model building
# treats their values as categories rather than numbers
rentaldata[c("Holiday", "Snow", "WeekDay")] <-
  lapply(rentaldata[c("Holiday", "Snow", "WeekDay")], factor)

# Check the structure again after the conversion
str(rentaldata)

# Split the data by year: rows before 2015 train the models, and the rows
# from 2015 are held back to validate them
train_data <- rentaldata[rentaldata[["Year"]] < 2015, ]
test_data <- rentaldata[rentaldata[["Year"]] == 2015, ]

# Actual rental counts of the test set, for checking prediction quality
actual_counts <- test_data[["RentalCount"]]

# Model 1: linear regression (rxLinMod), fitted on the training set
model_linmod <- rxLinMod(
  formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
  data = train_data
)

# Model 2: decision tree (rxDTree), fitted on the same training set
model_dtree <- rxDTree(
  formula = RentalCount ~ Month + Day + WeekDay + Snow + Holiday,
  data = train_data
)

# Predict on the test set with both models. writeModelVars = TRUE is passed
# so the model variables appear alongside the predictions, which lets the
# predicted RentalCount be compared with the actual value.
predict_linmod <- rxPredict(
  modelObject = model_linmod,
  data = test_data,
  writeModelVars = TRUE
)

predict_dtree <- rxPredict(
  modelObject = model_dtree,
  data = test_data,
  writeModelVars = TRUE
)

# Look at the top rows of the two prediction data sets
head(predict_linmod)
head(predict_dtree)

# Visualize the results: one panel per model, stacked vertically, each
# plotting predicted minus actual RentalCount so the two models' accuracy
# can be compared
par(mfrow = c(2, 1))
plot(
  predict_linmod$RentalCount_Pred - predict_linmod$RentalCount,
  main = "Difference between actual and predicted. rxLinmod"
)
plot(
  predict_dtree$RentalCount_Pred - predict_dtree$RentalCount,
  main = "Difference between actual and predicted. rxDTree"
)

Loading