Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Load packages.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import revoscalepy as revoscale
from scipy.spatial import distance as sci_distance
from sklearn import cluster as sk_cluster



def perform_clustering():
################################################################################################

## Connect to DB and select data

################################################################################################

# Connection string to connect to SQL Server named instance.
conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;'

input_query = '''SELECT
ss_customer_sk AS customer,
ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
COALESCE(returns_count, 0) AS frequency
FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''


# Define the columns we wish to import.
column_info = {
"customer": {"type": "integer"},
"orderRatio": {"type": "integer"},
"itemsRatio": {"type": "integer"},
"frequency": {"type": "integer"}
}

data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info,
connection_string=conn_str)

# import data source and convert to pandas dataframe.
customer_data = pd.DataFrame(revoscalepy.rx_import(data_source))
print("Data frame:", customer_data.head(n=20))

################################################################################################

## Determine number of clusters using the Elbow method

################################################################################################

cdata = customer_data
K = range(1, 20)
KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
centroids = (k.cluster_centers_ for k in KM)

D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
dist = (np.min(D, axis=1) for D in D_k)
avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
plt.plot(K, avgWithinSS, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')
plt.show()


################################################################################################

## Perform clustering using Kmeans

################################################################################################

# It looks like k=4 is a good number to use based on the elbow graph.
n_clusters = 4

means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
est = means_cluster.fit(customer_data[columns])
clusters = est.labels_
customer_data['cluster'] = clusters

# Print some data about the clusters:

# For each cluster, count the members.
for c in range(n_clusters):
cluster_members=customer_data[customer_data['cluster'] == c][:]
print('Cluster{}(n={}):'.format(c, len(cluster_members)))
print('-'* 17)

# Print mean values per cluster.
print(customer_data.groupby(['cluster']).mean())


perform_clustering()

Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
USE [tpcxbb_1gb]
GO

-- Stored procedure that performs customer clustering using Python and SQL Server ML Services
CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters]
AS

BEGIN
DECLARE

-- Input query to generate the purchase history & return metrics
@input_query NVARCHAR(MAX) = N'
SELECT
ss_customer_sk AS customer,
CAST( (ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) ) AS FLOAT) AS orderRatio,
CAST( (ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) ) AS FLOAT) AS itemsRatio,
CAST( (ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) ) AS FLOAT) AS monetaryRatio,
CAST( (COALESCE(returns_count, 0)) AS FLOAT) AS frequency
FROM
(
SELECT
ss_customer_sk,
-- return order ratio
COUNT(distinct(ss_ticket_number)) AS orders_count,
-- return ss_item_sk ratio
COUNT(ss_item_sk) AS orders_items,
-- return monetary amount ratio
SUM( ss_net_paid ) AS orders_money
FROM store_sales s
GROUP BY ss_customer_sk
) orders
LEFT OUTER JOIN
(
SELECT
sr_customer_sk,
-- return order ratio
count(distinct(sr_ticket_number)) as returns_count,
-- return ss_item_sk ratio
COUNT(sr_item_sk) as returns_items,
-- return monetary amount ratio
SUM( sr_return_amt ) AS returns_money
FROM store_returns
GROUP BY sr_customer_sk
) returned ON ss_customer_sk=sr_customer_sk
'

EXEC sp_execute_external_script
@language = N'Python'
, @script = N'

import pandas as pd
from sklearn.cluster import KMeans

#We concluded in step2 in the tutorial that 4 would be a good number of clusters
n_clusters = 4

#Perform clustering
est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderRatio","itemsRatio","monetaryRatio","frequency"]])
clusters = est.labels_
customer_data["cluster"] = clusters

#OutputDataSet = customer_data
'
, @input_data_1 = @input_query
, @input_data_1_name = N'customer_data'
,@output_data_1_name = N'customer_data'
with result sets (("Customer" int, "orderRatio" float,"itemsRatio" float,"monetaryRatio" float,"frequency" float,"cluster" float));
END;
GO


--Creating a table for storing the clustering data
DROP TABLE IF EXISTS [dbo].[py_customer_clusters];
GO
--Create a table to store the predictions in
CREATE TABLE [dbo].[py_customer_clusters](
[Customer] [bigint] NULL,
[OrderRatio] [float] NULL,
[itemsRatio] [float] NULL,
[monetaryRatio] [float] NULL,
[frequency] [float] NULL,
[cluster] [int] NULL,
) ON [PRIMARY]
GO

--Execute the clustering and insert results into table
INSERT INTO py_customer_clusters
EXEC [dbo].[py_generate_customer_return_clusters];

-- Select contents of the table
SELECT * FROM py_customer_clusters;

--Get email addresses of customers in cluster 0
SELECT customer.[c_email_address], customer.c_customer_sk
FROM dbo.customer
JOIN
[dbo].[py_customer_clusters] as c
ON c.Customer = customer.c_customer_sk
WHERE c.cluster = 0;

8 changes: 6 additions & 2 deletions samples/features/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management

[R Services](r-services)

SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics.
SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database.

[ML Services](ml-services)

SQL Server ML Services (SQL Server 2017) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database.

[JSON Support](json)

Expand All @@ -24,4 +28,4 @@ Built-in temporal functions enable you to easily track history of changes in a t

[Reporting Services (SSRS)](reporting-services)

Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.
Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.