diff --git a/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py new file mode 100644 index 0000000000..69f1dae9fd --- /dev/null +++ b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.py @@ -0,0 +1,119 @@ +# Load packages. +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import revoscalepy as revoscale +from scipy.spatial import distance as sci_distance +from sklearn import cluster as sk_cluster + + + +def perform_clustering(): + ################################################################################################ + + ## Connect to DB and select data + + ################################################################################################ + + # Connection string to connect to SQL Server named instance. + conn_str = 'Driver=SQL Server;Server=localhost;Database=tpcxbb_1gb;Trusted_Connection=True;' + + input_query = '''SELECT + ss_customer_sk AS customer, + ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio, + ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio, + ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio, + COALESCE(returns_count, 0) AS frequency + FROM + ( + SELECT + ss_customer_sk, + -- return order ratio + COUNT(distinct(ss_ticket_number)) AS orders_count, + -- return ss_item_sk ratio + COUNT(ss_item_sk) AS orders_items, + -- return monetary amount ratio + SUM( ss_net_paid ) AS orders_money + FROM store_sales s + GROUP BY ss_customer_sk + ) orders + LEFT OUTER JOIN + ( + SELECT + sr_customer_sk, + -- return order ratio + count(distinct(sr_ticket_number)) as returns_count, + -- return ss_item_sk ratio + COUNT(sr_item_sk) as returns_items, + -- return monetary amount ratio + SUM( sr_return_amt ) AS returns_money + FROM store_returns + GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk''' + + + # Define the columns we wish to import. + column_info = { + "customer": {"type": "integer"}, + "orderRatio": {"type": "integer"}, + "itemsRatio": {"type": "integer"}, + "frequency": {"type": "integer"} + } + + data_source = revoscale.RxSqlServerData(sql_query=input_query, column_info=column_info, + connection_string=conn_str) + + # import data source and convert to pandas dataframe. + customer_data = pd.DataFrame(revoscalepy.rx_import(data_source)) + print("Data frame:", customer_data.head(n=20)) + + ################################################################################################ + + ## Determine number of clusters using the Elbow method + + ################################################################################################ + + cdata = customer_data + K = range(1, 20) + KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K) + centroids = (k.cluster_centers_ for k in KM) + + D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids) + dist = (np.min(D, axis=1) for D in D_k) + avgWithinSS = [sum(d) / cdata.shape[0] for d in dist] + plt.plot(K, avgWithinSS, 'b*-') + plt.grid(True) + plt.xlabel('Number of clusters') + plt.ylabel('Average within-cluster sum of squares') + plt.title('Elbow for KMeans clustering') + plt.show() + + + ################################################################################################ + + ## Perform clustering using Kmeans + + ################################################################################################ + + # It looks like k=4 is a good number to use based on the elbow graph. + n_clusters = 4 + + means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111) + columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"] + est = means_cluster.fit(customer_data[columns]) + clusters = est.labels_ + customer_data['cluster'] = clusters + + # Print some data about the clusters: + + # For each cluster, count the members. + for c in range(n_clusters): + cluster_members=customer_data[customer_data['cluster'] == c][:] + print('Cluster{}(n={}):'.format(c, len(cluster_members))) + print('-'* 17) + + # Print mean values per cluster. + print(customer_data.groupby(['cluster']).mean()) + + +perform_clustering() + diff --git a/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql new file mode 100644 index 0000000000..1c3af44a04 --- /dev/null +++ b/samples/features/machine-learning-services/python/getting-started/customer-clustering/customer_clustering.sql @@ -0,0 +1,100 @@ +USE [tpcxbb_1gb] +GO + +-- Stored procedure that performs customer clustering using Python and SQL Server ML Services +CREATE OR ALTER PROCEDURE [dbo].[py_generate_customer_return_clusters] +AS + +BEGIN + DECLARE + +-- Input query to generate the purchase history & return metrics + @input_query NVARCHAR(MAX) = N' +SELECT + ss_customer_sk AS customer, + CAST( (ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) ) AS FLOAT) AS orderRatio, + CAST( (ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) ) AS FLOAT) AS itemsRatio, + CAST( (ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) ) AS FLOAT) AS monetaryRatio, + CAST( (COALESCE(returns_count, 0)) AS FLOAT) AS frequency +FROM + ( + SELECT + ss_customer_sk, + -- return order ratio + COUNT(distinct(ss_ticket_number)) AS orders_count, + -- return ss_item_sk ratio + COUNT(ss_item_sk) AS orders_items, + -- return monetary amount ratio + SUM( ss_net_paid ) AS orders_money + FROM store_sales s + GROUP BY ss_customer_sk + ) orders + LEFT OUTER JOIN + ( + SELECT + sr_customer_sk, + -- return order ratio + count(distinct(sr_ticket_number)) as returns_count, + -- return ss_item_sk ratio + COUNT(sr_item_sk) as returns_items, + -- return monetary amount ratio + SUM( sr_return_amt ) AS returns_money + FROM store_returns + GROUP BY sr_customer_sk + ) returned ON ss_customer_sk=sr_customer_sk + ' + +EXEC sp_execute_external_script + @language = N'Python' + , @script = N' + +import pandas as pd +from sklearn.cluster import KMeans + +#We concluded in step2 in the tutorial that 4 would be a good number of clusters +n_clusters = 4 + +#Perform clustering +est = KMeans(n_clusters=n_clusters, random_state=111).fit(customer_data[["orderRatio","itemsRatio","monetaryRatio","frequency"]]) +clusters = est.labels_ +customer_data["cluster"] = clusters + +#OutputDataSet = customer_data +' + , @input_data_1 = @input_query + , @input_data_1_name = N'customer_data' + ,@output_data_1_name = N'customer_data' + with result sets (("Customer" int, "orderRatio" float,"itemsRatio" float,"monetaryRatio" float,"frequency" float,"cluster" float)); +END; +GO + + +--Creating a table for storing the clustering data +DROP TABLE IF EXISTS [dbo].[py_customer_clusters]; +GO +--Create a table to store the predictions in +CREATE TABLE [dbo].[py_customer_clusters]( + [Customer] [bigint] NULL, + [OrderRatio] [float] NULL, + [itemsRatio] [float] NULL, + [monetaryRatio] [float] NULL, + [frequency] [float] NULL, + [cluster] [int] NULL, + ) ON [PRIMARY] +GO + +--Execute the clustering and insert results into table +INSERT INTO py_customer_clusters +EXEC [dbo].[py_generate_customer_return_clusters]; + +-- Select contents of the table +SELECT * FROM py_customer_clusters; + +--Get email addresses of customers in cluster 0 +SELECT customer.[c_email_address], customer.c_customer_sk + FROM dbo.customer + JOIN + [dbo].[py_customer_clusters] as c + ON c.Customer = customer.c_customer_sk + WHERE c.cluster = 0; + diff --git a/samples/features/readme.md b/samples/features/readme.md index 54139895ba..944ef2c2be 100644 --- a/samples/features/readme.md +++ b/samples/features/readme.md @@ -10,7 +10,11 @@ Master Data Services (MDS) is the SQL Server solution for master data management [R Services](r-services) -SQL Server R Services brings R processing close to the data, allowing more scalable and more efficient predictive analytics. +SQL Server R Services (in SQL Server 2016 and above) brings R processing close to the data, allowing more scalable and more efficient predictive analytics using R in-database. + +[ML Services](ml-services) + +SQL Server ML Services (SQL Server 2017) brings Python processing close to the data, allowing more scalable and more efficient predictive analytics using Python in-database. [JSON Support](json) @@ -24,4 +28,4 @@ Built-in temporal functions enable you to easily track history of changes in a t [Reporting Services (SSRS)](reporting-services) -Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service. \ No newline at end of file +Reporting Services provides reporting capabilities for your organziation. Reporting Services can be integrated with SharePoint Server or used as a standalone service.