# Azure-Synapse-Retail-Recommender-Solution-Accelerator

_This accelerator is built to provide developers to quickly build a Retail Recommender Solution based on Azure Synapse Analytics Platform._

![image-alt-text](https://d33wubrfki0l68.cloudfront.net/0d75c87ae627b0413623ca5d07c6be5add5a7bb1/052bd/assets/images/posts/carts.jpg)

**Important – Do not use in production, for demonstration purposes only – please review the legal notices before continuing**

In [169]:
# Check synapse version
import sys
print(sys.version)

StatementMeta(RetailSparkPool, 12, 109, Finished, Available)

3.6.11 | packaged by conda-forge | (default, Aug  5 2020, 20:04:48) 
[GCC 7.5.0]

In [170]:
# loading required packages
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import parser
from pyspark.sql.functions import unix_timestamp

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import RFormula
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator

StatementMeta(RetailSparkPool, 12, 110, Finished, Available)

### Data Ingestion
###### Read Spark table as a Spark dataframe
Loading Order, Customer and Product data from WWIRetailLakeDB

In [171]:
%%pyspark
df_order = spark.sql("SELECT * FROM WWIRetailLakeDB.Order")
df_order = df_order.drop("ItemSku", "EntityCode", "Brand")
print("Order =======================================================")
print(df_order.columns)
print(df_order.count())

df_brand = spark.sql("SELECT * FROM WWIRetailLakeDB.Brand")
print("Brand =======================================================")
print(df_brand.columns)
print(df_brand.count())

df_customer = spark.sql("SELECT * FROM WWIRetailLakeDB.Customer")
df_customer = df_customer.drop("EntityCode", "OrderId")
print("Customer =======================================================")
print(df_customer.columns)
print(df_customer.count())

df_product = spark.sql("SELECT * FROM WWIRetailLakeDB.Product")
print("Product =======================================================")
print(df_product.columns)
print(df_product.count())

StatementMeta(RetailSparkPool, 12, 111, Finished, Available)

['OrderId', 'CustomerId', 'ProductSalesPriceAmount', 'ProductListPriceAmount', 'TotalShippingChargeAmount', 'OrderTotalTaxAmount', 'OrderStatus', 'Quantity', 'ProductAdjustmentAmount', 'ProductAdjustmentPercentage', 'ProductId', 'QuantityBooked', 'QuantityBilled']
98880
['BrandName', 'BrandDescription', 'EntityCode', 'BrandId']
14
['CustomerId', 'CustomerFirstName', 'CustomerMiddleName', 'CustomerLastName', 'CustomerFullName', 'CustomerDOB']
105916
['ProductCategories', 'ProductName', 'ProductId', 'ItemSku', 'ProductShortDescription', 'ProductDescription', 'BrandName', 'EntityCode']
212

In [172]:
# Combining customer, order and product data
df = df_order.join(df_product, on='ProductId')
df = df.join(df_customer, on='CustomerId')

print("Columns: ")
print(df.columns)
print("No. of rows:", df.count())

StatementMeta(RetailSparkPool, 12, 112, Finished, Available)

Columns: 
['CustomerId', 'ProductId', 'OrderId', 'ProductSalesPriceAmount', 'ProductListPriceAmount', 'TotalShippingChargeAmount', 'OrderTotalTaxAmount', 'OrderStatus', 'Quantity', 'ProductAdjustmentAmount', 'ProductAdjustmentPercentage', 'QuantityBooked', 'QuantityBilled', 'ProductCategories', 'ProductName', 'ItemSku', 'ProductShortDescription', 'ProductDescription', 'BrandName', 'EntityCode', 'CustomerFirstName', 'CustomerMiddleName', 'CustomerLastName', 'CustomerFullName', 'CustomerDOB']
No. of rows: 1594430

In [173]:
# top 5 brand
top_brand = df[df["EntityCode"] == 'WideWorldImporters'].groupBy('BrandName').count().sort('count', ascending=False).limit(5) # only keep top 5 categories
#top_category = top_category.withColumnRenamed("category_code","category_code_tmp")
display(top_brand)

StatementMeta(RetailSparkPool, 12, 113, Finished, Available)

SynapseWidget(Synapse.DataFrame, 9a302add-850f-499c-acbf-e93a070c3e42)

In [174]:
# Top 5 most bought products
product_count = df.groupBy('ProductId').count()
product_count.show(5)

StatementMeta(RetailSparkPool, 12, 114, Finished, Available)

+---------+-----+
|ProductId|count|
+---------+-----+
|      125| 7521|
|      124| 7555|
|        7|15000|
|       51|15014|
|       54|15058|
+---------+-----+
only showing top 5 rows

In [175]:
raw_df = df

product_count = df.groupBy('ProductId').count()
#product_count = product_count.filter("count >= 3000").orderBy('count', ascending=False) #only counts when the product has 20000 views

raw_df = raw_df.withColumnRenamed("ProductId","product_id_tmp")
raw_df = raw_df.join(product_count, raw_df.product_id_tmp == product_count.ProductId)

user_count = df.groupBy('CustomerId').count()
#user_count = user_count.filter("count >= 20").orderBy('count', ascending=False) #only counts when the user has 100 clicks

raw_df = raw_df.withColumnRenamed("CustomerId","user_id_tmp")
raw_df = raw_df.join(user_count, raw_df.user_id_tmp == user_count.CustomerId)

df = raw_df

df = df.where(df.EntityCode == "Contoso")
#df = df.drop("event_time","category_code","user_session","price","brand","category_id")
df = df.groupBy([df.ProductId, df.CustomerId]).count()
df.columns

StatementMeta(RetailSparkPool, 12, 115, Finished, Available)

['ProductId', 'CustomerId', 'count']

In [176]:
from pyspark.sql.types import IntegerType
df = df.withColumn("CustomerId", df["CustomerId"].cast(IntegerType()))
df = df.withColumn("ProductId", df["ProductId"].cast(IntegerType()))
df.printSchema()

StatementMeta(RetailSparkPool, 12, 116, Finished, Available)

root
 |-- ProductId: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- count: long (nullable = false)

In [177]:
df.show(5)

StatementMeta(RetailSparkPool, 12, 117, Finished, Available)

+---------+----------+-----+
|ProductId|CustomerId|count|
+---------+----------+-----+
|      125|       296|   11|
|       15|       296|   11|
|       54|       296|   22|
|      101|       296|   11|
|       29|       296|   11|
+---------+----------+-----+
only showing top 5 rows

In [178]:
#split the data into training and test datatset
train,test=df.randomSplit([0.75,0.25])
print("Training Count:")
train.count()

print("Test Count:")
test.count()

StatementMeta(RetailSparkPool, 12, 118, Finished, Available)

Training Count:
Test Count:


17863

#### Alternating least square (ALS) Recommender 

Apache Spark ML implements ALS for collaborative filtering, a very popular algorithm for making recommendations.

ALS recommender is a matrix factorization algorithm that uses Alternating Least Squares with Weighted-Lamda-Regularization (ALS-WR). It factors the user to item matrix A into the user-to-feature matrix U and the item-to-feature matrix M: It runs the ALS algorithm in a parallel fashion. The ALS algorithm should uncover the latent factors that explain the observed user to item ratings and tries to find optimal factor weights to minimize the least squares between predicted and actual ratings.

In [179]:
#import ALS recommender function from pyspark ml library
from pyspark.ml.recommendation import ALS

#Training the recommender model using train datatset
rec = ALS(maxIter=10,regParam=0.20,implicitPrefs = True, userCol='CustomerId',itemCol='ProductId',
            ratingCol='count',nonnegative=True,coldStartStrategy="drop", rank=25)

#fit the model on train set
rec_model = rec.fit(train)

#making predictions on test set 
predicted_ratings = rec_model.transform(test)

#columns in predicted ratings dataframe
predicted_ratings.printSchema()

StatementMeta(RetailSparkPool, 12, 119, Finished, Available)

root
 |-- ProductId: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- count: long (nullable = false)
 |-- prediction: float (nullable = false)

In [180]:
#predicted vs actual ratings for test set 
predicted_ratings.orderBy(rand()).show(10)

StatementMeta(RetailSparkPool, 12, 120, Finished, Available)

+---------+----------+-----+----------+
|ProductId|CustomerId|count|prediction|
+---------+----------+-----+----------+
|       24|       781|   10|0.63058364|
|       94|       144|   11|0.81831264|
|       95|       658|   11|0.68371904|
|      136|       265|   20|0.83226246|
|      126|        99|   11| 0.8410328|
|        4|       501|   33| 0.8276296|
|      134|       104|   22| 0.7550775|
|      102|       885|   11| 0.7919355|
|       26|        97|   11| 0.5906551|
|       85|       962|   11| 0.8203504|
+---------+----------+-----+----------+
only showing top 10 rows

In [181]:
predicted_ratings_witherr=predicted_ratings.withColumn('err',abs(predicted_ratings["prediction"] - predicted_ratings["count"]))
predicted_ratings_witherr.show(5)

StatementMeta(RetailSparkPool, 12, 121, Finished, Available)

+---------+----------+-----+----------+---------+
|ProductId|CustomerId|count|prediction|      err|
+---------+----------+-----+----------+---------+
|       31|       243|   22|  0.861669| 21.13833|
|       31|       623|   11| 0.7423455|10.257654|
|       31|       137|   11| 0.7694933|10.230507|
|       31|       796|   11| 0.8146975|10.185303|
|       31|       300|   33| 0.9350311| 32.06497|
+---------+----------+-----+----------+---------+
only showing top 5 rows

In [182]:
df.groupBy('count').count().orderBy('count',ascending=True).show(5)

StatementMeta(RetailSparkPool, 12, 122, Finished, Available)

+-----+-----+
|count|count|
+-----+-----+
|   10| 2317|
|   11|47068|
|   20|  778|
|   22|16629|
|   30|  179|
+-----+-----+
only showing top 5 rows

In [183]:
predicted_ratings_witherr.groupBy('count').agg({'err':'mean'}).orderBy('count',ascending=True).show(5)

StatementMeta(RetailSparkPool, 12, 123, Finished, Available)

+-----+------------------+
|count|          avg(err)|
+-----+------------------+
|   10| 9.202115451683433|
|   11| 10.18813630941823|
|   20|19.208826695013485|
|   22|21.189440337600413|
|   30|29.209436779930478|
+-----+------------------+
only showing top 5 rows

In [184]:
#importing Regression Evaluator to measure RMSE
from pyspark.ml.evaluation import RegressionEvaluator

#create Regressor evaluator object for measuring accuracy
evaluator=RegressionEvaluator(metricName='rmse',predictionCol='prediction',labelCol='count')

#apply the RE on predictions dataframe to calculate RMSE
rmse=evaluator.evaluate(predicted_ratings)
print(rmse)

StatementMeta(RetailSparkPool, 12, 124, Finished, Available)

15.985428794658388

In [185]:
# Save the model
rec_model.write().overwrite().save("retailai_recommendation_model")

StatementMeta(RetailSparkPool, 12, 125, Finished, Available)

###  Below modules can be used to compute recommendations for targeted customers from the above trained ALS model


In [186]:
# loading above trained recommendation model
from pyspark.ml.recommendation import ALSModel
model_reload = ALSModel.load("retailai_recommendation_model")
print(model_reload)

StatementMeta(RetailSparkPool, 12, 126, Finished, Available)

ALS_a67819b261ab

An item-based recommender model that computes cosine similarity for each item pairs using the item factors matrix generated by Spark MLlib’s ALS algorithm and recommends top 5 products based on the selected customers.



In [187]:
from pyspark.sql import Row
from pyspark.sql.functions import col
import numpy as np
from numpy import linalg as LA
from pyspark.ml.recommendation import ALSModel

class ItemBasedRecommender():

    def __init__(self, model, spark):
        self.model = model
        self.spark = spark
        self.itemFactors = self.model.itemFactors

    def compute_similarity(self, item_id):
        item = self.itemFactors.where(
            col('id') == item_id).select(col('features'))
        item_features = item.rdd.map(lambda x: x.features).first()

        lol = []
        for row in self.itemFactors.rdd.toLocalIterator():
            _id = row.__getattr__('id')
            features = row.__getattr__('features')
            similarity_score = self._cosine_similarity(features, item_features)
            if _id != item_id:
                lol.append([_id, similarity_score])

        R = Row('item_index', 'similarity_score')
        self.similar_items_df = self.spark.createDataFrame(
            [R(col[0], float(col[1])) for col in lol])
        self.similar_items_df = self.similar_items_df.orderBy(
            col('similarity_score').desc()).na.drop()
        return self.similar_items_df


    def _cosine_similarity(self, vector_1, vector_2):
        v1 = np.asarray(vector_1)
        v2 = np.asarray(vector_2)
        cs = v1.dot(v2) / (LA.norm(v1) * LA.norm(v2))
        return(cs)

StatementMeta(RetailSparkPool, 12, 127, Finished, Available)

### Input the CustomerIDs of the millennials to get the top 5 recommended products

In [240]:
# CustomerID as input. Note: this is the normalized CustomerID starting from 0.        
CustomerID = 24
item_based_rec = ItemBasedRecommender(model_reload, spark)
ret_df = item_based_rec.compute_similarity(CustomerID)

ret_df.show(5)

StatementMeta(RetailSparkPool, 12, 180, Finished, Available)

+----------+------------------+
|item_index|  similarity_score|
+----------+------------------+
|        13|0.6715755562542827|
|        25|0.6637896955347723|
|        75|0.6419629417589321|
|        18|0.6197625370885771|
|        77|0.6194915714535928|
+----------+------------------+
only showing top 5 rows

#### For WideWorldImporters, top 5 recommended products


In [241]:
df_product_wwi = df_product[df_product["EntityCode"] == 'WideWorldImporters']
rec_df = df_product_wwi.join(ret_df, (df_product_wwi["ProductId"]==ret_df["item_index"]))

rec_df[[ 'ProductCategories', 'ProductId', 'BrandName', 'EntityCode', "similarity_score"]].show(5)

StatementMeta(RetailSparkPool, 12, 181, Finished, Available)

+--------------------+---------+--------------------+------------------+------------------+
|   ProductCategories|ProductId|           BrandName|        EntityCode|  similarity_score|
+--------------------+---------+--------------------+------------------+------------------+
|            Clothing|       13|           Proseware|WideWorldImporters|0.6715755562542827|
|      Sporting Goods|       25|Cronus International|WideWorldImporters|0.6637896955347723|
|                Life|       75|              Cpandl|WideWorldImporters|0.6419629417589321|
|Menswear; Womenswear|       18|           Proseware|WideWorldImporters|0.6197625370885771|
|                Life|       77|              Cpandl|WideWorldImporters|0.6194915714535928|
+--------------------+---------+--------------------+------------------+------------------+
only showing top 5 rows