In [1]:
# Set up the Azure Machine Learning data collector (used to log various
# metrics) and a shared pretty-printer for the cells that follow.
import pprint

from azureml.logging import get_azureml_logger

# Logger handle returned by the Azure ML data collector.
logger = get_azureml_logger()

# Pretty-printer used by later cells to dump collected Spark rows.
pp = pprint.PrettyPrinter(depth=10, indent=4)

In [4]:
import pyspark
import unittest
from mmlspark.SAR import SAR

from pyspark.ml.tuning import *
from pyspark.sql.types import *


# Create Sample Data

In [5]:
# Start (or reuse) a local Spark session that uses every available core.
session_builder = pyspark.sql.SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

# Four integer columns; the schema is built from the column names in order.
column_names = ["customerID", "itemID", "rating", "notTime"]
cSchema = StructType([StructField(name, IntegerType()) for name in column_names])

# Toy ratings, grouped per customer as (itemID, rating) pairs.
# In every row of this sample the "notTime" column carries the same value as
# "rating", so it is generated from the rating instead of being typed twice.
ratings_by_customer = [
    (0, [(1, 4), (3, 1), (4, 5), (5, 3), (7, 3), (9, 3), (0, 3)]),
    (1, [(1, 4), (2, 5), (3, 1), (6, 4), (7, 5), (8, 1), (0, 3)]),
    (2, [(1, 4), (2, 1), (3, 1), (4, 5), (5, 3), (6, 4), (8, 1), (9, 5), (0, 3)]),
    (3, [(2, 5), (3, 1), (4, 5), (5, 3), (6, 4), (7, 5), (8, 1), (9, 5), (0, 3)]),
]

# Flatten to (customerID, itemID, rating, notTime) tuples, preserving order.
rating_rows = [
    (customer_id, item_id, score, score)
    for customer_id, item_scores in ratings_by_customer
    for item_id, score in item_scores
]

ratings = spark.createDataFrame(rating_rows, cSchema)

# Show the whole (small) sample so the reader can inspect the input.
pp.pprint(ratings.collect())

[   Row(customerID=0, itemID=1, rating=4, notTime=4),
    Row(customerID=0, itemID=3, rating=1, notTime=1),
    Row(customerID=0, itemID=4, rating=5, notTime=5),
    Row(customerID=0, itemID=5, rating=3, notTime=3),
    Row(customerID=0, itemID=7, rating=3, notTime=3),
    Row(customerID=0, itemID=9, rating=3, notTime=3),
    Row(customerID=0, itemID=0, rating=3, notTime=3),
    Row(customerID=1, itemID=1, rating=4, notTime=4),
    Row(customerID=1, itemID=2, rating=5, notTime=5),
    Row(customerID=1, itemID=3, rating=1, notTime=1),
    Row(customerID=1, itemID=6, rating=4, notTime=4),
    Row(customerID=1, itemID=7, rating=5, notTime=5),
    Row(customerID=1, itemID=8, rating=1, notTime=1),
    Row(customerID=1, itemID=0, rating=3, notTime=3),
    Row(customerID=2, itemID=1, rating=4, notTime=4),
    Row(customerID=2, itemID=2, rating=1, notTime=1),
    Row(customerID=2, itemID=3, rating=1, notTime=1),
    Row(customerID=2, itemID=4, rating=5, notTime=5),
    Row(customerID=2, itemID

# Train SAR Model

In [6]:
# Configure the SAR estimator: which columns hold the user, the item and the
# rating, plus a support threshold of 2.
# The chain is wrapped in parentheses rather than continued with backslashes.
# The original ended with a dangling "\" that only parsed because the next
# line happened to be blank.
sar = (
    SAR()
    .setUserCol("customerID")
    .setRatingCol("rating")
    .setItemCol("itemID")
    .setSupportThreshold(2)
)

# Fit on the full sample and request 3 recommendations for every user.
sarModel = sar.fit(ratings)
usersRecs = sarModel.recommendForAllUsers(3)
pp.pprint(usersRecs.collect())

[   Row(customerID=1, recommendations=[Row(itemID=0, rating=3.0), Row(itemID=3, rating=3.0), Row(itemID=1, rating=2.25)]),
    Row(customerID=3, recommendations=[Row(itemID=0, rating=3.0), Row(itemID=3, rating=3.0), Row(itemID=1, rating=2.25)]),
    Row(customerID=2, recommendations=[Row(itemID=0, rating=3.0), Row(itemID=3, rating=3.0), Row(itemID=1, rating=2.25)]),
    Row(customerID=0, recommendations=[Row(itemID=0, rating=3.0), Row(itemID=3, rating=3.0), Row(itemID=1, rating=2.25)])]


# Evaluate with Metrics

In [7]:
import pandas as pd
import pyspark
import unittest
from mmlspark.TrainTestSplit import *
from mmlspark.evaluate import *
from pyspark.ml.tuning import *
from pyspark.sql.functions import col
from pyspark.sql.types import *

In [8]:
# Attach the TrainTestSplit helpers to Spark's DataFrame class so that they
# can be called as chained DataFrame methods in the following cell.
spark_dataframe_cls = pyspark.sql.DataFrame
spark_dataframe_cls.min_rating_filter = TrainTestSplit.min_rating_filter
spark_dataframe_cls.stratified_split = TrainTestSplit.stratified_split

In [9]:
# Filter the ratings per customer, then split them into train and test sets.
# Both calls are the TrainTestSplit helpers patched onto DataFrame above.
# NOTE(review): no seed is passed here; if stratified_split samples randomly,
# the metrics in the cells below may differ between runs -- confirm.
ratings_filtered = ratings.min_rating_filter(min_rating=1, by_customer=True)
dfs_train, dfs_test = ratings_filtered.stratified_split(
    min_rating=1,
    by_customer=True,
    fixed_test_sample=False,
    ratio=0.5,
)

# Re-fit the SAR estimator on the training split only and score the test split.
sar_model_on_train = sar.fit(dfs_train)
scored_data = sar_model_on_train.transform(dfs_test)

# Ground truth and predictions share the same column layout
# (customerID, itemID, rating) so the evaluators can compare them.
rating_true = scored_data.select("customerID", "itemID", "rating")
rating_pred = (
    scored_data
    .select("customerID", "itemID", "prediction")
    .withColumnRenamed("prediction", "rating")
)

In [10]:
# Cutoff k passed to RankingEvaluation and DistributionMetrics in the cells below.
k = 5

In [None]:
# Ranking metrics at cutoff k, comparing the predicted ratings with the
# held-out true ratings.
evaluator_ranking = RankingEvaluation(k, rating_true, rating_pred)
recall = evaluator_ranking.recall_at_k()
precision = evaluator_ranking.precision_at_k()
ndcg = evaluator_ranking.ndcg_at_k()
# Stored as `map_score` rather than `map`, so that the builtin map() is not
# shadowed for every cell executed afterwards in this kernel.
map_score = evaluator_ranking.map_at_k()

# The printed labels are unchanged.
print("Recall: " + str(recall))
print("precision: " + str(precision))
print("ndcg: " + str(ndcg))
print("map: " + str(map_score))

Recall: 1.0
precision: 0.8
ndcg: 1.0
map: 1.0


In [None]:
# Distribution metrics at cutoff k over the same true/predicted ratings.
evaluator_distribution = DistributionMetrics(k, rating_true, rating_pred)

# Show only the first row of the popularity result.
popularity = evaluator_distribution.popularity_at_k()
print(popularity.head(1))

diversity = evaluator_distribution.diversity_at_k()
max_diversity = evaluator_distribution.max_diversity()

# Print each metric as "<label>: <value>".
for label, value in (("diversity", diversity), ("max_diversity", max_diversity)):
    print("{}: {}".format(label, value))

In [None]:
# Rating-accuracy metrics comparing predicted ratings with true ratings.
evaluator_rating = RatingEvaluation(rating_true, rating_pred)

# The metrics are read as attributes, not called.
# NOTE(review): confirm these are properties on RatingEvaluation; if they are
# plain methods, the prints below show bound-method reprs instead of numbers.
rsquared, exp_var, mae, rmse = (
    evaluator_rating.rsquared,
    evaluator_rating.exp_var,
    evaluator_rating.mae,
    evaluator_rating.rmse,
)

# Print each metric as "<label>: <value>".
metric_report = (
    ("rsquared", rsquared),
    ("exp_var", exp_var),
    ("mae", mae),
    ("rmse", rmse),
)
for label, value in metric_report:
    print("{}: {}".format(label, value))