In [None]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

## Recommendations

Load the test dataset and apply the model to it. The resulting dataset contains two additional columns: one indicating whether the user is predicted to click on an article, and one with the corresponding probability.

1. Define variables
2. Load the datasets, feature processor and model
3. Apply model to dataset
4. Clean up the results and store the recommendations
5. Sample queries



In [None]:
from pyspark.ml import PipelineModel, Pipeline
from pyspark.sql import functions as F
from pyspark.sql import types as T
from mmlspark.lightgbm import LightGBMClassifier, LightGBMClassificationModel

## Define variables 


In [None]:
# General notebook configuration, grouped by purpose.

# Input: table holding the test activity that will be scored
dataset_test = 'default.activitytest'

# Saved artifacts: trained model and fitted feature processor
model_name = 'news_recommendation_model.mml'
# NOTE(review): the spelling 'proprecssor' looks like a typo, but the value must
# match the name the artifact was saved under — confirm before changing it.
feature_processor_name = 'feature_proprecssor.mml'

# Column names for the user and item identifiers
# NOTE(review): none of the three names below is referenced by the cells visible
# in this notebook, which hardcode their own column and table names — confirm
# whether they are still needed.
col_user = 'User_ID'
col_item = 'Article_ID'

# Name for the recommendation dataset
name_dataset_recommendation = 'default.Recommendations'


## Load dataset, feature processor and model


In [None]:
# Load the test-set table configured in `dataset_test` as a DataFrame
df_test = spark.table(dataset_test)

In [None]:
# Restore the fitted feature-processing pipeline from `feature_processor_name`
feature_processor = PipelineModel.load(feature_processor_name)

# Restore the trained LightGBM classification model from `model_name`
model = LightGBMClassificationModel.load(model_name)

## Apply feature processor and model to dataset


In [None]:
# Step 1: run the test data through the feature processor
df_feature = feature_processor.transform(dataset=df_test)

# Step 2: score the processed features with the loaded model
df_recommendations = model.transform(dataset=df_feature)

## Clean up results

For the probabilities, the model returns a column in which each value holds two entries: the probability of not clicking and the probability of clicking on an article. Here, the probability of clicking (the second entry) is extracted and stored in a new column.

In [None]:
# Post-processing: keep the article metadata, the predicted label and the
# click probability, then persist the result as a table.

def _click_probability(probability):
    """Return the second entry of the model's probability output as a float."""
    return float(probability[1])


udf_prob = F.udf(_click_probability, T.FloatType())

# Columns carried over unchanged from the scored DataFrame
passthrough_cols = ['User_ID', 'News_ID', 'Category', 'Subcategory', 'Title', 'Abstract']

df_output = df_recommendations.select(
    *passthrough_cols,
    F.col('prediction').alias('ClickPrediction'),
    udf_prob(F.col('probability')).alias('ClickProbability'),
)

# Overwrite any existing table so the cell can be re-run safely
(
    df_output.write
    .mode('overwrite')
    .saveAsTable('default.userRecommendations')
)

In [None]:
# Collect the training-set activity of every user that also appears in the test
# set and store it as `default.userAllActivity`.
#
# `Time` is parsed with a 12-hour pattern plus an AM/PM marker. The marker is
# written as a single `a`: Spark 3's datetime patterns reject `aa` (the am-pm
# letter count must be 1), while a single `a` is accepted by both Spark 2.4 and
# Spark 3.x.
df_all_act = spark.sql('''
select User_id, News_id, Category, SubCategory, Title, Abstract, to_timestamp(Time, "MM/dd/yyyy hh:mm:ss a") as ActivityTime from default.ActivityTrain 
where User_id in (select distinct User_id from default.ActivityTest)
''')
# Overwrite any existing table so the cell can be re-run safely
df_all_act.write.mode('overwrite').saveAsTable('default.userAllActivity')

In [None]:
# Combine each user's past activity and their recommendations into one table,
# tagged by `ActType`:
#   * 'History' rows come from default.userAllActivity and carry 0 for both
#     ClickPrediction and ClickProbability.
#   * 'Recommendations' rows come from default.userRecommendations, restricted
#     to users that have history, and carry a far-future sentinel ActivityTime.
#
# The sentinel is '9999-12-31 23:59:59'; the previous value used hour 24, which
# is not a valid time of day.
# NOTE(review): the History branch supplies a timestamp while the
# Recommendations branch supplies a string literal, so the resulting column type
# depends on Spark's UNION type coercion — confirm it matches what downstream
# consumers expect.
df_all_act = spark.sql('''(select 'History' as ActType, User_id, News_id, Category, SubCategory, Title, 
Abstract, 0 as ClickPrediction, 0 as ClickProbability, ActivityTime from default.userAllActivity)
UNION ALL
(select 'Recommendations' as ActType, User_id, News_id, Category, SubCategory, Title, Abstract, 
ClickPrediction,ClickProbability, '9999-12-31 23:59:59' as ActivityTime from default.userRecommendations
where user_id in (select distinct User_id from default.userAllActivity))
''')

# Overwrite any existing table so the cell can be re-run safely
df_all_act.write.mode('overwrite').saveAsTable('default.user_History_Recommendations')

## Sample Queries
1. Count the total number of rows (history and recommendations) in the combined table
2. Count the unique users in the combined table
3. History and recommendations for a specific user and category

In [None]:
%%sql
-- Total number of rows in the combined history/recommendations table
SELECT COUNT(*)
FROM default.user_History_Recommendations

In [None]:
%%sql
-- Number of distinct users in the combined history/recommendations table
SELECT COUNT(DISTINCT User_id)
FROM default.user_History_Recommendations

In [None]:
%%sql
-- Rows for one user and one category, grouped by activity type and ordered by
-- descending click probability within each type
SELECT *
FROM default.user_History_Recommendations
WHERE User_ID = '66428'
  AND Category = 'news'
ORDER BY ActType, ClickProbability DESC