In [None]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

## Model training
Train a prediction model that indicates whether a user will click on an article or not, compute performance metrics on dev data, and store the model for later use on recommendation data.

1. Define variables
2. Load the datasets
3. Create feature engineering transformer
4. Train model
5. Test model
6. Store model and transformer



In [None]:
import pyspark
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import FeatureHasher, Word2Vec, Tokenizer,  OneHotEncoderEstimator, StopWordsRemover, VectorAssembler, StringIndexer, HashingTF, IDF

from mmlspark.train import ComputeModelStatistics
from mmlspark.lightgbm import LightGBMClassifier


from pyspark.ml.evaluation import  MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import types
import re

## Define variables (input folder, datasets, ....)


In [None]:
# Global configuration shared by the cells below

# Tables read with spark.read.table for training and for evaluation
dataset_train = "default.activitytrain"
dataset_dev = "default.activitydev"

# Column names; col_target is the label column passed to the model and evaluators
col_user, col_item, col_target = "User_ID", "Article_ID", "Clicked"

# Names under which the fitted feature processor and the trained model are saved
# NOTE(review): "proprecssor" looks like a typo, but the value is kept as-is
# because anything that loads the saved processor must use the same name.
feature_processor_name = "feature_proprecssor.mml"
model_name = "news_recommendation_model.mml"

## Load Train and Dev Datasets

The training dataset is used to train the model, and the dev dataset is used to assess the performance of the model on unseen data.

In [None]:
# Load the train and dev tables as Spark DataFrames (train first, then dev)
df_train, df_dev = (
    spark.read.table(table_name)
    for table_name in (dataset_train, dataset_dev)
)

## Feature Engineering

Here, the code applies different transformations depending on the column and type of data it contains:

- _Category/SubCategory_: convert text to an integer index. For instance, Word A becomes 0 and Word B becomes 1

- _Title/Abstract_:  first, sentences are tokenised into a list of words. For instance, "first day of week" becomes \[first, day, of, week]. Then stop words (a, the, of, etc) are removed. Finally, the remaining words are hashed into a fixed-size term-frequency vector and re-weighted with the TF/IDF transformation, so that more distinctive words carry more weight

- _History_: each user has a different history of viewed articles. Here again the history is hashed into a fixed-size term-frequency vector and re-weighted with TF/IDF, so that articles appearing in fewer histories carry more weight.

In [None]:
# Feature Engineering
# Builds an (unfitted) Pipeline that turns the raw columns into a single
# "features" vector column consumed by the classifier.

# Convert category strings to numeric indices; handleInvalid='keep' assigns
# labels unseen at fit time to an extra index instead of raising an error
strindexer1 = StringIndexer(inputCol="Category", outputCol="Category_Idx", handleInvalid='keep')
strindexer2 = StringIndexer(inputCol="SubCategory", outputCol="SubCategory_Idx", handleInvalid='keep')

# Title: tokenise -> remove stop words -> hashed term frequencies -> IDF weighting
tokenizer_title = Tokenizer(inputCol="Title", outputCol="Title_tokens")
StopWordsRemover_title = StopWordsRemover(inputCol=tokenizer_title.getOutputCol(), outputCol="title_filtered")
hashingTF_title = HashingTF(inputCol=StopWordsRemover_title.getOutputCol(), outputCol="hashed_title", numFeatures=10)
idf_title = IDF(inputCol=hashingTF_title.getOutputCol(), outputCol="features_title")

# Abstract: same chain as the title, applied to the abstract text.
# The stop-word remover must read the abstract tokens; reading the title
# tokens here would make "features_abstract" a duplicate of the title features.
tokenizer_abstract = Tokenizer(inputCol="Abstract", outputCol="Abstract_tokens")
StopWordsRemover_abstract = StopWordsRemover(inputCol=tokenizer_abstract.getOutputCol(), outputCol="abstract_filtered")
hashingTF_abstract = HashingTF(inputCol=StopWordsRemover_abstract.getOutputCol(), outputCol="hashed_abstract", numFeatures=10)
idf_abstract = IDF(inputCol=hashingTF_abstract.getOutputCol(), outputCol="features_abstract")

# Apply TF-IDF on the viewed articles
# (assumes 'History' is already an array column, as HashingTF requires -- TODO confirm)
hashingTF_history = HashingTF(inputCol='History', outputCol="hashed_history", numFeatures=100)
idf_history = IDF(inputCol=hashingTF_history.getOutputCol(), outputCol="feature_history")

# Assemble the output of each transformer into a single feature vector
assembler = VectorAssembler(
    inputCols=[
        strindexer1.getOutputCol(),
        strindexer2.getOutputCol(),
        idf_title.getOutputCol(),
        idf_abstract.getOutputCol(),
        idf_history.getOutputCol(),
        ],
        outputCol="features"
        )

# Create the whole feature processor; stages run in the order listed, so each
# stage must come after the stage that produces its input column
feature_processor = Pipeline(
    stages=[
        strindexer1,
        strindexer2,
        tokenizer_title,
        StopWordsRemover_title,
        hashingTF_title,
        idf_title,
        tokenizer_abstract,
        StopWordsRemover_abstract,
        hashingTF_abstract,
        idf_abstract,
        hashingTF_history,
        idf_history,
        assembler
        ])

In [None]:
# Fit the feature pipeline on the training data only, then apply the same
# fitted stages to both the train and dev DataFrames
fitted_processor = feature_processor.fit(df_train)
df_train_feature, df_dev_feature = (
    fitted_processor.transform(split) for split in (df_train, df_dev)
)

## Define and Train the model


In [None]:
# Instantiate the (unfitted) LightGBM classifier

# Hyperparameters
NUM_LEAVES = 32
NUM_ITERATIONS = 50
LEARNING_RATE = 0.1
FEATURE_FRACTION = 0.8
EARLY_STOPPING_ROUND = 10

lgbm = LightGBMClassifier(
    # Input / output columns
    featuresCol="features",
    labelCol=col_target,
    probabilityCol="probability",
    # Learning task: binary classification with gradient-boosted trees
    objective="binary",
    boostingType="gbdt",
    isUnbalance=True,
    boostFromAverage=True,
    # Tunable hyperparameters defined above
    numIterations=NUM_ITERATIONS,
    numLeaves=NUM_LEAVES,
    learningRate=LEARNING_RATE,
    featureFraction=FEATURE_FRACTION,
    # NOTE(review): no validation indicator column is configured here;
    # confirm whether early stopping has any effect without one.
    earlyStoppingRound=EARLY_STOPPING_ROUND,
    # Fixed seed for bagging
    baggingSeed=42,
)

In [None]:
# Train the LightGBM classifier on the feature-engineered training data
model = lgbm.fit(df_train_feature)

## Evaluate the model on dev dataset

The metrics used to assess model performance are weighted Precision, weighted Recall, F1, and AUC (area under the ROC curve).

In [None]:
def evaluate_model(model,df_test):
    """Score ``df_test`` with ``model`` and compute classification metrics.

    Args:
        model: Fitted model exposing ``transform``. Its output must contain a
            ``prediction`` column and a ``probability`` column (the name
            configured on the classifier in this notebook).
        df_test: Spark DataFrame holding the model's feature column and the
            label column named by the global ``col_target``.

    Returns:
        Tuple ``(f1, weightedPrecision, weightedRecall, auc)`` where the first
        three come from ``MulticlassClassificationEvaluator`` and ``auc`` is
        the area under the ROC curve from ``BinaryClassificationEvaluator``.
    """
    predictions = model.transform(df_test)

    # Drop every row that has a null/NaN in any column (not only in the prediction)
    predictions = predictions.dropna()

    # Ensure the predicted class is a rounded double for the multiclass evaluator
    predictions = predictions.withColumn('prediction', F.round('prediction').cast(types.DoubleType()))
    evaluatorMulti = MulticlassClassificationEvaluator(labelCol=col_target, predictionCol="prediction")

    # AUC needs a continuous score. Ranking on the hard 0/1 prediction collapses
    # the ROC curve to a single operating point, so use the class probabilities.
    evaluator = BinaryClassificationEvaluator(
        labelCol=col_target,
        rawPredictionCol="probability",
        metricName="areaUnderROC",
    )

    f1 = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "f1"})
    weightedPrecision = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "weightedPrecision"})
    weightedRecall = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "weightedRecall"})

    auc = evaluator.evaluate(predictions)
    return f1, weightedPrecision, weightedRecall, auc

In [None]:
# Evaluate on the dev set and print AUC, F1, weighted precision and weighted recall
f1, weightedPrecision, weightedRecall, auc = evaluate_model(model,df_dev_feature)
print('DEV AUC:', auc)
print('DEV F1:', f1)
print('DEV Precision:', weightedPrecision, 'DEV Recall:',weightedRecall)


## Store fitted feature processor and model


In [None]:
# Persist the fitted feature processor and the trained model, overwriting
# anything previously saved under the same name
for artifact, target_name in (
    (fitted_processor, feature_processor_name),
    (model, model_name),
):
    artifact.write().overwrite().save(target_name)
