Copyright (c) Microsoft Corporation. 
Licensed under the MIT license. 
# ML Model Building

Pre-process data and use the data to build a Spark machine learning model in this notebook using the following steps:

1. Training-test split
1. Data pre-processing (one-hot encoding, vector assembly)
1. Build machine learning model
1. Calculate model performance metrics
1. Extract model feature importances
1. Save results to data lake

## Library Imports


In [None]:
import pyspark
# Get (or create) the Spark session, configured to fetch the MMLSpark package
# that provides the `mmlspark.lightgbm` module imported below.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1")
    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
    .getOrCreate()
)
from mmlspark.lightgbm import *
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *
from pyspark.ml.classification import *
import pandas as pd
import numpy as np
# Turn off the Arrow execution setting for this session.
# NOTE(review): per the Spark docs this flag governs Arrow-based pandas <-> Spark
# conversion; a pandas frame is converted with spark.createDataFrame later in
# this notebook — confirm the reason Arrow is disabled here.
spark.conf.set('spark.sql.execution.arrow.enabled', False)

## Read In Data From Delta Lake


In [None]:
# Data lake connection settings. Fill in both values before running the
# remaining cells: they are interpolated into the abfss:// paths used to
# read the input data and to write the results.
data_lake_account_name = ""  # storage account name (host part of the abfss URL)
file_system_name = ""        # file system name (part before the '@' in the abfss URL)

In [None]:
# Load the prepared ML dataset from its Delta table on the data lake.
df = (
    spark.read
    .format("delta")
    .load(f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/transformed_data/ml_data")
)

## Train-Test Split
Split data into a 70-30 training-test split

In [None]:
# Randomly split the rows with weights 0.7 (training) and 0.3 (test),
# using a fixed seed of 123.
trainDF, testDF = df.randomSplit([0.7, 0.3], seed=123)

## ML Pre-Processing & Model Building
1. Pre-process data by encoding categorical columns and assembling them into a vector format expected for model building.
2. Build a Spark pipeline binary classifier model to predict growth using LightGBM
3. Use this model to score the test dataset to get model performance metrics

In [None]:
# Column that the model predicts (label)
target_col = 'growth'

# Identifier columns (not used as model features in this cell)
id_col_1 = 'user_id'
id_col_2 = 'year'
id_col_3 = 'month'

# Feature columns, separated into categorical and numeric groups.

# Categorical features: the "*_purchased_binary" flag columns.
# They are indexed and one-hot encoded before being assembled.
categorical_cols = [
    # brand flags
    'brand_apple_purchased_binary',
    'brand_samsung_purchased_binary',
    'brand_xiaomi_purchased_binary',
    'brand_huawei_purchased_binary',
    'brand_acer_purchased_binary',
    # sub-category flags
    'subcategory_smartphone_purchased_binary',
    'subcategory_audio_purchased_binary',
    'subcategory_clocks_purchased_binary',
    'subcategory_tablet_purchased_binary',
    'subcategory_telephone_purchased_binary',
    # product flags
    'product_id_1004856_purchased_binary',
    'product_id_1004767_purchased_binary',
    'product_id_1005115_purchased_binary',
    'product_id_4804056_purchased_binary',
    'product_id_1004833_purchased_binary',
]

# Numeric features: passed to the vector assembler as-is.
numeric_cols = [
    'sessions_per_user_per_month',
    'avg_session_duration_per_user_per_month',
    'avg_conversion_rate_per_user_per_month',
    'avg_order_value_per_user_per_month',
    'avg_cart_abandon_rate',
]

stages = []  # ordered stages of the Pipeline

# Categorical features, step 1: map each column's values to category indices.
# handleInvalid='keep' assigns invalid values an extra index instead of failing.
string_indexes = [
    StringIndexer(inputCol=c, outputCol='idx_' + c, handleInvalid='keep')
    for c in categorical_cols
]

# Categorical features, step 2: one-hot encode the indexed columns into vectors.
# OneHotEncoderEstimator takes multiple input/output columns, so one estimator
# handles every column instead of adding a separate pipeline stage per column.
# It stays wrapped in a list so `onehot_indexes` is still a list of stages, and
# is skipped when there are no categorical columns (as before, no encoder stage).
onehot_indexes = [
    OneHotEncoderEstimator(
        inputCols=['idx_' + c for c in categorical_cols],
        outputCols=['ohe_' + c for c in categorical_cols],
    )
] if categorical_cols else []
stages += string_indexes + onehot_indexes

# Assemble the encoded categorical columns followed by the numeric columns
# into the single 'features' vector column consumed by the classifier.
assembler_inputs = ['ohe_' + c for c in categorical_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='features', handleInvalid='keep')
stages += [assembler]

# Index the target column into the 'label' column.
# NOTE(review): handleInvalid='keep' gives invalid label values an extra index
# rather than raising; confirm that is intended for a binary target.
label_string_idx = StringIndexer(inputCol=target_col, outputCol='label', handleInvalid='keep')
stages += [label_string_idx]

# Seed value kept for reproducibility.
# NOTE(review): this value is not passed to the classifier or pipeline in this
# cell — confirm whether it should be wired into the model.
random_seed_val = 12345

# LightGBM classifier, appended as the final pipeline stage
lgbm = LightGBMClassifier(numLeaves=50, numIterations=100, learningRate=0.1)
stages.append(lgbm)

# Fit the whole pipeline on the training split, then score the test split
lgbmPipeline = Pipeline(stages=stages)
lgbmPipelineModel = lgbmPipeline.fit(trainDF)
lgbmDF = lgbmPipelineModel.transform(testDF)

## Model Performance Metrics
Calculate classification model metrics using the test dataset

In [None]:
# Evaluators used to score the test-set predictions in lgbmDF
mce = MulticlassClassificationEvaluator()
bce = BinaryClassificationEvaluator()

# Evaluate the multiclass metrics one after another, in this order
# (the evaluator's metric name is re-set before each evaluation).
accuracy, precision, recall, f1 = [
    mce.setMetricName(metric_name).evaluate(lgbmDF)
    for metric_name in ('accuracy', 'weightedPrecision', 'weightedRecall', 'f1')
]
auc = bce.setMetricName('areaUnderROC').evaluate(lgbmDF)

# One (metric label, value formatted to two decimals as text) row per metric
metric_rows = [
    ('Accuracy', f'{accuracy:.2f}'),
    ('Precision', f'{precision:.2f}'),
    ('Recall', f'{recall:.2f}'),
    ('F1 Score', f'{f1:.2f}'),
    ('AUC', f'{auc:.2f}'),
]

# model metrics df
model_metrics = spark.createDataFrame(metric_rows, ['Metric', 'Value'])

In [None]:
# Show the model metrics table as this cell's output
display(model_metrics)

## Feature Importances
Use the model feature importances to determine the top revenue growth factors and their relative importances

In [None]:
# Custom function to extract feature names and importance - partly borrowed from https://gist.github.com/timlrx/1d5fdb0a43adbbe32a9336ba5c85b1b2#file-featureimportanceselector-py
def ExtractFeatureImp(featureImp, df, featuresCol):
    """Pair each assembled feature with its importance score.

    Reads the attribute metadata stored under ``'ml_attr' -> 'attrs'`` on the
    ``featuresCol`` field of ``df.schema``, flattens every attribute group into
    one table, and looks up each feature's score by its ``idx``.

    Args:
        featureImp: Sequence of importance scores, indexed by feature ``idx``.
        df: Object whose ``schema[featuresCol].metadata`` holds the attribute
            metadata (here, the scored Spark DataFrame).
        featuresCol: Name of the assembled feature vector column.

    Returns:
        pandas DataFrame of the attribute records plus a ``score`` column,
        sorted by ``score`` in descending order.
    """
    attr_groups = df.schema[featuresCol].metadata['ml_attr']['attrs']
    # Flatten the per-group attribute lists into a single list of records
    attr_records = [attr for group_name in attr_groups for attr in attr_groups[group_name]]
    feature_table = pd.DataFrame(attr_records)
    feature_table['score'] = feature_table['idx'].apply(lambda position: featureImp[position])
    return feature_table.sort_values('score', ascending=False)
  
# Match the importances of the fitted pipeline's last stage (the LightGBM model)
# with the feature names recorded on the 'features' column of the scored data
varlist = ExtractFeatureImp(
    lgbmPipelineModel.stages[-1].getFeatureImportances(),
    lgbmDF,
    'features',
)

# important features df: converted to a Spark DataFrame, without the 'idx' column
important_features = spark.createDataFrame(varlist).drop('idx')

In [None]:
# Show the feature importance table as this cell's output
display(important_features)

## Save Results to Data Lake
Persist the model results to Delta tables on the Data Lake

In [None]:
# Write the feature importances as a Delta table, replacing any existing data and schema
(
    important_features.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/important_features/important_features")
)

# Write the model metrics as a Delta table, replacing any existing data and schema
(
    model_metrics.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/model_metrics/model_metrics")
)