In [ ]:
%%configure -f
{
"conf": {
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.ml.functions import vector_to_array
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib
from io import BytesIO
import h2o
from h2o.estimators.extended_isolation_forest import H2OExtendedIsolationForestEstimator as ExtendedIsolationForest
import shutil

In [ ]:
# Run-time parameters. Each one is declared as an empty string placeholder and
# is cast to its working type further down the notebook.
# NOTE(review): presumably the real values are injected by the caller of this
# notebook - confirm against the orchestration that runs it.

# Identifier of the run; in this notebook it is only attached to log records.
batch_id = ''
# Parquet location of the prepared data; `time_slice_folder` is inserted
# before its last path segment prior to reading.
prepped_data_path = ''
# Parquet location the trained models are written to; `time_slice_folder` is
# inserted before its last path segment prior to writing.
iFor_data_prefix = ''
# Comma-separated integers: sub-sample sizes to train with.
subsample_list = ''
# Comma-separated integers: numbers of trees to train with.
trees_list = ''
# Float; multiplied by the row count when computing the number of groups.
train_size = ''
# Comma-separated column names used as the row identifier (index) columns.
id_feat = ''
# Integer seed handed to the model constructors.
seed = ''
# Folder name inserted into both data paths above.
time_slice_folder = ''
# Integer; a negative value selects scikit-learn's IsolationForest, any other
# value is passed as `extension_level` to H2O's extended isolation forest.
extension_level = ''

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Connection string for the Azure exporters, fetched through mssparkutils.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# Re-running this cell must not stack a second Azure handler on the same
# logger, otherwise every record would be exported more than once.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters: the raw (still uncast) run-time parameters are attached to
# the initialisation record as custom dimensions.
notebook_name = mssparkutils.runtime.context['notebookname']
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'prepped_data_path': prepped_data_path,
    'iFor_data_prefix': iFor_data_prefix,
    'subsample_list': subsample_list,
    'trees_list': trees_list,
    'train_size': train_size,
    'id_feat': id_feat,
    'seed': seed,
    'time_slice_folder': time_slice_folder,
    # extension_level decides which model family is trained, so log it too.
    'extension_level': extension_level,
    'notebook_name': notebook_name
} }

logger.info(f"{notebook_name}: INITIALISED", extra=run_time_parameters)

In [ ]:
def _with_time_slice(path, folder):
    """Return `path` with `folder` inserted just before its last '/' segment."""
    parent, _sep, leaf = path.rpartition("/")
    return "/".join([parent, folder, leaf])


# Both the input and the output location get the time-slice folder inserted
# before their final segment; empty (unset) paths are left untouched.
if prepped_data_path != "":
    prepped_data_path = _with_time_slice(prepped_data_path, time_slice_folder)
    logger.info(f'prepped_data_path = {prepped_data_path}')
if iFor_data_prefix != "":
    iFor_data_prefix = _with_time_slice(iFor_data_prefix, time_slice_folder)
    logger.info(f'iFor_data_prefix = {iFor_data_prefix}')

In [ ]:
# Cast parameters: every run-time parameter is a string at this point and is
# converted to the type the rest of the notebook works with.
subsample_list = list(map(int, subsample_list.split(",")))
trees_list = list(map(int, trees_list.split(",")))
train_size = float(train_size)
id_feat = id_feat.split(",")
seed = int(seed)
extension_level = int(extension_level)

In [ ]:
# Window ordered by a constant literal; it is used below only to hand out
# consecutive row numbers.
w = Window.orderBy(F.lit('A'))
# The largest requested sub-sample size determines how many groups are built.
max_subsample_size = max(subsample_list)

In [ ]:
# Load the prepared data and record its row count (m).
df = spark.read.format("parquet").load(prepped_data_path)
m = df.count()

In [ ]:
# Number of groups: the training share of the rows divided by the largest
# sub-sample size, rounded up.
train_rows = m * train_size
num_groups = int(np.ceil(train_rows / max_subsample_size))

In [ ]:
# Give every row a consecutive number (row_number starts at 1) so it can be
# joined with the group-assignment table built later.
row_id = F.row_number().over(w)
df_id = df.withColumn('_id', row_id)

In [ ]:
def ijungle_train(id_feat, seed, time_slice, ex_level):
    """Build the per-group training function passed to ``applyInPandas``.

    Args:
        id_feat: list of identifier column names; they are moved into the
            index so they are not used as model features.
        seed: random seed handed to the model constructor.
        time_slice: value embedded in the temporary H2O model directory name.
        ex_level: a negative value trains a scikit-learn ``IsolationForest``;
            any other value trains an H2O extended isolation forest with this
            ``extension_level``.

    Returns:
        A function ``_fun(key, pdf)`` that trains one model on ``pdf`` and
        returns a one-row pandas DataFrame
        ``(trees, subsample_size, group, serialized model bytes)``.
    """
    def _fun(key, pdf):
        # The grouping-key values arrive as numpy scalars (possibly floats);
        # plain ints are what the estimators and the `long` output columns
        # expect.
        trees, subsample_size, group = (int(value) for value in key)

        # Everything except the identifier columns and the bookkeeping
        # columns is a feature. The input frame itself is left unmodified.
        features = pdf.set_index(id_feat).drop(
            columns=['_group', '_tree_size', '_subsample_size'])
        # A group can hold fewer rows than the requested sub-sample size.
        sample_size = min(subsample_size, features.shape[0])

        if ex_level < 0:
            clf = IsolationForest(
                n_estimators=trees,
                max_samples=sample_size,
                random_state=seed, n_jobs=-1)
            clf.fit(features)
            # Serialize the fitted model into memory with joblib.
            buffer = BytesIO()
            joblib.dump(clf, buffer)
            model_bytes = buffer.getvalue()
        else:
            h2o.init()
            clf = ExtendedIsolationForest(
                ntrees=trees,
                sample_size=sample_size,
                seed=int(seed),
                extension_level=int(ex_level))
            clf.train(training_frame=h2o.H2OFrame(features))
            model_path = "/tmp/ijungle_{}_{}_{}_{}".format(time_slice, trees, subsample_size, group)
            try:
                # H2O can only save to disk, so write the model out and read
                # the file back as bytes.
                model_filename = h2o.save_model(model=clf, path=model_path, force=True)
                with open(model_filename, 'rb') as model_file:
                    model_bytes = model_file.read()
            finally:
                # Always remove the temporary directory, even when saving or
                # reading fails; a missing directory must not mask that error.
                shutil.rmtree(model_path, ignore_errors=True)

        return pd.DataFrame([(trees, subsample_size, group, model_bytes)])
    return _fun

In [ ]:
# Build the training input: for every (trees, subsample_size) combination the
# rows are randomly split into `num_groups` groups of `subsample_size` rows
# (remaining rows are left unassigned), and all selections are stacked into
# `df_unassembled` with the feature vector expanded into one column per feature.

# The feature count is the same for every combination, so read it once from
# the source data instead of triggering a Spark job per iteration.
first_row = df.select('scaled').head(1)
if not first_row:
    raise ValueError('The prepared data is empty; there is nothing to train on')
num_feats = len(first_row[0]['scaled'])

# Vector to individual columns to prepare for parallel training
selected_cols = id_feat + ['_tree_size', '_subsample_size', '_group'] + [F.col("f")[i] for i in range(num_feats)]

# Seeded generator so the group assignment is reproducible for a given seed
# (the row numbering produced by the window itself may still vary).
rng = np.random.RandomState(seed)
# row_number() starts at 1, so the ids joined against it must be 1..m.
row_ids = np.arange(1, m + 1)

df_unassembled_exists = False
for trees in trees_list:
    for subsample_size in subsample_list:
        # Random selection of records in groups of subsample size.
        # `num_groups` is rounded up, so the requested slots can exceed the
        # number of rows; truncate to m instead of asking numpy for a
        # negative-length array.
        group_array = np.repeat(np.arange(num_groups), subsample_size)[:m]
        # Rows that belong to no group are marked with -1.
        unassigned = np.full(m - group_array.size, -1)
        group_array = np.concatenate([group_array, unassigned])

        rng.shuffle(group_array)

        pdf_id_group = pd.DataFrame({'_id': row_ids, '_group': group_array})

        df_id_group = spark.createDataFrame(pdf_id_group)

        # Join of random selection of groups with training data. Each
        # selection is consumed only once (in the union below), so it is not
        # cached.
        df_subsamples = df_id.join(df_id_group, on='_id').where(F.col('_group') >= 0).select(id_feat + ['scaled', '_group'])
        df_subsamples = df_subsamples.withColumn("_tree_size", F.lit(trees)).withColumn("_subsample_size", F.lit(subsample_size))

        df_part = df_subsamples.withColumn('f', vector_to_array("scaled")).select(selected_cols)
        if df_unassembled_exists:
            df_unassembled = df_unassembled.union(df_part)
        else:
            df_unassembled = df_part
            df_unassembled_exists = True

In [ ]:
# Train one model per (tree size, sub-sample size, group) in parallel and
# persist the serialized models.
train_fn = ijungle_train(id_feat, seed, time_slice_folder, extension_level)
model_schema = "tree_size long, subsample_size long, id long, model binary"
grouped = df_unassembled.groupBy('_tree_size', '_subsample_size', '_group')
df_iFor = grouped.applyInPandas(train_fn, schema=model_schema)
df_iFor.write.mode('overwrite').parquet(iFor_data_prefix)