In [ ]:
%%configure -f
{
"conf": {
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
import pyspark.sql.functions as F

In [ ]:
# Notebook parameters. Every value defaults to an empty string here;
# presumably the real values are injected by the calling pipeline at run
# time - TODO confirm against the pipeline definition.
batch_id = ''  # only recorded in the log custom dimensions in this notebook
iFor_data_prefix = ''  # parquet location read to obtain the candidate models
overhead_data_path = ''  # time-sliced and logged below; not read in this notebook
overhead_results_prefix = ''  # parquet location of the predictions that are scored
best_iforest_path = ''  # parquet location the selected model is written to
subsample_list = ''  # comma-separated integers, cast to a list of int below
trees_list = ''  # comma-separated integers, cast to a list of int below
id_feat = ''  # comma-separated column names used as grouping/join keys
time_slice_folder = ''  # folder inserted before the last segment of each non-empty path

In [ ]:
# Initiate logging
# Log records and traces from this notebook are exported to Azure Application
# Insights through the OpenCensus Azure exporters.
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# The connection string is fetched as the secret "AppInsightsConnectionString"
# through the "keyvault" argument of getSecretWithLS.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell in one session would otherwise stack another
# AzureLogHandler and export every record once per handler.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# The run-time parameter values are passed as 'custom_dimensions' through the
# logging `extra` argument so they travel with the INITIALISED record.
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'iFor_data_prefix': iFor_data_prefix,
    'overhead_data_path': overhead_data_path,
    'overhead_results_prefix': overhead_results_prefix,
    'best_iforest_path': best_iforest_path,
    'subsample_list': subsample_list,
    'trees_list': trees_list,
    'id_feat': id_feat,
    'time_slice_folder': time_slice_folder,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
def _insert_time_slice(path, time_slice):
    """Return *path* with *time_slice* inserted before its last "/" segment.

    ``"container/data/scores"`` with time slice ``"2024-01"`` becomes
    ``"container/data/2024-01/scores"``.  A path containing no "/" yields
    ``"/<time_slice>/<path>"``, exactly as the previous inline expression did.
    """
    parent, _sep, leaf = path.rpartition("/")
    return f"{parent}/{time_slice}/{leaf}"


# Insert the time-slice folder into every path parameter that was supplied.
# Parameters left as the empty string are not touched.
# NOTE(review): an empty time_slice_folder produces a "//" in the rewritten
# path - confirm whether that case can occur and how it should be handled.
if iFor_data_prefix != "":
    iFor_data_prefix = _insert_time_slice(iFor_data_prefix, time_slice_folder)
    logger.info(f'iFor_data_prefix = {iFor_data_prefix}')
if overhead_data_path != "":
    overhead_data_path = _insert_time_slice(overhead_data_path, time_slice_folder)
    logger.info(f'overhead_data_path = {overhead_data_path}')
if overhead_results_prefix != "":
    overhead_results_prefix = _insert_time_slice(overhead_results_prefix, time_slice_folder)
    logger.info(f'overhead_results_prefix = {overhead_results_prefix}')
if best_iforest_path != "":
    best_iforest_path = _insert_time_slice(best_iforest_path, time_slice_folder)
    logger.info(f'best_iforest_path = {best_iforest_path}')

In [ ]:
# Casting parameters
# The list-valued parameters arrive as comma-separated strings; turn them
# into Python lists (integers for the two size lists, strings for id_feat).
subsample_list = list(map(int, subsample_list.split(",")))
trees_list = list(map(int, trees_list.split(",")))
id_feat = id_feat.split(",")

In [ ]:
# Pick the best iForest configuration: the (tree_size, subsample_size,
# group_num) combination whose predictions deviate least, in summed squared
# terms, from the mean prediction of each id_feat group.
df_predict = spark.read.parquet(overhead_results_prefix)

# Mean prediction for every distinct combination of the id_feat columns.
df_avg = df_predict.groupBy(id_feat).agg(F.avg('predict').alias('avg'))
df_predict = df_predict.join(df_avg, on=id_feat)
df_predict = df_predict.withColumn("squared_residuals", F.pow(F.col('predict') - F.col("avg"), 2))

df_model = df_predict.groupBy("tree_size", "subsample_size", "group_num").agg(
    F.sum("squared_residuals").alias("sum_of_squared_residuals")
)
df_model = df_model.orderBy("sum_of_squared_residuals", ascending=True)

# first() returns None on an empty result, which allows a descriptive error
# instead of the bare IndexError that head(1)[0] would raise.
best_row = df_model.first()
if best_row is None:
    raise ValueError(
        f"No predictions found under '{overhead_results_prefix}'; cannot select the best iForest"
    )

# Read the fields by column name so the selection does not depend on the
# positional order of the aggregated columns.
best_trees = best_row["tree_size"]
best_subsample_size = best_row["subsample_size"]
best_group = best_row["group_num"]
logger.info("Best iForest: {}, {}, {}".format(best_trees, best_subsample_size, best_group))

In [ ]:
# Look up the stored model that matches the selected configuration.
df_iFor = spark.read.parquet(iFor_data_prefix)

# first() fetches a single row instead of collecting every match to the
# driver, and returns None when nothing matches.
best_model_row = (
    df_iFor.where(
        (F.col('id') == best_group)
        & (F.col('tree_size') == best_trees)
        & (F.col('subsample_size') == best_subsample_size)
    )
    .select('model')
    .first()
)
if best_model_row is None:
    raise ValueError(
        f"No model found under '{iFor_data_prefix}' for id={best_group}, "
        f"tree_size={best_trees}, subsample_size={best_subsample_size}"
    )
model_bytes = best_model_row['model']

In [ ]:
# Persist the selected model as a one-row parquet dataset (id, model),
# replacing whatever is already stored at best_iforest_path.
(
    spark.createDataFrame([("best_iforest", model_bytes)], schema=["id", "model"])
    .write.mode("overwrite")
    .parquet(best_iforest_path)
)