In [ ]:
%%configure -f
{
"conf": {
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
import joblib
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import IndexToString, StringIndexerModel
import pyspark.sql.functions as F
import pandas as pd
from io import BytesIO
import numpy as np
import h2o
import os

In [ ]:
# Notebook parameters. Every value defaults to an empty string here;
# presumably the orchestrating pipeline overrides them at run time -- confirm.
batch_id = ''  # only forwarded to the logging custom dimensions in this notebook
best_iforest_path = ''  # parquet location whose first row carries the serialized model in column 'model'
prepped_data_path = ''  # parquet location of the dataset to score; it is read for its 'scaled' vector column
results_path = ''  # base output location; the time slice folder is appended to it before writing
id_feat = ''  # comma-separated identifier column names (split into a list later)
id_feat_types = ''  # comma-separated types, zipped with id_feat to build the output schema string
time_slice_folder = ''  # folder name spliced into the model, data and results paths
model_path = ''  # not referenced anywhere else in the visible part of this notebook
extension_level = ''  # cast to int later; negative -> joblib-loaded model, otherwise raw H2O model bytes

In [ ]:
# Initiate logging
# Sends log records and trace spans to Azure Application Insights through the
# opencensus exporters, then emits one INITIALISED record carrying the
# run-time parameters as custom dimensions.
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack another AzureLogHandler and duplicate every record.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# model_path and extension_level are included so that every notebook parameter
# is recorded with the run; values are logged as received (still strings here).
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'best_iforest_path': best_iforest_path,
    'prepped_data_path': prepped_data_path,
    'results_path': results_path,
    'id_feat': id_feat,
    'id_feat_types': id_feat_types,
    'time_slice_folder': time_slice_folder,
    'model_path': model_path,
    'extension_level': extension_level,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Splice the time slice folder into each configured location. The two input
# paths get it inserted just before their last '/'-separated segment; the
# results path simply gets it appended. Empty paths are left untouched.
def _insert_time_slice(path, folder):
    """Return *path* with *folder* inserted before its last '/'-separated segment."""
    parent, _, leaf = path.rpartition("/")
    return parent + "/" + folder + "/" + leaf


if best_iforest_path != "":
    best_iforest_path = _insert_time_slice(best_iforest_path, time_slice_folder)
    logger.info(f'best_iforest_path = {best_iforest_path}')
if prepped_data_path != "":
    prepped_data_path = _insert_time_slice(prepped_data_path, time_slice_folder)
    logger.info(f'prepped_data_path = {prepped_data_path}')
if results_path != "":
    results_path = "/".join((results_path, time_slice_folder))
    logger.info(f'results_path = {results_path}')

In [ ]:
# Casting parameters
id_feat = [i for i in id_feat.split(",")]
id_feat_types = [i for i in id_feat_types.split(",")]
extension_level = int(extension_level)

In [ ]:
# Fetch the serialized model from the 'model' column of the first row of the
# model parquet dataset.
_model_rows = spark.read.parquet(best_iforest_path).head(1)
model_bytes = _model_rows[0]['model']
# A negative extension level means the bytes are deserialized with joblib right
# here; otherwise the raw bytes are kept and handed to the scoring function.
# NOTE(review): joblib.load unpickles the payload -- only use trusted model files.
clf = joblib.load(BytesIO(model_bytes)) if extension_level < 0 else model_bytes

In [ ]:
# Load the prepared dataset and log how many rows it holds.
df = spark.read.parquet(prepped_data_path)
logger.info(f"Number of records: {df.count():,}")

In [ ]:
# The feature count is the length of the 'scaled' vector in the first row.
_first_row = df.head(1)[0]
num_feats = len(_first_row['scaled'])
logger.info(f'Number of features: {num_feats}')
# Turn the vector into an array column 'f' and keep one column per element
# next to the identifier columns.
_feature_columns = [F.col("f")[i] for i in range(num_feats)]
df_unassembled = df.withColumn('f', vector_to_array("scaled")).select(id_feat + _feature_columns)
logger.info(f"Number of records of inference dataset: {df_unassembled.count():,}")

In [ ]:
def ijungle_predict(id_feat, clf, ex_level, time_slice):
    """Build the per-partition scoring function passed to ``mapInPandas``.

    Args:
        id_feat: List of identifier column names. They are moved out of the
            model input and copied to the output unchanged.
        clf: When ``ex_level`` is negative, an object exposing ``predict`` and
            ``score_samples``; otherwise the bytes of a saved H2O model.
        ex_level: Extension level; only its sign is used, to pick the branch.
        time_slice: Label embedded in the temporary model file name.

    Returns:
        A function taking an iterator of pandas DataFrames and yielding, for
        each of them, a DataFrame with the id columns plus ``predict`` and
        ``score``.
    """
    def _fun(iterator):
        saved_model = None
        if ex_level >= 0:
            # Function-scope import so this block does not depend on a
            # notebook-level import being present.
            import tempfile

            # Tasks running concurrently on the same host used to share one
            # fixed /tmp file name, so one task could truncate or delete the
            # file while another was loading it. A unique name avoids that.
            prefix = "ijungle_{}_".format(str(time_slice).replace(os.sep, "_"))
            fd, model_filename = tempfile.mkstemp(prefix=prefix)
            try:
                with os.fdopen(fd, 'wb') as model_file:
                    model_file.write(clf)
                # mkstemp creates the file owner-only; restore the usual
                # open() permissions in case the H2O server runs under a
                # different account than this worker.
                os.chmod(model_filename, 0o644)
                h2o.init()
                saved_model = h2o.load_model(model_filename)
            finally:
                # Remove the temporary file even when the load fails.
                os.remove(model_filename)
        for pdf in iterator:
            # Work on derived frames instead of mutating the incoming batch.
            pdf_out = pdf[id_feat].reset_index(drop=True)
            features = pdf.set_index(id_feat)
            if ex_level < 0:
                _predict = clf.predict(features)
                _score = clf.score_samples(features)
            else:
                # Factor of -1 to align with sklearn formalism
                hf = h2o.H2OFrame(features)
                _score = saved_model.predict(hf)
                _score = -1.0 * _score['anomaly_score'].as_data_frame().to_numpy().reshape(-1)
                _predict = np.where(_score < -0.5, -1, 1)
            pdf_out['predict'] = _predict
            pdf_out['score'] = _score
            yield pdf_out
    return _fun

In [ ]:
# Output schema for mapInPandas: "<id column> <type>" pairs followed by the
# two result columns.
dcc_str = ", ".join(f"{name} {dtype}" for name, dtype in zip(id_feat, id_feat_types)) + ", predict int, score float"
_predict_fn = ijungle_predict(id_feat, clf, extension_level, time_slice_folder)
df_results = df_unassembled.mapInPandas(_predict_fn, dcc_str)
# The fitted string indexer is loaded from two path levels above the model path;
# its labels are used to map issuer_id_indexed back to issuer_id.
_indexer_dir = "/".join(best_iforest_path.split("/")[:-2])
model = StringIndexerModel.load(f"{_indexer_dir}/_feature_engineering_indexer_issuer_id.pkl")
inverter = IndexToString(inputCol="issuer_id_indexed", outputCol="issuer_id", labels=model.labels)
df_results = inverter.transform(df_results)
# TODO: may need to add casting to ensure issuer_id is a string

In [ ]:
# Persist the scored rows as parquet at results_path; 'overwrite' mode replaces
# whatever output already exists at that location.
df_results.write.mode('overwrite').parquet(results_path)

In [ ]:
# serverless SQL config
# Values below are concatenated into the pyodbc connection string used when
# the external table is created.
import pyodbc
database = 'eiad'  # DATABASE= value of the connection string
driver= '{ODBC Driver 17 for SQL Server}'  # DRIVER= value, braces included

# User name, password and endpoint are fetched through mssparkutils rather than
# being hard-coded in the notebook.
sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
# NOTE(review): the secret name is spelled "Syanpse..." -- presumably it matches
# the stored secret; confirm before correcting the spelling.
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SyanpseServerlessSQLEndpoint")

In [ ]:
def generate_schema_string(dataframe):
    """Return the column list for a T-SQL ``CREATE EXTERNAL TABLE`` statement.

    Each field of ``dataframe.schema`` becomes ``[name] type``, joined with
    ``", "``. The type is the field's ``simpleString()`` unless it is one of
    the overrides below.

    Args:
        dataframe: Object exposing ``schema.fieldNames()`` and
            ``schema[name].dataType.simpleString()``.

    Returns:
        The comma-separated column definitions; an empty string when the
        schema has no fields.
    """
    # Spark type names that need a different spelling in T-SQL.
    sql_type_overrides = {
        'double': 'float',
        'string': 'nvarchar(MAX)',
        'timestamp': 'datetime2(7)',
        'boolean': 'bit',  # 'boolean' is not a T-SQL type
    }
    columns = []
    for name in dataframe.schema.fieldNames():
        spark_type = str(dataframe.schema[name].dataType.simpleString())
        sql_type = sql_type_overrides.get(spark_type, spark_type)
        # Inside a bracket-delimited identifier a closing bracket is escaped by doubling it.
        columns.append("[" + name.replace("]", "]]") + "] " + sql_type)
    return ", ".join(columns)

In [ ]:
# (Re)create the external table that exposes the results parquet output
# through the serverless SQL endpoint.
with tracer.span('Creating SQL table for anomaly detection results'):
    # Split once and reuse. Segment 2 is read as "<name>@<rest>" and segments
    # 3-5 feed the table name; presumably results_path looks like
    # <scheme>://<container>@<account>/<a>/<b>/<c>/... -- TODO confirm.
    path_parts = results_path.split('/')
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4] + '_' + path_parts[5]
    schema_string = generate_schema_string(df_results)
    # Everything after the third '/'-separated segment is the table location.
    location = "/".join(path_parts[3:])

    # DDL cannot be parameterized, so escape the delimiters instead: ']' inside
    # a bracketed identifier and "'" inside a string literal are doubled.
    quoted_table = "[" + table_name.replace("]", "]]") + "]"
    quoted_location = location.replace("'", "''")
    drop_table_command = f"DROP EXTERNAL TABLE {quoted_table}"
    df_sql_command = f"CREATE EXTERNAL TABLE {quoted_table} ({schema_string}) WITH (LOCATION = '{quoted_location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"

    conn = pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd)
    try:
        # The pyodbc connection context manager commits on exit but does not
        # close the connection, hence the explicit close() in the finally clause.
        with conn:
            with conn.cursor() as cursor:
                try:
                    cursor.execute(drop_table_command)
                except pyodbc.Error as drop_error:
                    # Best effort: the drop fails when the table does not exist
                    # yet. Record the reason instead of hiding it.
                    logger.info(f"DROP EXTERNAL TABLE skipped for {table_name}: {drop_error}")
                cursor.execute(df_sql_command)
    finally:
        conn.close()