# iJungle Tutorial Training Pipeline Example

*TODO: Summary of the iJungle technique* 


In [1]:
import os

import pandas as pd
from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment, ScriptRunConfig
from azureml.core.compute import  ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep, HyperDriveStep, HyperDriveStepRun
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice

import iJungle

print("iJungle version:", iJungle.__version__)

iJungle version: 0.1.73


# 1. Parameters definition

In [2]:
# Name of the compute cluster that is looked up (or created) and used as pipeline target
cluster_name = "cluster4"
# Name under which the training environment is registered in the workspace
environment_name = "ijungle-training-env"
# Datastore used for dataset registration and for the pipeline's intermediate outputs
working_datastore_name = "workspaceblobstore"
# Names under which the prepared datasets are registered
# NOTE(review): "trainining" looks like a typo, but it is the name the dataset is
# registered and looked up under; changing it means re-registering the dataset.
training_dataset_name = "ijungle-trainining-dataset"
test_dataset_name = "ijungle-test-dataset"
y_test_dataset_name = "ijungle-y-test-dataset"
# Column passed to the scripts as the row identifier (--index-feature / --id-feat)
index_feature = 'index'
# Used as the experiment name when the pipeline is submitted
pipeline_name = "ijungle-training-pipeline"
# Grid values for the HyperDrive sweeps: --subsample-size and --trees
subsample_list = [4096, 2048, 1024, 512]
trees_list = [500, 100, 20, 10]
# Forwarded to training.py as --train-expected-m
train_expected_m = 50000
# Forwarded to overhead_ds.py as --overhead-expected-m
overhead_expected_m = 50000



# 2. Preparation of cluster, environment and run configuration

In [3]:

# Connect to the workspace described by the local configuration file
ws = Workspace.from_config()

# Reuse the compute cluster when it already exists; otherwise provision it.
# Only ComputeTargetException (raised when the lookup fails) leads to creation,
# so unrelated problems (interrupts, typos, credential errors surfacing as other
# exception types) are no longer silently turned into a provisioning attempt.
try:
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    # Block until provisioning has finished before the cluster is used further
    pipeline_cluster.wait_for_completion(show_output=True)

# Creation of environment
new_env = Environment(environment_name)
packages = CondaDependencies.create(
    conda_packages=['pip'],
    pip_packages=['azureml-defaults','scikit-learn','pandas','pyarrow'])

# Add iJungle library: upload the locally built wheel matching the installed
# iJungle version and reference it as a pip dependency of the environment
whl_filename = "../dist/iJungle-"+iJungle.__version__+"-py3-none-any.whl"

whl_url = Environment.add_private_pip_wheel(workspace=ws,file_path = whl_filename, exist_ok=True)
packages.add_pip_package(whl_url)


# Add the dependencies to the environment
new_env.python.conda_dependencies = packages

# Register the environment, then read it back by name
new_env.register(workspace=ws)
registered_env = Environment.get(ws, environment_name)

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above.
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")


Found existing cluster, use it.
Run configuration created.



# 3. Data preparation and dataset registration

*TODO: description of the data*

1. Use the following data files, located in the `data` directory of this repository *TODO: KDD url to download the files*
    - kddcup.names
    - kddcup.data.gz
    - corrected.gz

In [4]:
## Move to data directory
# `__file__` is not defined inside a notebook; the previous code passed the
# quoted literal '__file__', which simply resolved against the current working
# directory. Make that explicit: go to the sibling "data" folder of the cwd.
os.chdir(os.path.join(os.getcwd(), os.pardir, 'data'))

## Generate DataFrame with kdd data(csv format)
# Column names are the text before ':' on each line of kddcup.names
names = list(pd.read_csv('kddcup.names',sep=':', header=None)[0])
df = pd.read_csv('kddcup.data.gz', header=None, names=names)
df_test = pd.read_csv('corrected.gz', header=None, names=names)

print("Shape of raw data:", df.shape)
print("Shape of test data:", df_test.shape)

# Remove entries which protocol is not Http
# .copy() detaches the filtered rows from the raw frames, so the column
# assignments and drops performed afterwards do not act on a slice
# (which would raise SettingWithCopyWarning).
df = df[df.service == 'http'].copy()
df_test = df_test[df_test.service == 'http'].copy()
print("Shape of filtered train data:", df.shape)
print("Shape of filtered test data:", df_test.shape)

# Preparation of labels
# 'normal.' is encoded as 1, every other label as -1.
# The label series keep the row index of the frame they were popped from.
# Building them without an index would renumber the rows 0..n-1, and the
# 'index' column later produced by y_test.reset_index() would then no longer
# identify the same rows as the 'index' column of the test features.
labels_train = df.pop('label')
labels_test = df_test.pop('label')
y_train = pd.Series(
    [1 if val == 'normal.' else -1 for val in labels_train],
    index=labels_train.index,
    name="y",
)
y_test = pd.Series(
    [1 if val == 'normal.' else -1 for val in labels_test],
    index=labels_test.index,
    name="y",
)
print("Shape of train labels:", y_train.shape)
print("Shape of test labels:", y_test.shape)

# Final preparation of training and testing data
# Reassign instead of mutating in place so the frames are never modified
# through a view of the raw data.
df = df.drop(columns=['service'])
df_test = df_test.drop(columns=['service'])

cat_columns = ['protocol_type', 'flag']

for col in cat_columns:
    # Encode with ONE sorted category list shared by both frames. Encoding each
    # frame on its own assigns codes by that frame's own set of values, so the
    # same value could receive different integers in train and test whenever
    # one frame lacks a value the other one has.
    shared_categories = sorted(set(df[col].dropna()) | set(df_test[col].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    df[col] = df[col].astype(shared_dtype).cat.codes
    df_test[col] = df_test[col].astype(shared_dtype).cat.codes

# Turn the row index into a regular 'index' column in each frame
df = df.reset_index()
df_test = df_test.reset_index()
df_y_test = y_test.reset_index()

print("Shape of train data:", df.shape)
print("Shape of test data:", df_test.shape)
print("Shape of y-test data:", df_y_test.shape)

# Datastore that receives the uploaded frames
datastore = Datastore.get(ws, working_datastore_name)


def _register_frame(label, frame, target_datastore, dataset_name):
    """Print a progress line, then register `frame` as a tabular dataset."""
    print("Registering " + label + " dataset ...")
    return Dataset.Tabular.register_pandas_dataframe(frame, target_datastore, dataset_name)


# Same order as before: training features, test features, test labels
train_dataset = _register_frame("training", df, datastore, training_dataset_name)
test_dataset = _register_frame("testing", df_test, datastore, test_dataset_name)
y_test_dataset = _register_frame("y-testing", df_y_test, datastore, y_test_dataset_name)


Shape of raw data: (4898431, 42)
Shape of test data: (311029, 42)
Shape of filtered train data: (623091, 42)
Shape of filtered test data: (41237, 42)
Shape of train labels: (623091,)
Shape of test labels: (41237,)
Shape of train data: (623091, 41)
Shape of test data: (41237, 41)
Shape of y-test data: (41237, 2)
Registering training dataset ...
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/bd6448b8-8c72-4b54-be45-cd4e6ab5e212/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Registering testing dataset ...
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/8d4f60e5-0127-4d71-8720-b0290a26ebce/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Registering y-testing da

# 4. Creation of training pipeline

In [5]:
# Look up the registered training dataset by its name
train_ds = ws.datasets.get(training_dataset_name)

# Intermediate data: output of the feature engineering step, uploaded to the
# working datastore under a per-run, per-output path
prep_destination = (
    ws.datastores.get(working_datastore_name),
    "invoices/{run-id}/{output-name}",
)
dataprep_output = OutputFileDatasetConfig(
    name="processed_data",
    destination=prep_destination,
).as_upload()

# Step 1, Run the data prep script
prep_arguments = [
    '--input-data', train_ds.as_named_input('input'),
    '--prepped-data', dataprep_output,
    '--index-feature', index_feature,
    '--training', 'True',
]
prep_step = PythonScriptStep(
    name="Feature engineering Step",
    source_directory="../scripts",
    script_name="feat_eng.py",
    arguments=prep_arguments,
    outputs=[dataprep_output],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=False,
)

# The step list starts with the prep step; later cells append to it
pipeline_steps = [prep_step]


In [6]:
# Next Step, run the training script

# The feature engineering output becomes the input of the following steps
dataprep_input = dataprep_output.as_input()

# Maximum node count from the cluster's scale settings; caps concurrent runs below
cluster_properties = pipeline_cluster.serialize()['properties']['properties']
node_count = int(cluster_properties['scaleSettings']['maxNodeCount'])

# Where the models written by the training runs are uploaded
model_destination = (
    ws.datastores.get(working_datastore_name),
    "invoices/{run-id}/{output-name}",
)
model_output_dir = OutputFileDatasetConfig(
    name="model_output",
    destination=model_destination,
).as_upload()

# Arguments shared by every run of the sweep
training_arguments = [
    '--training-folder', dataprep_input,
    '--max-subsample-size', max(subsample_list),
    '--model-output', model_output_dir,
    '--id-feat', index_feature,
    '--train-expected-m', train_expected_m,
]
script_config = ScriptRunConfig(
    source_directory="../scripts",
    script="training.py",
    arguments=training_arguments,
    run_config=pipeline_run_config,
)

# Full grid over the number of trees and the subsample size
training_grid = {
    '--trees': choice(trees_list),
    '--subsample-size': choice(subsample_list),
}
params = GridParameterSampling(training_grid)

# One run per grid point, at most one concurrent run per cluster node
grid_size = len(trees_list)*len(subsample_list)
hyperdrive_config = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,
    primary_metric_name='Dummy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=grid_size,
    max_concurrent_runs=node_count,
)

train_step = HyperDriveStep(
    name="iJungle Trainining Step",
    hyperdrive_config=hyperdrive_config,
    inputs=[dataprep_input],
    outputs=[model_output_dir],
    allow_reuse=False,
)

pipeline_steps.append(train_step)

In [7]:
# Next step, overhead dataset calculation

# Upload location for the dataset produced by overhead_ds.py
overhead_ds_destination = (
    ws.datastores.get(working_datastore_name),
    "invoices/{run-id}/{output-name}",
)
overhead_ds_output = OutputFileDatasetConfig(
    name="overhead_ds_output",
    destination=overhead_ds_destination,
).as_upload()

# The script reads the prepared data and writes the overhead dataset
overhead_ds_arguments = [
    '--input-data', dataprep_input,
    '--overhead-data', overhead_ds_output,
    '--overhead-expected-m', overhead_expected_m,
]
overhead_ds_step = PythonScriptStep(
    name="Overhead Dataset Step",
    source_directory="../scripts",
    script_name="overhead_ds.py",
    arguments=overhead_ds_arguments,
    inputs=[dataprep_input],
    outputs=[overhead_ds_output],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=False,
)
pipeline_steps.append(overhead_ds_step)


In [8]:
# Next step, run the overhead script

# Outputs of the training and overhead dataset steps, consumed as inputs here
model_input_dir = model_output_dir.as_input()
overhead_ds_input = overhead_ds_output.as_input()

# Upload location for the results written by overhead.py
overhead_destination = (
    ws.datastores.get(working_datastore_name),
    "invoices/{run-id}/{output-name}",
)
overhead_output = OutputFileDatasetConfig(
    name="overhead_output",
    destination=overhead_destination,
).as_upload()

# Arguments shared by every run of the sweep
overhead_arguments = [
    '--overhead-folder', overhead_ds_input,
    '--model-input', model_input_dir,
    '--overhead-output', overhead_output,
    '--id-feat', index_feature,
]
script_config = ScriptRunConfig(
    source_directory="../scripts",
    script="overhead.py",
    arguments=overhead_arguments,
    run_config=pipeline_run_config,
)

# Same grid as the training sweep: trees x subsample size
overhead_grid = {
    '--trees': choice(trees_list),
    '--subsample-size': choice(subsample_list),
}
params = GridParameterSampling(overhead_grid)

# One run per grid point, concurrency capped by node_count
overhead_grid_size = len(trees_list)*len(subsample_list)
hyperdrive_config = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,
    primary_metric_name='Dummy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=overhead_grid_size,
    max_concurrent_runs=node_count,
)

overhead_step = HyperDriveStep(
    name="iJungle Overhead Step",
    hyperdrive_config=hyperdrive_config,
    inputs=[overhead_ds_input, model_input_dir],
    outputs=[overhead_output],
    allow_reuse=False,
)

pipeline_steps.append(overhead_step)

In [9]:
# Next steps, find the representative iForest

# Results of the overhead sweep, consumed as input by best_iforest.py
overhead_input = overhead_output.as_input()

# Both grids are handed to the script as the string form of the Python lists
best_iforest_arguments = [
    '--overhead-input', overhead_input,
    '--subsample-list', str(subsample_list),
    '--trees-list', str(trees_list),
]
best_iforest_step = PythonScriptStep(
    name="Best iForest Step",
    source_directory="../scripts",
    script_name="best_iforest.py",
    arguments=best_iforest_arguments,
    inputs=[overhead_input],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=False,
)
pipeline_steps.append(best_iforest_step)

In [10]:
# Construct the pipeline
# pipeline_steps is grown with .append() by the step-definition cells, so
# re-running one of those cells leaves an outdated step with the same name in
# the list. Keep only the most recent definition for each step name; the
# position of the first occurrence is preserved, so on a clean top-to-bottom
# run the list is unchanged.
pipeline_steps = list({step.name: step for step in pipeline_steps}.values())
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = pipeline_name)
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")


Pipeline is built.
Created step Feature engineering Step [1dab3f3a][da82670e-3f7e-4be0-88f6-507debe692f6], (This step will run and generate new outputs)
Created step iJungle Trainining Step [6b78089a][3c5b501f-987f-4dea-adae-e9613855b04c], (This step will run and generate new outputs)Created step Overhead Dataset Step [03970c0c][d8d83657-f3f1-4806-97c6-f40170b8f0b0], (This step will run and generate new outputs)

Created step iJungle Overhead Step [5e5520f5][074c153a-7a82-4e83-96da-6e62b1d5cedb], (This step will run and generate new outputs)
Created step Best iForest Step [26f37c61][5c436f27-c6b8-42b9-a0bd-5b91a3ea06eb], (This step will run and generate new outputs)
Submitted PipelineRun d571febb-ba82-4190-874a-4823dd9e978d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/d571febb-ba82-4190-874a-4823dd9e978d?wsid=/subscriptions/d412dac0-d902-4cfb-b2f9-19dea115f7ff/resourcegroups/rg-dv-aidnaanomaly-corp-eus2/workspaces/wsmldvanomaly&tid=973ba820-4a58-4246-84bf-170e50b315