# Register Dataset

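In this notebook we load the Airlift training and scoring CSV files from the workspace blob datastore, rename their category columns, register both as datasets in the workspace, and generate a profile for each on an Azure ML compute target.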

In [None]:
from azureml.core import Workspace, Dataset, Datastore, VERSION
from azureml.core.compute import AmlCompute

In [None]:
# Display the SDK version string imported from azureml.core.
VERSION

In [None]:
# Connect to the workspace described by the saved workspace configuration.
ws = Workspace.from_config()
# Look up the datastore registered under the name 'workspaceblobstore'.
dstor = ws.datastores['workspaceblobstore']

In [None]:
# Display the workspace details.
# NOTE(review): this output presumably includes workspace identifiers (e.g. subscription /
# resource group) — consider clearing the cell output before sharing the notebook.
ws.get_details()

In [None]:
# Training data: resolve the CSV on the datastore, then let the SDK infer how to read it.
train_ref = dstor.path("airlift/data-latest.csv")
ds_train = Dataset.auto_read_files(train_ref)

# Scoring data: same approach, different file.
score_ref = dstor.path("airlift/data-to-score.csv")
ds_score = Dataset.auto_read_files(score_ref)

# Preview the first three training rows.
ds_train.head(3)

In [None]:
# Fetch each dataset's definition; the column renames in the next cell are applied to these.
ds_train_def = ds_train.get_definition()
ds_score_def = ds_score.get_definition()

In [None]:
# Renames shared by both definitions: the FNN/WLN category feature columns
# become generic CATEGORY_1/CATEGORY_2 names.
feature_renames = {
    "CATEGORY_FNN_CLICKS": "CATEGORY_1_CLICKS",
    "CATEGORY_WLN_CLICKS": "CATEGORY_2_CLICKS",
    "CATEGORY_FNN_SPEND": "CATEGORY_1_SPEND",
    "CATEGORY_WLN_SPEND": "CATEGORY_2_SPEND",
    "CATEGORY_FNN_COUNT": "CATEGORY_1_COUNT",
    "CATEGORY_WLN_COUNT": "CATEGORY_2_COUNT",
}

# Renames applied to the training definition only (the BOUGHT_CATEGORY_* columns).
bought_renames = {
    "BOUGHT_CATEGORY_FNN": "BOUGHT_CATEGORY_1",
    "BOUGHT_CATEGORY_WLN": "BOUGHT_CATEGORY_2",
}

# Training definition gets both groups (BOUGHT_* first, matching the original key order);
# the scoring definition gets only the feature renames.
ds_train_def = ds_train_def.rename_columns({**bought_renames, **feature_renames})
ds_score_def = ds_score_def.rename_columns(dict(feature_renames))

# Show the column names of the renamed training definition.
ds_train_def.head(3).columns

In [None]:
# Apply the renamed-column definitions to both datasets, with "column changes" as the
# update message.
ds_train.update_definition(ds_train_def, "column changes")
ds_score.update_definition(ds_score_def, "column changes")
# Register both datasets in the workspace under fixed names.
# NOTE(review): exist_ok=True presumably makes re-running this cell safe when the names
# are already registered — confirm against the SDK version in use.
ds_train.register(ws, name="AirliftDataset-Train", exist_ok=True)
ds_score.register(ws, name="AirliftDataset-Score", exist_ok=True)

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget

# Prefer the workspace's default CPU compute target; only provision a cluster
# when the workspace has none configured.
aml_compute = ws.get_default_compute_target("CPU")

if aml_compute is None:
    amlcompute_cluster_name = "onenode-cpu"

    # Single-node STANDARD_D2_V2 cluster.
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=1,
    )

    aml_compute = ComputeTarget.create(
        ws,
        amlcompute_cluster_name,
        provisioning_config,
    )
    # Block until provisioning finishes (up to 20 minutes), streaming progress output.
    aml_compute.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

# Display the compute target that will be used.
aml_compute

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Start from a fresh run configuration that targets the compute resolved earlier.
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute

# Docker settings: run inside a container built from the default CPU base image.
docker_settings = aml_run_config.environment.docker
docker_settings.enabled = True
docker_settings.base_image = "mcr.microsoft.com/azureml/base:0.2.1"

# Python settings: let the service build the conda environment from the
# dependencies declared below instead of relying on a user-managed one.
python_settings = aml_run_config.environment.python
python_settings.user_managed_dependencies = False

# Prepare the Docker image automatically at execution time if it is not ready yet.
aml_run_config.auto_prepare_environment = True

# Packages the run needs: conda packages for data handling, pip packages for the SDK.
conda_deps = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn'],
    pip_packages=['azureml-sdk', 'azureml-dataprep', 'azureml-train-automl'],
    pin_sdk_version=False,
)
python_settings.conda_dependencies = conda_deps

print("Run configuration created.")

In [None]:
# Profile both datasets on the compute target resolved earlier in the notebook
# (`aml_compute`) instead of looking up a cluster by the hard-coded name
# "onenode-cpu". That cluster is only created when the workspace has no default
# CPU compute target, so a by-name lookup may fail when a default target exists.
# `ct` is kept as an alias for any later cell that refers to it.
ct = aml_compute
ds_train.generate_profile(ct, workspace=ws)
ds_score.generate_profile(ct, workspace=ws)