In [1]:
from azureml.core import Dataset, Datastore
from azureml.core import Workspace
import os


# Load the Azure ML workspace via Workspace.from_config().
ws = Workspace.from_config()

# Default datastore of that workspace; later cells upload to it and read from it.
datastore = ws.get_default_datastore()



In [5]:
# Upload the local data folder, register the CSV as a tabular dataset and preview it.
dataset_name = "HrAnalytics-DataScientist"
description = ""
local_path = r"../data/"
dataset_file = 'aug_train.csv'
datastore_path = 'udacity-lancia'

# Push the whole local directory to the datastore under `datastore_path`.
datastore.upload(src_dir=local_path, target_path=datastore_path)

# Datastore-relative paths must use forward slashes on every platform;
# os.path.join would insert a backslash on Windows, so join with "/" explicitly.
dataset_file_remote = f"{datastore_path}/{dataset_file}"
ds = Dataset.Tabular.from_delimited_files(path=[(datastore, dataset_file_remote)])

try:
    ds = ds.register(ws,
                     name=dataset_name,
                     description=description)
except Exception as register_error:
    # The first attempt can fail when the name is already taken; retry as a new
    # version, but surface the original error so unrelated failures stay visible.
    print("creating new version of dataset")
    print(f"initial registration failed with: {register_error!r}")
    ds = ds.register(ws,
                     name=dataset_name,
                     description=description,
                     create_new_version=True)

# Materialise the dataset locally and show the first rows as the cell output.
df = ds.to_pandas_dataframe()
df.head()

Uploading an estimated of 3 files
Target already exists. Skipping upload for udacity-lancia/.amlignore
Target already exists. Skipping upload for udacity-lancia/.amlignore.amltmp
Target already exists. Skipping upload for udacity-lancia/.gitignore
Uploaded 0 files


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Pipeline for preprocessing

In [8]:
from azureml.core.runconfig import RunConfiguration
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
# Environment named 'dataprep'; it is attached to the run configuration below.
env = Environment('dataprep')

conda = CondaDependencies()
conda.add_conda_package("python=3.8")

# pip requirements, added one by one in the listed order
for _pip_requirement in (
    'azureml-core',
    'pandas',
    'numpy',
    "scikit-learn==0.22.2.post1",
):
    conda.add_pip_package(_pip_requirement)

# attach the assembled dependency set to the environment
env.python.conda_dependencies = conda


In [9]:
from azureml.core.compute import ComputeTarget
# Look up the compute target named "lancia" in the workspace.
def_compute_target = ComputeTarget(workspace=ws, name="lancia")

# Run configuration reused by every pipeline step: where to run and in which environment.
runconfig = RunConfiguration()
runconfig.target = def_compute_target
runconfig.environment = env




In [10]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep


# Intermediate pipeline output written by the preparation step, exposed as a dataset
# so the split step can consume it.
prepped_data = PipelineData(name="prepared_data").as_dataset()

# Step 1: run dataprep.py on the registered dataset (named input 'input_ds') and
# pass the output location to the script through --output_path.
dataprep_step = PythonScriptStep(
    script_name="dataprep.py",
    source_directory="./steps_scripts/prep",
    name="dataprep",
    inputs=[ds.as_named_input('input_ds')],
    outputs=[prepped_data],
    arguments=["--output_path", prepped_data],
    allow_reuse=True,
    runconfig=runconfig,
)


In [11]:
from azureml.pipeline.core import PipelineData
from azureml.data import OutputFileDatasetConfig

# Two intermediate outputs, one per split, each exposed as a dataset.
train_data = PipelineData(name="train_data").as_dataset()
test_data = PipelineData(name="test_data").as_dataset()

# Step 2: run train_test_split.py on the prepared data (parsed as delimited files)
# and hand it the two output locations via --train_path / --test_path.
split_step = PythonScriptStep(
    script_name="train_test_split.py",
    source_directory="./steps_scripts/split/",
    name="train_test_split",
    inputs=[prepped_data.parse_delimited_files().as_named_input("prepped_data")],
    outputs=[train_data, test_data],
    arguments=[
        "--train_path", train_data,
        "--test_path", test_data,
    ],
    allow_reuse=True,
    runconfig=runconfig,
)



In [29]:
from azureml.pipeline.core import PipelineData
from azureml.data import OutputFileDatasetConfig

dataset_prefix = "HRAnalytics"

# Names under which the split outputs are handed to register_dataset.py; the same
# names are used for the step inputs and for the --register_datasets argument.
# NOTE(review): "test_dastest" looks like a typo for "test_dataset". It is kept
# as-is because the script may derive registered names from it - confirm first.
_register_input_names = ["train_dataset", "test_dastest"]

# Step 3: run register_dataset.py with the train/test outputs as named inputs.
register_dastaset_step = PythonScriptStep(
    script_name="register_dataset.py",
    source_directory="./steps_scripts/dataset/",
    name="Register_Datasets",
    inputs=[
        split_output.parse_delimited_files().as_named_input(input_name)
        for split_output, input_name in zip((train_data, test_data), _register_input_names)
    ],
    arguments=[
        "--dataset_prefix", dataset_prefix,
        "--register_datasets", *_register_input_names,
    ],
    allow_reuse=True,
    runconfig=runconfig,
)





In [30]:
from azureml.pipeline.core import Pipeline
# Bundle the prep, split and register steps into a single pipeline in this workspace.
pipeline = Pipeline(
    workspace=ws,
    description="AzureMlE-preprocessing",
    steps=[dataprep_step, split_step, register_dastaset_step],
)

In [31]:
from azureml.core.experiment import Experiment

# Experiment under which the pipeline run is submitted.
experiment = Experiment(workspace=ws, name="AzureMlE-Preprocessing-Pipe")

# Submit the pipeline; the returned run object is kept for later inspection.
remote_run = experiment.submit(pipeline)

Created step dataprep [2ad6f71f][b152e88b-97f3-423d-88d0-6cab2c73928d], (This step is eligible to reuse a previous run's output)
Created step train_test_split [6e231a9d][8912289d-6bb6-43f1-9f1e-eb76442432c1], (This step is eligible to reuse a previous run's output)
Created step Register_Datasets [5f6d11e1][9cf1c8b9-c65b-4e9e-8ebc-dbc128447767], (This step will run and generate new outputs)
Submitted PipelineRun 1acc1a5f-c231-497e-b8d8-8b0e29262934
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1acc1a5f-c231-497e-b8d8-8b0e29262934?wsid=/subscriptions/ec5ba19e-6205-418f-a52d-d0943090ca16/resourcegroups/rg-wwe-ictx-dsplayground/workspaces/aml-wwe-ictx-dsplay&tid=c16e514b-893e-4a01-9a30-b8fef514a650
