In [1]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    # Start with the default credential chain and request a management-plane
    # token right away to check that the credential actually works.
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # DefaultAzureCredential did not work; fall back to an interactive
    # browser login instead.
    credential = InteractiveBrowserCredential()

In [2]:
# Build the workspace client from a config file (the cell output reports which
# file was picked up), authenticating with the credential selected above.
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [3]:
import os 
# Folder that holds the component scripts: the `%%writefile $script_folder/...`
# cells write into it and the component YAML files reference it as `code: ./src`.
script_folder = "src"
os.makedirs(script_folder, exist_ok=True)  # exist_ok: no error if the folder is already there
print(script_folder, "source folder created")


src source folder created


In [63]:
%%writefile $script_folder/prep-data.py
import argparse
import pandas as pd
import numpy as np
from pathlib import Path
import glob


def main(args):
    """Clean the raw CSV parts and write them out as one processed CSV file.

    Args:
        args: Parsed command-line namespace. ``input_data`` is the folder that
            is searched for ``part*.csv`` files and ``output_data`` is the
            folder that receives ``energy_processed_data.csv``.
    """
    df = getData(args.input_data)

    # Each helper mutates ``df`` in place. Sparse columns are removed before
    # incomplete rows; dropping rows first would also discard rows whose only
    # gaps are in columns that are about to be removed anyway.
    remove_na_columns(df)
    remove_na_rows(df)
    remove_id(df)

    # Create the destination folder if it is missing so that the script also
    # works when it is run against a path that does not exist yet.
    output_dir = Path(args.output_data)
    output_dir.mkdir(parents=True, exist_ok=True)

    # to_csv returns None when it is given a path, so the result is not kept.
    df.to_csv(output_dir / "energy_processed_data.csv", index=False)


def remove_na_columns(df):
    """Drop, in place, every column whose non-missing count is below 30% of the rows."""
    min_non_null = 0.3 * len(df)
    df.dropna(axis=1, thresh=min_non_null, inplace=True)

def remove_na_rows(df):
    """Drop, in place, every row that contains at least one missing value."""
    df.dropna(axis="index", how="any", inplace=True)

def remove_id(df):
    """Drop the ``trainrow_id`` and ``segment`` columns from ``df`` in place."""
    unwanted = ['trainrow_id', 'segment']
    df.drop(labels=unwanted, axis=1, inplace=True)

def getData(path):
    """Load every ``part*.csv`` file found in ``path`` into a single DataFrame.

    Args:
        path: Folder (as a string) that contains the CSV part files.

    Returns:
        pandas.DataFrame with the rows of all matching files, concatenated in
        sorted file-name order.

    Raises:
        ValueError: If ``path`` contains no file matching ``part*.csv``.
    """
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the result reproducible from run to run.
    all_files = sorted(glob.glob(path + "/part*.csv"))
    if not all_files:
        # Without this check pd.concat fails with the much less helpful
        # "No objects to concatenate".
        raise ValueError(f"No files matching 'part*.csv' found in {path!r}")
    return pd.concat((pd.read_csv(f) for f in all_files), sort=False)

def parse_args():
    """Parse the command-line arguments of the data-preparation script.

    Returns:
        argparse.Namespace with the string attributes ``input_data`` and
        ``output_data``.

    Both options are required: leaving one out would otherwise produce ``None``
    and only fail later, when the value is used to build a file path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_data",
        dest="input_data",
        type=str,
        required=True,
        help="folder containing the part*.csv input files",
    )
    parser.add_argument(
        "--output_data",
        dest="output_data",
        type=str,
        required=True,
        help="folder that receives energy_processed_data.csv",
    )
    return parser.parse_args()

if __name__ == "__main__":
    # Script entry point: parse the command line and run the preparation step.
    main(parse_args())


Overwriting src/prep-data.py


In [64]:
%%writefile $script_folder/train-model.py
# import libraries
import mlflow
import glob
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

def main(args):
    """Run the training workflow: load the data, split it, fit and evaluate.

    Args:
        args: Parsed command-line namespace; ``training_data``, ``alpha`` and
            ``l1_ratio`` are read here.
    """
    # Switch on MLflow autologging before the model is fitted.
    mlflow.autolog()

    dataset = get_data(args.training_data)
    x_train, x_test, y_train, y_test = split_data(dataset)
    fitted_model = train_model(args.alpha, args.l1_ratio, x_train, y_train)
    eval_model(fitted_model, x_test, y_test)

def get_data(path):
    """Load every ``*.csv`` file found in ``path`` into a single DataFrame.

    Args:
        path: Folder (as a string) that contains the CSV files.

    Returns:
        pandas.DataFrame with the rows of all matching files, concatenated in
        sorted file-name order.

    Raises:
        ValueError: If ``path`` contains no ``*.csv`` file.
    """
    # glob returns matches in arbitrary order; sorting keeps the row order, and
    # therefore the seeded train/test split made from it, reproducible.
    all_files = sorted(glob.glob(path + "/*.csv"))
    if not all_files:
        # Without this check pd.concat fails with the much less helpful
        # "No objects to concatenate".
        raise ValueError(f"No files matching '*.csv' found in {path!r}")
    return pd.concat((pd.read_csv(f) for f in all_files), sort=False)
    

def split_data(df):
    """Split ``df`` into train and test features/target.

    The ``traintarget`` column is the target; all remaining columns are the
    features. 30% of the rows go to the test set, using ``random_state=0``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    print("splitting data")
    features = df.drop("traintarget", axis=1)
    target = df["traintarget"]
    parts = train_test_split(features, target, test_size=0.3, random_state=0)
    return tuple(parts)

def train_model(alpha, l1_ratio, x_train, y_train, model_output=None):
    """Fit an ElasticNet regressor and save it in MLflow model format.

    Args:
        alpha: Value forwarded to ``ElasticNet(alpha=...)``.
        l1_ratio: Value forwarded to ``ElasticNet(l1_ratio=...)``.
        x_train: Training features.
        y_train: Training target.
        model_output: Path handed to ``mlflow.sklearn.save_model``. When it is
            omitted, the module-level ``args.model_output`` (bound in the
            ``__main__`` guard) is used, which keeps existing four-argument
            calls working.

    Returns:
        The fitted model.
    """
    print("training data")
    # Honour the hyperparameters supplied by the caller. They used to be
    # hard-coded to 0.1 / 0.5, so --alpha and --l1_ratio had no effect.
    # NOTE(review): the defaults in train-model.yml (alpha 0.1, l1_ratio 1)
    # therefore now reach the estimator - confirm those are the intended values.
    elastic_net = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model = elastic_net.fit(x_train, y_train)

    if model_output is None:
        # Backward-compatible fallback to the global set by the script entry point.
        model_output = args.model_output
    mlflow.sklearn.save_model(model, model_output)
    return model

def eval_model(model, x_test, y_test):
    """Predict on the test features and print the mean absolute error."""
    print("evaluating data")
    predictions = model.predict(x_test)
    mae = mean_absolute_error(y_test, predictions)
    print("Mean Absolute Error :", mae)

def parse_args():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace with ``training_data`` (str), ``alpha`` (float,
        default 0.01), ``l1_ratio`` (float, default 0.5) and ``model_output``
        (str).

    ``--training_data`` and ``--model_output`` are required: leaving one out
    would otherwise produce ``None`` and only fail later, when the value is
    used as a path.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--training_data",
        dest="training_data",
        type=str,
        required=True,
        help="folder containing the training CSV files",
    )
    parser.add_argument("--alpha", dest="alpha", type=float, default=0.01)
    parser.add_argument("--l1_ratio", dest="l1_ratio", type=float, default=0.5)
    parser.add_argument(
        "--model_output",
        dest="model_output",
        type=str,
        required=True,
        help="path where the trained model is saved",
    )

    return parser.parse_args()
    
if __name__ == "__main__":

    # Script entry point. `args` has to stay a module-level name here:
    # train_model() looks up `args.model_output` in the global scope when it
    # saves the model.
    args = parse_args()
    main(args)




    


Overwriting src/train-model.py


In [65]:
%%writefile prep-data.yml
# Command component that runs prep-data.py on a folder of raw CSV files.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: Prepare training data
version: 1
type: command
inputs:
  # Folder passed to the script as --input_data (searched for part*.csv files).
  input_data: 
    type: uri_folder
outputs:
  # Folder passed to the script as --output_data (receives energy_processed_data.csv).
  output_data:
    type: uri_folder
# Folder that contains prep-data.py.
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# Folded scalar (>-): the lines below are joined into one command line, so no
# comments may be placed inside it.
command: >-
  python prep-data.py 
  --input_data ${{inputs.input_data}}
  --output_data ${{outputs.output_data}}


Overwriting prep-data.yml


In [76]:
%%writefile train-model.yml
# Command component that runs train-model.py on a folder of prepared CSV files.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: Train a ElasticNet Regressor
version: 1
type: command
inputs:
  # Folder passed to the script as --training_data (all *.csv files in it are read).
  training_data: 
    type: uri_folder
  # Passed to the script as --alpha. NOTE(review): the argparse default in
  # train-model.py is 0.01; the command below always supplies this value instead.
  alpha:
    type: number
    default: 0.1
  # Passed to the script as --l1_ratio. NOTE(review): the argparse default in
  # train-model.py is 0.5, and in scikit-learn's ElasticNet an l1_ratio of 1
  # is a pure L1 penalty - confirm that 1 is the intended default.
  l1_ratio:
    type: number
    default: 1
outputs:
  # Path passed to the script as --model_output.
  model_output:
    type: mlflow_model
# Folder that contains train-model.py.
code: ./src
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
# Folded scalar (>-): the lines below are joined into one command line, so no
# comments may be placed inside it.
command: >-
  python train-model.py 
  --training_data ${{inputs.training_data}} 
  --alpha ${{inputs.alpha}}
  --l1_ratio ${{inputs.l1_ratio}}
  --model_output ${{outputs.model_output}} 

Overwriting train-model.yml


In [77]:
from azure.ai.ml import load_component

# Prefix prepended to the component YAML file names; it is empty, so the
# sources passed to load_component are "./prep-data.yml" and "./train-model.yml".
parent_dir = ""

# Load the two command components from their YAML specifications.
prep_data = load_component(source=parent_dir + "./prep-data.yml")
train_elastic_net = load_component(source = parent_dir + "./train-model.yml")

In [78]:
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.dsl import pipeline

# URI of the datastore that holds the pipeline's input data.
path = "azureml://subscriptions/bed94d1c-96c0-4620-93e0-311ec364075a/resourcegroups/energy_prediction/workspaces/energ_prediction_ml/datastores/energy_datas_hover"


# Two-step pipeline: prep_data cleans the input folder and train_elastic_net
# trains on its output. Both step outputs are exposed as pipeline outputs.
# The local names inside the function are left unchanged because they match
# the job names shown in the printed pipeline definition.
@pipeline
def energy_prediction(pipeline_job_input):
    process_data = prep_data(input_data=pipeline_job_input)
    train_model = train_elastic_net(
        training_data=process_data.outputs.output_data,
    )

    return {
        "pipeline_job_transformed_data": process_data.outputs.output_data,
        "pipeline_job_trained_model": train_model.outputs.model_output,
    }


# Instantiate the pipeline on the folder of transformed energy data inside the
# datastore referenced by `path`.
pipeline_job = energy_prediction(
    Input(
        type=AssetTypes.URI_FOLDER,
        path=path + "/paths/transformeddata/transformed_energy_data",
    )
)

In [69]:
# Print the pipeline job definition (rendered as YAML) for inspection.
print(pipeline_job)

display_name: energy_prediction
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_folder
    path: azureml://subscriptions/bed94d1c-96c0-4620-93e0-311ec364075a/resourcegroups/energy_prediction/workspaces/energ_prediction_ml/datastores/energy_datas_hover/paths/transformeddata/transformed_energy_data
outputs:
  pipeline_job_transformed_data:
    type: uri_folder
  pipeline_job_trained_model:
    type: mlflow_model
jobs:
  process_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    outputs:
      output_data: ${{parent.outputs.pipeline_job_transformed_data}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: prep_data
      version: '1'
      display_name: Prepare training data
      type: command
      inputs:
        input_data:
          type: uri_folder
      outputs:
        output_data:
          type: uri_folder
      command: python prep-data.py  --inp

In [79]:
# Set the output mode of both pipeline-level outputs to "upload".
pipeline_job.outputs.pipeline_job_transformed_data.mode = "upload"
pipeline_job.outputs.pipeline_job_trained_model.mode = "upload"
# Pipeline-wide default compute target and datastore.
pipeline_job.settings.default_compute = "energycomputetrain"
pipeline_job.settings.default_datastore = "workspaceblobstore"
# Print the definition again so the changed output modes can be checked.
print(pipeline_job)

display_name: energy_prediction
type: pipeline
inputs:
  pipeline_job_input:
    type: uri_folder
    path: azureml://subscriptions/bed94d1c-96c0-4620-93e0-311ec364075a/resourcegroups/energy_prediction/workspaces/energ_prediction_ml/datastores/energy_datas_hover/paths/transformeddata/transformed_energy_data
outputs:
  pipeline_job_transformed_data:
    mode: upload
    type: uri_folder
  pipeline_job_trained_model:
    mode: upload
    type: mlflow_model
jobs:
  process_data:
    type: command
    inputs:
      input_data:
        path: ${{parent.inputs.pipeline_job_input}}
    outputs:
      output_data: ${{parent.outputs.pipeline_job_transformed_data}}
    component:
      $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
      name: prep_data
      version: '1'
      display_name: Prepare training data
      type: command
      inputs:
        input_data:
          type: uri_folder
      outputs:
        output_data:
          type: uri_folder
      c

In [80]:
# Submit the pipeline through the workspace client under the
# "energy_prediction" experiment. Note that `pipeline_job` is rebound to the
# object returned by create_or_update.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="energy_prediction"
)
# Bare last expression: the notebook renders the returned job (experiment,
# name, type, status and a link to the studio page).
pipeline_job

Experiment,Name,Type,Status,Details Page
energy_prediction,clever_garden_dtkjkjb9vb,pipeline,Preparing,Link to Azure Machine Learning studio
