# Breast Cancer - Optimizing an Azure ML Pipeline

In [None]:
import argparse
import azureml
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder
from azureml.core import Workspace, Experiment, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Print the installed Azure ML core SDK version so the environment that
# produced this notebook's outputs can be identified later.
print("Azure ML SDK Version: ", azureml.core.VERSION)

In [31]:
from azureml.core import  Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
# Interactive login pinned to one specific Azure AD tenant.
interactive_auth = InteractiveLoginAuthentication(tenant_id="9ce70869-60db-44fd-abe8-d2767077fc8f")

# Pass the credential explicitly: without `auth=` the object created above is
# never used and the workspace is loaded with the default authentication flow.
ws = Workspace.from_config(auth=interactive_auth)

In [242]:
# One line per workspace property, printed in a fixed order.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Workspace name: cselscdhazureml
Azure region: eastus2
Subscription id: 320d8d57-c87c-4434-827f-59ee7d86687a
Resource group: csels-cdh-dev


In [243]:
# Name of the compute cluster every pipeline step is submitted to.
clustername = 'StandardDS12CPU'

# Flipped to True only when the cluster has to be created by this cell.
is_new_cluster = False

try:
    # Look the cluster up by name; raises ComputeTargetException when absent.
    aml_compute = ComputeTarget(workspace=ws, name=clustername)
except ComputeTargetException:
    print("Cluster not find - Creating cluster")
    is_new_cluster = True
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, clustername, compute_config)
else:
    print("Find the existing cluster")

# Wait for the compute target in both cases (reused or newly created).
aml_compute.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [244]:
# Push the local ./data folder to the workspace's default datastore.
data_store = ws.get_default_datastore()
data_store.upload(
    src_dir='./data',
    target_path='cancer_data',
    overwrite=True,
    show_progress=True,
)

# Build a tabular dataset from the uploaded CSV and register it as 'raw_data'.
raw_csv_path = data_store.path('cancer_data/cancer_data.csv')
ds_raw = Dataset.Tabular.from_delimited_files(path=raw_csv_path)
ds_raw.register(workspace=ws, name='raw_data')

Uploading an estimated of 5 files
Uploading ./data/.amlignore
Uploaded ./data/.amlignore, 1 files out of an estimated total of 5
Uploading ./data/.amlignore.amltmp
Uploaded ./data/.amlignore.amltmp, 2 files out of an estimated total of 5
Uploading ./data/cancer_data.csv
Uploaded ./data/cancer_data.csv, 3 files out of an estimated total of 5
Uploading ./data/test/test.csv
Uploaded ./data/test/test.csv, 4 files out of an estimated total of 5
Uploading ./data/train/train.csv
Uploaded ./data/train/train.csv, 5 files out of an estimated total of 5
Uploaded 5 files


{
  "source": [
    "('workspaceblobstore', 'cancer_data/cancer_data.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "9873494e-db49-4adc-a5ed-70ff5deefffe",
    "name": "raw_data",
    "version": 1,
    "workspace": "Workspace.create(name='cselscdhazureml', subscription_id='320d8d57-c87c-4434-827f-59ee7d86687a', resource_group='csels-cdh-dev')"
  }
}

In [245]:
# Expose the raw dataset to pipeline scripts under the key 'raw_data'.
# Note: `ds_raw` is rebound here, so re-running only this cell calls
# `as_named_input` on the result of the previous call.
ds_raw = ds_raw.as_named_input('raw_data')

# Intermediate outputs exchanged between steps; all live on the default datastore.
train_output = PipelineData("train_cancer_data", datastore=data_store)
test_output = PipelineData("test_cancer_data", datastore=data_store)

train_data = train_output.as_dataset()
test_data = test_output.as_dataset()
model_file = PipelineData("model_file", datastore=data_store)

In [246]:
# Print the input mode currently set on the train dataset object
# (the recorded output for this cell is `mount`).
# NOTE(review): `_input_mode` is a private attribute; it may change between
# SDK releases, so treat this cell as a debugging aid only.
print(train_data._input_mode)

mount


In [247]:
# Use a RunConfiguration to specify some additional requirements for this step.
from azureml.core.runconfig import RunConfiguration,DockerConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies


# Packages installed into the step environment: scientific stack from conda,
# joblib and the Azure ML SDK from pip (SDK version left unpinned).
conda_deps = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas', 'numpy'],
    pip_packages=['joblib', 'azureml-sdk'],
    pin_sdk_version=False,
)

# Run configuration shared by every pipeline step.
run_config = RunConfiguration()
step_env = run_config.environment

# Start from the default CPU Docker image ...
step_env.docker.base_image = DEFAULT_CPU_IMAGE

# ... and let Azure ML build the Python environment from the package list
# above rather than relying on a user-managed interpreter.
step_env.python.user_managed_dependencies = False
step_env.python.conda_dependencies = conda_deps


In [248]:
%%writefile ./scripts/prepare.py

import argparse
import os
import sklearn
import pandas as pd 
import numpy as np
from azureml.core import  Workspace
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder

def main():
    """Split the raw dataset into stratified train/test CSV files.

    Command-line arguments:
        --input_data: accepted for the pipeline step definition; the data
            itself is read from the run's ``raw_data`` named input.
        --train: directory that receives ``train.csv``.
        --test: directory that receives ``test.csv``.

    The ``diagnosis`` column is label-encoded and written as the last column
    of each file. Every file starts with a header row of column names.
    """
    parser = argparse.ArgumentParser("prepare")

    parser.add_argument("--input_data", type=str)
    parser.add_argument("--train", type=str)
    parser.add_argument("--test", type=str)

    args = parser.parse_args()

    print("train args:", args.train)

    run = Run.get_context()

    df = run.input_datasets['raw_data'].to_pandas_dataframe()

    y = df['diagnosis'].astype('category')
    X = df.drop('diagnosis', axis=1)

    lbl_encoder = LabelEncoder()
    y_encode = lbl_encoder.fit_transform(y)

    print("cols:", X.columns)
    print("X shape", X.shape)
    print("encoder:", lbl_encoder.classes_)
    print("y encode:", y_encode.shape)

    # Stratify on the encoded label so both splits keep the class balance.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y_encode, train_size=0.75, random_state=42, stratify=y_encode)

    print(x_train.shape)
    print(y_train.shape)

    print(x_test.shape)
    print(y_test.shape)

    # Features first, encoded label as the last column.
    train = np.column_stack([x_train, y_train])
    test = np.column_stack([x_test, y_test])

    # Header row so a reader that infers the header from the first line
    # (e.g. pandas.read_csv defaults) does not consume a data row.
    header = ",".join([str(col) for col in X.columns] + ["diagnosis"])

    # Write the split to the step's output directories.
    os.makedirs(args.train, exist_ok=True)
    os.makedirs(args.test, exist_ok=True)
    print("Saving the split")

    np.savetxt(os.path.join(args.train, "train.csv"), train,
               delimiter=",", header=header, comments="")
    # The held-out rows go to test.csv (previously the train array was
    # written here, which made the test file a copy of the training data).
    np.savetxt(os.path.join(args.test, "test.csv"), test,
               delimiter=",", header=header, comments="")

 
if __name__ =='__main__':
    main()




Overwriting ./scripts/prepare.py


In [249]:
%%writefile ./scripts/train2.py


import argparse
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder
import joblib

def main():
    """Train a random-forest classifier on the prepared split and save it.

    Command-line arguments:
        --train: directory containing ``train.csv``.
        --test: directory containing ``test.csv``.
        --model_file: directory that receives ``cancer_model.pkl``.

    The last column of each CSV is used as the label and every other column
    as a feature. Accuracy and F1 on the test file are logged to the run.
    """
    parser = argparse.ArgumentParser("train")

    parser.add_argument("--train", type=str, help="train data")
    parser.add_argument("--test", type=str, help="test data")
    parser.add_argument("--model_file", type=str, help="model file")

    args = parser.parse_args()

    run = Run.get_context()

    print(args.train)
    print(args.test)

    # NOTE(review): read_csv treats the first line of each file as the header
    # row -- confirm the files written upstream actually start with one.
    train = pd.read_csv(os.path.join(args.train, "train.csv"))
    test = pd.read_csv(os.path.join(args.test, "test.csv"))

    # Label is the last column; everything before it is a feature.
    y_train = train.iloc[:, -1]
    x_train = train.iloc[:, :-1]

    y_test = test.iloc[:, -1]
    x_test = test.iloc[:, :-1]

    # Diagnostic only: report which label values are present in the train set.
    lbl_encoder = LabelEncoder()
    y_encode = lbl_encoder.fit_transform(y_train)

    print("cols:", x_train.columns)
    print("X shape", x_train.shape)
    print("encoder:", lbl_encoder.classes_)
    print("y encode:", y_encode.shape)

    print(x_train.shape)
    print(y_train.shape)

    print(x_test.shape)
    print(y_test.shape)

    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by newer
    # scikit-learn releases and the step environment does not pin a version.
    # A fixed random_state makes the logged metrics reproducible.
    rf = RandomForestClassifier(n_estimators=40,
                                max_depth=100,
                                max_features='sqrt',
                                min_samples_leaf=3,
                                random_state=42)
    rf.fit(x_train, y_train)

    # Predict once and reuse the result for both metrics.
    y_pred = rf.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    run.log("accuracy", accuracy)

    f1 = f1_score(y_test, y_pred)
    run.log("f1_score", f1)

    # Write the model into the step's output directory.
    os.makedirs(args.model_file, exist_ok=True)
    model_path = os.path.join(args.model_file, "cancer_model.pkl")
    joblib.dump(rf, model_path)

    print('Saving the model to {}'.format(model_path))

    run.complete()
    

if __name__ == '__main__':
    main()


Overwriting ./scripts/train2.py


In [250]:
# Folder whose contents are sent to the compute target for each step.
source_directory = './scripts'

# Command-line arguments handed to prepare.py.
prepare_arguments = [
    "--input_data", ds_raw,
    "--train", train_data,
    "--test", test_data,
]

# Step 1: read the raw dataset and produce the train/test outputs.
step1 = PythonScriptStep(
    name="prepare_step",
    script_name="prepare.py",
    source_directory=source_directory,
    arguments=prepare_arguments,
    inputs=[ds_raw],
    outputs=[train_data, test_data],
    compute_target=aml_compute,
    runconfig=run_config,
    allow_reuse=True,
)
print("Step1 created")

Step1 created


In [251]:
# Command-line arguments handed to train2.py.
train_arguments = [
    "--train", train_data,
    "--test", test_data,
    "--model_file", model_file,
]

# Step 2: consume the train/test outputs and produce the model output.
step2 = PythonScriptStep(
    name="train_step",
    script_name="train2.py",
    source_directory=source_directory,
    arguments=train_arguments,
    inputs=[train_data, test_data],
    outputs=[model_file],
    compute_target=aml_compute,
    runconfig=run_config,
    allow_reuse=True,
)
print("Step2 created")

Step2 created


In [252]:
%%writefile ./scripts/register.py
import argparse
from azureml.core import Run, Dataset
from azureml.core.model import Model as AMLModel
def main():
    """Register the trained model file in the run's workspace.

    Command-line arguments:
        --model_file: directory containing ``cancer_model.pkl``.

    The file is registered under the model name ``breast-cancer``.
    """
    parser = argparse.ArgumentParser("register")

    parser.add_argument("--model_file", type=str, help="model file")

    args = parser.parse_args()

    # The workspace is taken from the run this script executes in.
    run = Run.get_context()
    ws = run.experiment.workspace

    model_path = args.model_file+"/cancer_model.pkl"

    print("model path:", model_path)

    AMLModel.register(workspace=ws, model_name="breast-cancer", model_path=model_path)

if __name__ == '__main__':
    main()



Overwriting ./scripts/register.py


In [253]:
# Step 3: take the model output of the training step and run register.py on it.
# This step declares no outputs of its own.
step3 = PythonScriptStep(
    name="register_step",
    script_name="register.py",
    source_directory=source_directory,
    arguments=["--model_file", model_file],
    inputs=[model_file],
    compute_target=aml_compute,
    runconfig=run_config,
    allow_reuse=True,
)
print("Step3 created")

Step3 created


In [254]:
# Assemble the three steps into one pipeline.
steps = [
    step1,  # prepare_step
    step2,  # train_step
    step3,  # register_step
]
pipeline1 = Pipeline(workspace=ws, steps=steps)

# Experiment under which pipeline runs are submitted.
run_exp = Experiment(
    workspace=ws,
    name="RF-BreastCancer-Pipeline",
)

In [255]:
# Submit the pipeline and force every step to run again instead of reusing
# cached outputs. The keyword is `regenerate_outputs`; it was previously
# misspelled `regenerate_ouputs`, which does not match that parameter.
run_exp.submit(pipeline1, regenerate_outputs=True)

Created step prepare_step [021e6f0d][ba496824-c3da-40d1-b8a3-55b775f232ca], (This step will run and generate new outputs)
Created step train_step [57d11bae][bd9bdca1-371d-42a0-9f6e-cee336849046], (This step will run and generate new outputs)Created step register_step [4b3eb6dd][ace191ac-8c30-4344-a366-003c3574cc04], (This step will run and generate new outputs)

Submitted PipelineRun 360c1071-80d9-4caa-8f1c-c4bd288f72c1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/360c1071-80d9-4caa-8f1c-c4bd288f72c1?wsid=/subscriptions/320d8d57-c87c-4434-827f-59ee7d86687a/resourcegroups/csels-cdh-dev/workspaces/cselscdhazureml&tid=9ce70869-60db-44fd-abe8-d2767077fc8f


Experiment,Id,Type,Status,Details Page,Docs Page
RF-BreastCancer-Pipeline,360c1071-80d9-4caa-8f1c-c4bd288f72c1,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation
