Copyright (c) Microsoft. All rights reserved.

Licensed under the MIT license.

In [None]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the local configuration
ws = Workspace.from_config()

# The workspace's default datastore receives the raw CSV files
default_ds = ws.get_default_datastore()

# Local credit-risk CSV files to push to the datastore
local_csv_files = ['./Data/borrower.csv', './Data/loan.csv']
default_ds.upload_files(
    files=local_csv_files,
    target_path='creditrisk-data/',  # folder path inside the datastore
    overwrite=True,                  # replace existing files of the same name
    show_progress=True,
)

from azureml.core import Dataset

# (path on the datastore, registered dataset name, description)
dataset_specs = [
    ('creditrisk-data/borrower.csv', 'BorrowerData', 'Borrower Data'),
    ('creditrisk-data/loan.csv', 'LoanData', 'Loans Data'),
]

for csv_path, dataset_name, dataset_description in dataset_specs:
    # Create a Tabular dataset from the path on the datastore ...
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, csv_path))

    # ... and register it in the workspace as a new version
    tab_data_set = tab_data_set.register(
        workspace=ws,
        name=dataset_name,
        description=dataset_description,
        tags={'format': 'CSV'},
        create_new_version=True,
    )

In [None]:
from azureml.core import Workspace, Dataset, Datastore, ScriptRunConfig, Experiment
from azureml.data.data_reference import DataReference
import os
import azureml.dataprep as dprep
import pandas as pd
import numpy as np

import azureml.core
from azureml.core import Workspace

# Load the workspace from the local configuration
ws = Workspace.from_config()

# Look up the two registered datasets by name (borrower first, then loan)
borrowerData, loanData = (
    Dataset.get_by_name(ws, name=registered_name)
    for registered_name in ('BorrowerData', 'LoanData')
)

In [None]:
from azureml.core import Datastore
from azureml.core.compute import AmlCompute, ComputeTarget

datastore = ws.get_default_datastore()

# Name of the compute cluster used by the pipeline steps
compute_name = 'cpu-cluster'

# Provision the cluster only when the workspace does not list it yet
cluster_exists = compute_name in ws.compute_targets
if not cluster_exists:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS2_V2',
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # Block until provisioning finishes (or the 20 minute timeout elapses)
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

    # Show the result
    print(compute_target.get_status().serialize())

# Fetch the cluster from the workspace, whether it was just created or not
compute_target = ws.compute_targets[compute_name]

In [None]:
from azureml.core.runconfig import RunConfiguration

from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Python environment shared by the pipeline steps
creditrisk_env = Environment("creditrisk-pipeline-env")
creditrisk_env.docker.enabled = True  # run inside a docker container
creditrisk_env.python.user_managed_dependencies = False  # Azure ML manages the dependencies

# Packages the step scripts need, split by installer
conda_package_names = ['scikit-learn', 'joblib', 'pandas', 'numpy', 'pip']
pip_package_names = ['azureml-defaults', 'azureml-dataprep[pandas]']
creditrisk_packages = CondaDependencies.create(
    conda_packages=conda_package_names,
    pip_packages=pip_package_names,
)

# Attach the dependencies to the environment
creditrisk_env.python.conda_dependencies = creditrisk_packages

# Register the environment, then read the registered copy back
creditrisk_env.register(workspace=ws)
registered_env = Environment.get(ws, 'creditrisk-pipeline-env')

# Run configuration for the pipeline: the compute created above plus
# the registered environment
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target
aml_run_config.environment = registered_env

print("Run configuration created.")

In [None]:
%%writefile PrepareData.py
from azureml.core import Run

import pandas as pd 
import numpy as np 
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data', dest='prepared_data', required=True)
args = parser.parse_args()
    
borrowerData = Run.get_context().input_datasets['BorrowerData']
loanData = Run.get_context().input_datasets['LoanData']

df_borrower = borrowerData.to_pandas_dataframe()
df_loan = loanData.to_pandas_dataframe()

# Join data and do some transformations
df_data = df_borrower.merge(df_loan,on='memberId',how='inner')
df_data.shape

df_data['homeOwnership'] = df_data['homeOwnership'].replace('nan', np.nan).fillna(0)
df_data['isJointApplication'] = df_data['isJointApplication'].replace('nan', np.nan).fillna(0)

drop_cols = ['memberId', 'loanId', 'date','grade','residentialState']
df_data = df_data.drop(drop_cols, axis=1)

df_data['loanStatus'] = np.where(df_data['loanStatus'] == 'Default', 1, 0) # change label column to 0/1

df_data.to_csv(os.path.join(args.prepared_data,"prepared_data.csv"),index=False)

print(f"Wrote prepped data to {args.prepared_data}/prepared_data.csv")

In [None]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Output location that the PrepareData step writes its CSV into
prepared_data = OutputFileDatasetConfig(name="prepared_data")

# Registered datasets exposed to the script under the names it looks up
dataprep_inputs = [
    borrowerData.as_named_input('BorrowerData'),
    loanData.as_named_input('LoanData'),
]

# Pipeline step that runs PrepareData.py on the compute cluster
dataprep_step = PythonScriptStep(
    name="PrepareData",
    script_name="PrepareData.py",
    arguments=["--prepared_data", prepared_data],
    inputs=dataprep_inputs,
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

In [None]:
# prepared_data = prepared_data_path.read_delimited_files()

In [None]:
%%writefile TrainTestDataSplit.py
from azureml.core import Run

import pandas as pd 
import numpy as np 
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data', dest='prepared_data', required=True)
parser.add_argument('--train_data', dest='train_data', required=True)
parser.add_argument('--test_data', dest='test_data', required=True)
args = parser.parse_args()

df_data = pd.read_csv(args.prepared_data + '/prepared_data.csv')

df_train=df_data.sample(frac=0.8,random_state=200) #random state is a seed value
df_train=df_data.drop(df_train.index)

df_train.to_csv(os.path.join(args.train_data,"train_data.csv"),index=False)
df_train.to_csv(os.path.join(args.test_data,"test_data.csv"),index=False)

print(f"Wrote prepped data to {args.train_data}/train_data.csv")
print(f"Wrote prepped data to {args.test_data}/test_data.csv")

In [None]:
# Output locations for the two halves of the train/test split
train_data = OutputFileDatasetConfig(name="train_data")
test_data = OutputFileDatasetConfig(name="test_data")

# Command-line arguments handed to TrainTestDataSplit.py
split_arguments = [
    "--prepared_data", prepared_data.as_input(),
    "--train_data", train_data,
    "--test_data", test_data,
]

# Pipeline step that splits the prepared data into train and test sets
test_train_step = PythonScriptStep(
    name="TestTrainDataSplit",
    script_name="TrainTestDataSplit.py",
    arguments=split_arguments,
    outputs=[train_data, test_data],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

In [None]:
# Ask each split output to read its delimited files
training_data = train_data.read_delimited_files()
testing_data = test_data.read_delimited_files()

# Bare expressions kept for notebook display (only the last one is echoed)
training_data
testing_data

In [None]:
%%writefile TrainModel.py

from azureml.core import Run
from azureml.core.model import Model
import joblib

import pandas as pd 
import numpy as np 
import argparse

from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def creditrisk_onehot_encoder(df_data):
    catColumns = df_data.select_dtypes(['object']).columns
    df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')
    
    df_data = df_data.fillna(df_data.mean())
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))
    
    # Remove categorical columns (will replace with one-hot encoding)
    numeric_cols = df_data.drop(catColumns, axis=1)
    
    # Add one-hot encoded columns to numerical features
    df_result = pd.concat([numeric_cols, OH_cols], axis=1)
    
    # impute missing numeric values with mean
    fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))
    imputed_df.columns = df_result.columns
    imputed_df.index = df_result.index
    df_result = imputed_df

    return(df_result)

# Get the experiment run context
run = Run.get_context()

parser = argparse.ArgumentParser()
parser.add_argument('--train_data', dest='train_data', required=True)
parser.add_argument('--test_data', dest='test_data', required=True)
parser.add_argument('--metrics_data', dest='metrics_data', required=True)
parser.add_argument('--model_data', dest='model_data', required=True)
args = parser.parse_args()

df_train = pd.read_csv(args.train_data + '/train_data.csv')
df_test = pd.read_csv(args.test_data + '/test_data.csv')

df_train = creditrisk_onehot_encoder(df_train)
df_test = creditrisk_onehot_encoder(df_test)

cols = [col for col in df_train.columns if col not in ["loanStatus"]]

clf = LogisticRegression()
clf.fit(df_train[cols].values, df_train["loanStatus"].values)

print('predicting ...')
y_hat = clf.predict(df_test[cols].astype(int).values)

acc = np.average(y_hat == df_test["loanStatus"].values)
print('Accuracy is', acc)

print("save model")
os.makedirs('models', exist_ok=True)    
joblib.dump(value=clf, filename= 'models/creditrisk_model.pkl')

model = Model.register(model_path = 'models/creditrisk_model.pkl',
                    model_name = 'creditrisk_model',
                    description = 'creditrisk model',
                    workspace = run.experiment.workspace,
                    properties={'Accuracy': np.float(acc)})

modeldata = []
modeldata.append(('models/creditrisk_model.pkl','creditrisk_model'))
df_model = pd.DataFrame(modeldata, columns=('modelfile', 'model_name'))

metricsdata = []
metricsdata.append(('Accuracy',acc))
df_metrics = pd.DataFrame(metricsdata, columns=('Metric', 'Value'))

df_model.to_csv(os.path.join(args.model_data,"model_data.csv"),index=False)
df_metrics.to_csv(os.path.join(args.metrics_data,"metrics_data.csv"),index=False)

print(f"Wrote model data to {args.model_data}/model_data.csv")
print(f"Wrote metrics data to {args.metrics_data}/metrics_data.csv")


In [None]:
# Output locations for the model description and the metrics
model_data = OutputFileDatasetConfig(name="model_data")
metrics_data = OutputFileDatasetConfig(name="metrics_data")

# Command-line arguments handed to TrainModel.py
train_arguments = [
    "--train_data", train_data.as_input(),
    "--test_data", test_data.as_input(),
    "--model_data", model_data,
    "--metrics_data", metrics_data,
]

# Pipeline step that trains the model
train_step = PythonScriptStep(
    name="TrainModel",
    script_name="TrainModel.py",
    arguments=train_arguments,
    outputs=[model_data, metrics_data],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

In [None]:
%%writefile BatchInference.py
from azureml.core import Run
from azureml.core.model import Model
import joblib

import pandas as pd 
import numpy as np 
import argparse

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def creditrisk_onehot_encoder(df_data):
    catColumns = df_data.select_dtypes(['object']).columns
    df_data[catColumns] = df_data[catColumns].fillna(value='Unknown')
    
    df_data = df_data.fillna(df_data.mean())
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols= pd.DataFrame(OH_encoder.fit_transform(df_data[catColumns]),columns = list(OH_encoder.get_feature_names(catColumns)))
    
    # Remove categorical columns (will replace with one-hot encoding)
    numeric_cols = df_data.drop(catColumns, axis=1)
    
    # Add one-hot encoded columns to numerical features
    df_result = pd.concat([numeric_cols, OH_cols], axis=1)
    
    # impute missing numeric values with mean
    fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_result))
    imputed_df.columns = df_result.columns
    imputed_df.index = df_result.index
    df_result = imputed_df

    return(df_result)

parser = argparse.ArgumentParser()
parser.add_argument('--test_data', dest="test_data", type=str, required=True)
parser.add_argument('--model_data', dest="model_data", type=str, required=True)
parser.add_argument('--batchinfer_data', dest='batchinfer_data', required=True)

args = parser.parse_args()

# Get the experiment run context
run = Run.get_context()

df_model = pd.read_csv(args.model_data + '/model_data.csv')
# model_path = Model.get_model_path(model_name = 'best_model_data')
model_name = df_model['model_name'][0]

model_path = Model.get_model_path(model_name=model_name, _workspace=run.experiment.workspace)
model = joblib.load(model_path)

df_test = pd.read_csv(args.test_data + '/test_data.csv')
df_test = creditrisk_onehot_encoder(df_test)

x_test = df_test.drop(['loanStatus'], axis=1)

y_predict = model.predict(x_test)

df_test['Prediction'] = y_predict

df_test.to_csv(os.path.join(args.batchinfer_data,"batchinfer_data.csv"),index=False)

print(f"Wrote prediction data with to {args.batchinfer_data}/batchinfer_data.csv")

In [None]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Output location for the scored data; registered as a dataset on completion
batchinfer_data = OutputFileDatasetConfig(name="batchinfer_data").register_on_complete(
    name="CreditRiskBatchInferenceData",
    description='Batch Inference Data Output',
)

# Command-line arguments handed to BatchInference.py
batchinfer_arguments = [
    "--test_data", test_data.as_input(),
    "--model_data", model_data.as_input(),
    "--batchinfer_data", batchinfer_data,
]

# Pipeline step that scores the test data with the registered model
batchinfer_step = PythonScriptStep(
    name="RunBatchInference",
    script_name="BatchInference.py",
    arguments=batchinfer_arguments,
    outputs=[batchinfer_data],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Steps in the order they were defined: prepare -> split -> train -> batch inference
pipeline_steps = [dataprep_step, test_train_step, train_step, batchinfer_step]
pipeline = Pipeline(ws, pipeline_steps)

# Experiment under which the pipeline run is recorded
experiment = Experiment(workspace=ws, name='CreditRiskPipeline')

# Submit the pipeline and block until the run finishes
run = experiment.submit(pipeline, show_output=True)
run.wait_for_completion()