In [1]:
import azureml.core
import numpy as np
import pandas as pd
from azureml.core import Workspace, Datastore, Dataset
import sklearn
from sklearn.preprocessing import RobustScaler
from imblearn.under_sampling import RandomUnderSampler

### Workspace Credentials

##### If you have not configured your workspace

In [None]:
import os

subscription_id = os.getenv("SUBSCRIPTION_ID", default="yoursubscription id")
resource_group = os.getenv("RESOURCE_GROUP", default="your resource group")
workspace_name = os.getenv("WORKSPACE_NAME", default="workspacename")
workspace_region = os.getenv("WORKSPACE_REGION", default="location")

from azureml.core import Workspace

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except:
    print("Workspace not accessible. Change your parameters or create a new workspace below")


#### If you have already configured your workspace

In [2]:
ws = Workspace.from_config()
#print(ws)

### Provide Experiment Name

In [3]:
from azureml.core import Experiment
experiment_name = 'creditcardFraudBlob'
experiment = Experiment(workspace = ws, name = experiment_name)

### Registering Blob Storage

In [9]:
blob_datastore_name='creditcardblob' # Name of the datastore to workspace
container_name=os.getenv("BLOB_CONTAINER", "") # Name of Azure blob container
account_name=os.getenv("BLOB_ACCOUNTNAME", "") # Storage account name
account_key=os.getenv("BLOB_ACCOUNT_KEY", "") # Storage account key

blob_datastore = Datastore.register_azure_blob_container(workspace=ws, 
                                                         datastore_name=blob_datastore_name, 
                                                         container_name=container_name, 
                                                         account_name=account_name,
                                                         account_key=account_key)

### Persistent Compute Target 

In [20]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [10]:
datastore = Datastore.get(ws, blob_datastore_name)


In [11]:
datastore_paths = [(datastore, 'creditcard.csv')]

In [12]:
dfpath = Dataset.Tabular.from_delimited_files(path=datastore_paths)

In [53]:
datacredit=dfpath.to_pandas_dataframe()

In [11]:
script_folder = os.path.join(os.getcwd(), "creditcardFraud")
os.makedirs(script_folder, exist_ok=True)

### Training script

In [58]:
%%writefile train.py
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, accuracy_score  
from sklearn.metrics import precision_score, recall_score 
from sklearn.metrics import f1_score, matthews_corrcoef 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.under_sampling import RandomUnderSampler
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.run import Run
import joblib
import os
import numpy as np
import pandas as pd

run = Run.get_context()
dataset = run.input_datasets['creditcard']
df=dataset.to_pandas_dataframe()

rob_scaler = RobustScaler()
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1, inplace=True)
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

#scaled_amount and scaled_time are added to the starting of the dataframe
df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)
X = df.drop('Class', axis=1)
y = df['Class']
rus = RandomUnderSampler(random_state=42)
X_rs, y_rs = rus.fit_sample(X,y)

df_rs = pd.DataFrame(np.hstack((X_rs,y_rs[:, None])), columns=df.columns)
df_rs.Class = df_rs.Class.astype(int)


def outlier_removal(df,feature, fraud):
    array = df[feature].loc[df['Class'] == fraud].values 
    q25, q75 = np.percentile(array, 25), np.percentile(array,75)
    print('25th percentile: {} | 75th percentile: {}'.format(q25,q75))
    iqr = q75 - q25
    print('Interquartile Range: {}'.format(iqr))
    cutoff = iqr*1.5
    lower_threshold, upper_threshold = q25 - cutoff, q75 + cutoff
    print('Cutoff: {}'.format(cutoff))
    print('Lower Threshold: {} | Upper Threshold: {}'.format(lower_threshold, upper_threshold))
    outliers = [a for a in array if a < lower_threshold or a > upper_threshold]
    print('{} Outliers: {}'.format(feature,outliers))
    print('Number of outliers detected for feature {}: {}'.format(feature,len(outliers)))
    df = df.drop(df[(df[feature] > upper_threshold) | (df[feature] < lower_threshold)].index)
    print('Number of records after outlier removal: {}'.format(len(df)))
    print('-'*117)
    return df

df_rs_out = outlier_removal(df_rs,'V10', 1)
df_rs_out = outlier_removal(df_rs_out,'V2',1)
df_rs_out = outlier_removal(df_rs,'V10', 1)
df_rs_out = outlier_removal(df_rs_out,'V2',1)
 

X = df_rs.drop('Class', axis=1)
y = df_rs['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = DecisionTreeClassifier()
rfc.fit(X_train, y_train) 
# predictions 
yPred = rfc.predict(X_test) 
# calculate accuracy on the prediction
acc = accuracy_score(y_test, yPred) 

run.log('accuracy', np.float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=rfc, filename='outputs/creditcardfraud.pkl')

Writing train.py


In [33]:
import shutil
shutil.copy('utils.py', script_folder)
shutil.copy('train.py', script_folder)

'/mnt/azmnt/code/Users/jyravi/HDFC/creditcardFraud/train.py'

In [34]:
from azureml.train.estimator import Estimator

#script_params = {
    # to mount files referenced by mnist dataset
    #'--data-folder':dfpath.as_named_input('creditcard').as_mount()
    #'--regularization': 0.8
#}

sk_est = Estimator(source_directory='./creditcardFraud',
                   #script_params=script_params,
                   inputs=[dfpath.as_named_input('creditcard')],
                   compute_target=cpu_cluster_name,
                   entry_script='train.py',
                   conda_packages=['scikit-learn'], pip_packages=['imblearn'])

In [35]:
run = experiment.submit(sk_est)
print(run.get_portal_url())

https://ml.azure.com/experiments/creditcardFraudBlob/runs/creditcardFraudBlob_1592838400_7d029e85?wsid=/subscriptions/225d6361-069a-4dfb-9bbb-3ebb42663de0/resourcegroups/Learninggroup/workspaces/amltestcg


In [36]:
run.wait_for_completion(show_output = True)

RunId: creditcardFraudBlob_1592838400_7d029e85
Web View: https://ml.azure.com/experiments/creditcardFraudBlob/runs/creditcardFraudBlob_1592838400_7d029e85?wsid=/subscriptions/225d6361-069a-4dfb-9bbb-3ebb42663de0/resourcegroups/Learninggroup/workspaces/amltestcg

Streaming azureml-logs/70_driver_log.txt

bash: /azureml-envs/azureml_24397b718cb179d4d18728a60467feee/lib/libtinfo.so.5: no version information available (required by bash)
bash: /azureml-envs/azureml_24397b718cb179d4d18728a60467feee/lib/libtinfo.so.5: no version information available (required by bash)
Entering context manager injector. Current time:2020-06-22T15:07:20.617902
Starting the daemon thread to refresh tokens in background for process with pid = 107
Entering Run History Context Manager.
Preparing to call script [ train.py ] with arguments: []
After variable expansion, calling script [ train.py ] with arguments: []

   Time        V1        V2        V3  ...       V27       V28  Amount  Class
0   0.0 -1.359807 -0.07

{'runId': 'creditcardFraudBlob_1592838400_7d029e85',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-06-22T15:06:53.696479Z',
 'endTimeUtc': '2020-06-22T15:08:20.244758Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'f9a00491-6557-4e1b-bf4f-82ae10aed695',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'b7c802bf-b20e-40aa-99b9-753956191e9a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'creditcard', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpucluster',
  'dataReferences': {},
  'data': {'creditcard': {'dataLocation': {'dataset': {'id': 'b7c802bf-b20e-40aa-99b9-753956191e9a',
      'name': None,
      'version': None},
     'dataPath': None

In [37]:
run.get_details()

{'runId': 'creditcardFraudBlob_1592838400_7d029e85',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2020-06-22T15:06:53.696479Z',
 'endTimeUtc': '2020-06-22T15:08:20.244758Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'f9a00491-6557-4e1b-bf4f-82ae10aed695',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'b7c802bf-b20e-40aa-99b9-753956191e9a'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'creditcard', 'mechanism': 'Direct'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpucluster',
  'dataReferences': {},
  'data': {'creditcard': {'dataLocation': {'dataset': {'id': 'b7c802bf-b20e-40aa-99b9-753956191e9a',
      'name': None,
      'version': None},
     'dataPath': None

In [38]:
run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_f8ab8b39c1e5d31b18ebbf7605ee1fd0981d2a1d69156b116728069b14204c1a_d.txt',
 'azureml-logs/65_job_prep-tvmps_f8ab8b39c1e5d31b18ebbf7605ee1fd0981d2a1d69156b116728069b14204c1a_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_f8ab8b39c1e5d31b18ebbf7605ee1fd0981d2a1d69156b116728069b14204c1a_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/107_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/creditcardfraud.pkl']

In [39]:
os.makedirs('./model', exist_ok=True)

for f in run.get_file_names():
    if f.startswith('outputs/creditcardfraud.pkl'):
        output_file_path = os.path.join('./model', f.split('/')[-1])
        print('Downloading from {} to {} ...'.format(f, output_file_path))
        run.download_file(name=f, output_file_path=output_file_path)

Downloading from outputs/creditcardfraud.pkl to ./model/creditcardfraud.pkl ...


In [40]:
run.get_metrics()

{'accuracy': 0.8883248730964467}

## Registering Model

In [36]:
from azureml.core.model import Model

model = Model.register(workspace=ws,model_name='creditcardfraudmodel.pkl', model_path='model/creditcardfraud.pkl')

Registering model creditcardfraudmodel.pkl


## Scoring File

In [37]:
%%writefile score_creditcard.py

import json
import numpy as np
#from sklearn.externals import joblib
from azureml.core.model import Model
from sklearn.preprocessing import RobustScaler
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

def init():
    global model
    # note here "best_model" is the name of the model registered under the workspace
    # this call should return the path to the model.pkl file on the local disk.
    model_path = Model.get_model_path(model_name='creditcardfraudmodel.pkl')
    # deserialize the model file back into a sklearn model
    model = joblib.load(model_path)
def run(raw_data):
    try:
        data = json.loads(raw_data)['data']
        df =pd.DataFrame.from_dict(data)
        rob_scaler = RobustScaler()
        df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
        df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
        df.drop(['Time','Amount'], axis=1, inplace=True)
        scaled_amount = df['scaled_amount']
        scaled_time = df['scaled_time']

#scaled_amount and scaled_time are added to the starting of the dataframe
        df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
        df.insert(0, 'scaled_amount', scaled_amount)
        df.insert(1, 'scaled_time', scaled_time)
        
        result = model.predict(df)

        # you can return any data type as long as it is JSON-serializable
        return result.tolist()
    except Exception as e:
        result = str(e)
        return result


Overwriting score_creditcard.py


In [33]:
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

env = Environment('deploytocloud')
env.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'],pip_packages=['azureml-defaults','joblib'])
inference_config = InferenceConfig(entry_script="score_creditcard.py", environment=env)

In [34]:
from azureml.core.webservice import AciWebservice

aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, 
                                               memory_gb=1, 
                                               tags={'sample name': 'creditcard'}, 
                                               description='This is a great example.')#, location='South India')


#deploy_configuration(cpu_cores=None, memory_gb=None, tags=None, properties=None, description=None, location=None,
#auth_enabled=None, ssl_enabled=None, enable_app_insights=None, 
#ssl_cert_pem_file=None, ssl_key_pem_file=None, ssl_cname=None, dns_name_label=None, primary_key=None, 
#secondary_key=None, collect_model_data=None, cmk_vault_base_url=None, cmk_key_name=None, cmk_key_version=None, 
#vnet_name=None, subnet_name=None)

In [38]:
%%time
from azureml.core.model import Model
from azureml.core.webservice import Webservice

# Create the webservice using all of the precreated configurations and our best model
service = Model.deploy(workspace=ws,
                       name='credit-card-fraud',
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aciconfig)

# Wait for the service deployment to complete while displaying log output
service.wait_for_deployment(show_output=True)

Running.......................................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
CPU times: user 488 ms, sys: 58.2 ms, total: 546 ms
Wall time: 8min 8s


### Testing the service

In [None]:
newdata=datacredit.head(1)
newdata=newdata.drop('Class', axis=1)
bb=newdata.to_dict()
import json
test_Samples=json.dumps({"data": bb})

In [49]:
import json

service = ws.webservices['credit-card-fraud']

service.run(input_data = test_Samples)

[0]

In [57]:
service.scoring_uri

'http://8a83fbde-194b-4538-85e5-fc2051e37d2f.southeastasia.azurecontainer.io/score'