# Breast Cancer - Training  Models in Azure ML

In [18]:
import argparse
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder

In [19]:
from azureml.core import  Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
interactive_auth = InteractiveLoginAuthentication(tenant_id="9ce70869-60db-44fd-abe8-d2767077fc8f")

ws = Workspace.from_config()


In [20]:
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: cselscdhazureml
Azure region: eastus2
Subscription id: 320d8d57-c87c-4434-827f-59ee7d86687a
Resource group: csels-cdh-dev


In [21]:
# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
df = pd.read_csv('./data/cancer_data.csv',header=0,sep=',')
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [22]:
len(df.columns)


32

In [23]:
y = df['diagnosis'].astype('category')
X = df.drop('diagnosis',axis=1)

lbl_encoder = LabelEncoder()
y_encode = lbl_encoder.fit_transform(y)

print("cols:",X.columns)
print("X shape", X.shape)
print("encoder:", lbl_encoder.classes_)
print("y encode:", y_encode.shape)

cols: Index(['id', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')
X shape (569, 31)
encoder: ['B' 'M']
y encode: (569,)


In [24]:
x_train,x_test,y_train,y_test = train_test_split(X,y_encode,train_size=0.75,random_state=42,stratify =y_encode)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(426, 31)
(426,)
(143, 31)
(143,)


In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [26]:
clf = LogisticRegression()
clf.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
accuracy_score(y_train,clf.predict(x_train))

0.9882629107981221

In [28]:
accuracy_score(y_test,clf.predict(x_test))

0.965034965034965

In [29]:
rf = RandomForestClassifier(n_estimators=40,max_depth=100,max_features='auto',min_samples_leaf=3)
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
print(f"Training accuracy :{accuracy_score(y_train,rf.predict(x_train)) } , Test accucacy: {accuracy_score(y_test,rf.predict(x_test))}")

Training accuracy :0.9906103286384976 , Test accucacy: 0.958041958041958


In [34]:
record = x_train[0].tolist()
print(record)
instance = pd.DataFrame(record).transpose()
rf.predict(instance)

[0.393845668409139, -0.7457496952627328, -0.22189398515428446, -0.7676382304726264, -0.6999246324662508, -0.17745016145311038, -0.81484548808207, -0.770581761525873, -0.7189314051409194, 0.07367558407930598, -0.4665409990624931, -0.06425072325003647, -0.667435753669589, -0.10209930505592255, -0.2928749021935234, -0.18393920631817678, -0.8204798568998558, -0.6065568523529427, -0.395651155530143, 0.3134950272756633, -0.8678658791041564, -0.6897420117050609, -0.5598110994362666, -0.723009965205552, -0.6542518616646612, -0.5936861839456196, -0.9574791012332506, -0.9270503937797329, -0.7552651134179409, -0.2651797994882257, -1.0640128515011344]


array([0])

In [35]:
instance2= x_test[x_test['id']==92751]
rf.predict(instance2)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [36]:
from azureml.core import Workspace, Experiment, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails

In [37]:
exp = Experiment(workspace=ws, name="RF-BreastCancer")

In [38]:
%%writefile conda_dependencies.yml

channels:
- conda-forge
dependencies:
- python=3.8
- pip:
  - azureml-defaults
  - matplotlib
  - pandas
  - argparse
  - joblib
  - scikit-learn

Overwriting conda_dependencies.yml


In [39]:
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')
# Specify a GPU base image
sklearn_env.docker.enabled = True

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


### Create Compute Target

In [40]:
clustername = 'StandardDS12CPU'
is_new_cluster = False
try:
    gpu_cluster = ComputeTarget(workspace = ws,name= clustername)
    print("Find the existing cluster")
except ComputeTargetException:
    print("Cluster not find - Creating cluster")
    is_new_cluster = True
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    gpu_cluster = ComputeTarget.create(ws, clustername, compute_config)

gpu_cluster.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Load data in data store 

In [41]:
data_store = ws.get_default_datastore()
data_store.upload(src_dir='./data',target_path='cancer_data',overwrite=True,show_progress=True)

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 5 files
Uploading ./data/.amlignore
Uploaded ./data/.amlignore, 1 files out of an estimated total of 5
Uploading ./data/.amlignore.amltmp
Uploaded ./data/.amlignore.amltmp, 2 files out of an estimated total of 5
Uploading ./data/cancer_data.csv
Uploaded ./data/cancer_data.csv, 3 files out of an estimated total of 5
Uploading ./data/test/test.csv
Uploaded ./data/test/test.csv, 4 files out of an estimated total of 5
Uploading ./data/train/train.csv
Uploaded ./data/train/train.csv, 5 files out of an estimated total of 5
Uploaded 5 files


$AZUREML_DATAREFERENCE_031415253ffb4871851e02915fe192a1

### Training scritps

In [42]:
%%writefile ./scripts/train.py

import argparse
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder
import joblib

def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_folder",type=str,default='./data')

    args = parser.parse_args()
    folder = args.data_folder

    run = Run.get_context()
    ws = run.experiment.workspace
    ds_tr = ws.get_default_datastore()
    ds = Dataset.Tabular.from_delimited_files(path=ds_tr.path('cancer_data/cancer_data.csv'))


    #df = pd.read_csv(os.path.join(folder,'cancer_data.csv'))
    df = ds.to_pandas_dataframe()
    y = df['diagnosis'].astype('category')
    X = df.drop('diagnosis',axis=1)

    lbl_encoder = LabelEncoder()
    y_encode = lbl_encoder.fit_transform(y)

    print("cols:",X.columns)
    print("X shape", X.shape)
    print("encoder:", lbl_encoder.classes_)
    print("y encode:", y_encode.shape)

    x_train,x_test,y_train,y_test = train_test_split(X,y_encode,train_size=0.75,random_state=42,stratify =y_encode)

    print(x_train.shape)
    print(y_train.shape)

    print(x_test.shape)
    print(y_test.shape)

    rf = RandomForestClassifier(n_estimators=40,max_depth=100,max_features='auto',min_samples_leaf=3)
    rf.fit(x_train,y_train)

    accuracy = accuracy_score(y_test,rf.predict(x_test))
    run.log("accuracy",accuracy)

    f1 = f1_score(y_test,rf.predict(x_test))
    run.log("f1_score",f1)


    # Write the model to file.
    model_path = "./outputs/cancer_model.pkl"
    os.makedirs("outputs", exist_ok=True)
    print('Saving the model to {}'.format(model_path))
    joblib.dump(rf, model_path)

if __name__ == '__main__':
    main()


Overwriting ./scripts/train.py


In [43]:
!python ./scripts/train.py

Traceback (most recent call last):
  File "./scripts/train.py", line 67, in <module>
    main()
  File "./scripts/train.py", line 24, in main
    ws = run.experiment.workspace
AttributeError: '_OfflineRun' object has no attribute 'experiment'


In [44]:
estimator = ScriptRunConfig(source_directory='./scripts',
                      script='train.py',
                      compute_target=gpu_cluster,
                      environment=sklearn_env)

In [51]:
# TODO: Submit your experiment
run = exp.submit(estimator)

In [52]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [54]:
from azureml.core.model import Model as AMLModel

In [60]:
AMLModel.register(workspace=ws,model_name='breast-cancer', model_path="./outputs/cancer_model.pkl")

Registering model breast-cancer


Model(workspace=Workspace.create(name='cselscdhazureml', subscription_id='320d8d57-c87c-4434-827f-59ee7d86687a', resource_group='csels-cdh-dev'), name=breast-cancer, id=breast-cancer:1, version=1, tags={}, properties={})

In [49]:
run.register_model(model_name='breast-cancer',model_path='./outputs/cancer_model.pkl')

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/cancer_model.pkl in the set of files uploaded to the run: []
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/cancer_model.pkl in the set of files uploaded to the run: []\n                See https://aka.ms/run-logging for more details."
    }
}