In [1]:
from azureml.core import Workspace

# Connect to the Azure ML workspace; from_config() loads the workspace details
# from a saved configuration file rather than hard-coding them here.
ws = Workspace.from_config()

In [9]:
# Print the name of every datastore attached to the workspace
for name in ws.datastores:
    print(name)

workspaceworkingdirectory
workspaceartifactstore
workspacefilestore
workspaceblobstore


# Getting all datastores in our workspace

In [11]:
from azureml.core import Datastore

for name in ws.datastores:
    # Look the datastore up by name and print its description
    store = Datastore.get(workspace = ws, datastore_name = name)
    print(store)

{
  "name": "workspaceworkingdirectory",
  "container_name": "code-391ff5ac-6576-460f-ba4d-7e03433c68b6",
  "account_name": "amlworkspace3186208600",
  "protocol": "https",
  "endpoint": "core.windows.net"
}
{
  "name": "workspaceartifactstore",
  "container_name": "azureml",
  "account_name": "amlworkspace3186208600",
  "protocol": "https",
  "endpoint": "core.windows.net"
}
{
  "name": "workspacefilestore",
  "container_name": "azureml-filestore-26e252cd-46d7-46d6-8b23-65a75e29f068",
  "account_name": "amlworkspace3186208600",
  "protocol": "https",
  "endpoint": "core.windows.net"
}
{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-26e252cd-46d7-46d6-8b23-65a75e29f068",
  "account_name": "amlworkspace3186208600",
  "protocol": "https",
  "endpoint": "core.windows.net"
}


In [18]:
for name in ws.datastores:
    store = Datastore.get(workspace = ws, datastore_name = name)
    # One line per datastore: its name and its storage type
    store_name, store_type = store.name, store.datastore_type
    print("Name: ", store_name, '\t', "Type: ", store_type)

Name:  workspaceworkingdirectory 	 Type:  AzureFile
Name:  workspaceartifactstore 	 Type:  AzureBlob
Name:  workspacefilestore 	 Type:  AzureFile
Name:  workspaceblobstore 	 Type:  AzureBlob


# Getting default datastore

In [19]:
# Fetch the datastore that the workspace reports as its default
default_datastore = ws.get_default_datastore()

# Print its description (name, container, account, protocol, endpoint)
print(default_datastore)

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-26e252cd-46d7-46d6-8b23-65a75e29f068",
  "account_name": "amlworkspace3186208600",
  "protocol": "https",
  "endpoint": "core.windows.net"
}


# Uploading files from directory to datastore

In [21]:
# uploading data to datastore
from azureml.core import Dataset
from azureml.data.datapath import DataPath

# Upload the contents of the local 'data' folder to 'diabetes-data/' on the default datastore.
# The call is the last expression so the notebook displays the value it returns.
upload_target = DataPath(default_datastore, path_on_datastore = 'diabetes-data/')
Dataset.File.upload_directory(src_dir = 'data', target = upload_target)

Validating arguments.
Arguments validated.
Uploading file to diabetes-data/
Uploading an estimated of 3 files
Uploading data/.amlignore
Uploaded data/.amlignore, 1 files out of an estimated total of 3
Uploading data/.amlignore.amltmp
Uploaded data/.amlignore.amltmp, 2 files out of an estimated total of 3
Uploading data/diabetes.csv
Uploaded data/diabetes.csv, 3 files out of an estimated total of 3
Uploaded 3 files
Creating new dataset


{
  "source": [
    "('workspaceblobstore', '/diabetes-data/')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

# Creating a dataset (a tabular dataset) from all the files in a datastore

In [22]:
# Creating a Tabular Dataset
from azureml.core import Dataset
default_datastore = ws.get_default_datastore()

# The files to read, given as a (datastore, files_path_in_datastore) tuple
csv_files = (default_datastore, 'diabetes-data/*.csv')

# Creating a tabular dataset from those delimited files
tabular_dataset = Dataset.Tabular.from_delimited_files(path = csv_files)

# Preview the first 10 rows as a pandas DataFrame
diabetes_df = tabular_dataset.to_pandas_dataframe()
diabetes_df.head(10)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


# Creating a File Dataset

In [24]:
# Build a file dataset over the CSV files under 'diabetes-data/' on the default datastore
file_glob = (ws.get_default_datastore(), 'diabetes-data/*.csv')
file_dataset = Dataset.File.from_files(path = file_glob)
print(file_dataset)

FileDataset
{
  "source": [
    "('workspaceblobstore', 'diabetes-data/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}


In [26]:
# Print the path of every file the file dataset refers to
for dataset_path in file_dataset.to_path():
    print(dataset_path)

/diabetes.csv


# Registering datasets to make them available to any experiment that is run in the workspace

##### Registered datasets can be found under "Data" in the "Assets" library of Azure Machine Learning Studio

## Tabular Dataset

In [31]:
# Register the tabular dataset in the workspace under the name 'tabular_dataset'
try:
    tabular_dataset = tabular_dataset.register(
        workspace = ws,
        name = 'tabular_dataset',
        description = "This data is about diabetes",
        tags = {'format': 'CSV'},
        create_new_version = True,
    )
except Exception as ex:
    # Report the failure and let the notebook carry on
    print(ex)


## File Dataset

In [33]:
# Register the file dataset in the workspace under the name 'file_dataset'
try:
    file_dataset = file_dataset.register(
        workspace = ws,
        name = 'file_dataset',
        tags = {'format': 'CSV'},
        create_new_version = True,
    )
except Exception as ex:
    # Report the failure and let the notebook carry on
    print(ex)

# Retrieving Tabular data from "Data" in "Assets" Library in Azure ML Studio

In [36]:
# Look the registered tabular dataset up by name, then preview its first 10 rows
tabular_dataset_in_data_assest_library = Dataset.get_by_name(workspace = ws, name = 'tabular_dataset')
registered_df = tabular_dataset_in_data_assest_library.to_pandas_dataframe()
registered_df.head(10)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
5,1619297,0,82,92,9,253,19.72416,0.103424,26,0
6,1660149,0,133,47,19,227,21.941357,0.17416,21,0
7,1458769,0,67,87,43,36,18.277723,0.236165,26,0
8,1201647,8,80,95,33,24,26.624929,0.443947,53,1
9,1403912,1,72,31,40,42,36.889576,0.103944,26,0


# Retrieving File data from "Data" in "Assets" Library in Azure ML Studio

In [38]:
# Look the registered file dataset up by name
file_dataset_in_data_assets_library = Dataset.get_by_name(workspace = ws, name = 'file_dataset')
# Bare last expression so the notebook displays the dataset's description
file_dataset_in_data_assets_library

{
  "source": [
    "('workspaceblobstore', 'diabetes-data/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "2cff7a23-aa5d-4fd7-88a0-4e178f8873fe",
    "name": "file_dataset",
    "version": 1,
    "tags": {
      "format": "CSV"
    },
    "workspace": "Workspace.create(name='aml-workspace', subscription_id='3571f8dc-3527-4993-9d2b-ac0812d807fd', resource_group='aml-resources')"
  }
}

# Training a model from a Tabular Dataset

In [39]:
import os

# Folder that will hold the experiment files
experiment_folder = 'diabetes_training_from_tab_datset'

# exist_ok = True keeps this cell re-runnable when the folder already exists
os.makedirs(name = experiment_folder, exist_ok = True)
print(experiment_folder, '-> Folder Created')

diabetes_training_from_tab_datset -> Folder Created


In [41]:
%%writefile $experiment_folder/training_script.py

import os 
import argparse
import pandas as pd 
import numpy as np
import joblib 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 

from azureml.core import Run, Dataset

# Getting the script arguments
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type = float, dest = 'reg_rate', default = 0.01, help = 'Regularization rate')
# '--input-data' is accepted so the script can be launched with it, but its parsed value
# (args.training_data_id) is not read below: the data is fetched from run.input_datasets.
parser.add_argument('--input-data', type = str, dest = 'training_data_id', help = 'Training Data')
args = parser.parse_args()

# Setting regularization rate (Passed as an argument from the ScriptRunConfig)
reg = args.reg_rate

# Get the Experiment run context
run = Run.get_context()

# Get the training dataset as a pandas DataFrame
# (as_name_input = 'training_data' in ScriptRunConfig File)
dataset = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels
feature_columns = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']
X, y = dataset[feature_columns].values, dataset['Diabetic'].values

# Splitting the dataset: 30% is held out for testing; the fixed random_state makes the split repeatable
# (the keyword was previously misspelled 'randome_state', which train_test_split rejects)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 198)

# Train logistic regression model
# C is the inverse of the regularization rate, so a larger rate means stronger regularization
run.log(name = 'Regularization rate', value = reg)
classifier = LogisticRegression(C = 1/reg)
classifier.fit(X_train, y_train)

# Calculate accuracy: fraction of test rows whose predicted label matches the true label
y_hat = classifier.predict(X_test)

accuracy = np.average(y_hat == y_test)
print('Accuracy: ', accuracy)
run.log(name = 'Accuracy', value = accuracy)

# Calculate AUC from the predicted probability of the positive class (column 1)
y_scores = classifier.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])

print('Area Under the Curve: ' + str(auc))
# Builtin float() replaces the np.float alias, which no longer exists in current NumPy releases
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok = True)

# File saved in the outputs folder is automatically uploaded into experiments record
joblib.dump(value = classifier, filename = 'outputs/diabetes-classifier.pkl')

run.complete()

Writing diabetes_training_from_tab_datset/training_script.py


In [61]:
from azureml.core import Experiment, ScriptRunConfig
from azureml.widgets import RunDetails

# Get the training dataset (ws.datasets -> datasets from "Data" in "Assets" Library)
dataset = ws.datasets.get('tabular_dataset')

# Arguments handed to the training script; the dataset is passed as a named input
# called 'training_data'
script_arguments = [
    '--regularization', 0.1,
    '--input-data', dataset.as_named_input('training_data'),
]

# Create a ScriptRunConfig
script_config = ScriptRunConfig(
    source_directory = experiment_folder,
    script = 'training_script.py',
    arguments = script_arguments,
)

# Submitting the experiment
experiment_name = 'mslearn-train-diabetes'
experiment = Experiment(workspace = ws, name = experiment_name)
run = experiment.submit(config = script_config)

# Show the run details widget in the notebook
RunDetails(run).show()
# run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…