In [None]:
# Make data accessible to a team of data scientists
# Connect to the workspace
import azureml.core
from azureml.core import Workspace  # package name must be spelled 'azureml'

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use AzureML {} to work with {}'.format(azureml.core.VERSION, ws.name))

In [None]:
# Work with datastores
# In Azure ML, a datastore is a reference to a storage location, such as an
# Azure Storage blob container. Every workspace has a default datastore -
# usually the Azure Storage blob container that was created with the workspace.
# If you need to work with data stored in different locations, you can add
# custom datastores to the workspace and set any of them to be the default.

# Get the default datastore of the current workspace
# (datastores can also be viewed and managed through ML studio)
default_ds = ws.get_default_datastore()

# Upload to the datastore
# Files from the local file system can be uploaded to a datastore so that they
# are accessible to experiments running in the workspace, regardless of where
# the experiment script is actually being run.
local_csv_files = ['./data/diabetes.csv', './data/diabetes2.csv']  # the diabetes csv files in /data
default_ds.upload_files(
    files=local_csv_files,
    target_path='diabetes-data/',  # folder path inside the datastore
    overwrite=True,                # replace existing files of the same name
    show_progress=True,
)

# Train a model from the datastore
# Get a reference to the diabetes-data folder the CSV files were uploaded to,
# configured for download - in other words, it can be used to download the
# contents of the folder to the compute context where the reference is used.
datastore_folder = default_ds.path('diabetes-data')
data_ref = datastore_folder.as_download(path_on_compute='diabetes_data')
print(data_ref)

In [None]:
# Create a folder for the experiment files; the experiment script is written into it
import os

# Name of the folder that holds the experiment files
experiment_folder = 'diabetes_training_from_datastore'

# exist_ok=True keeps this cell safe to re-run when the folder already exists
os.makedirs(name=experiment_folder, exist_ok=True)
print('{} folder created'.format(experiment_folder))

%%writefile $experiment_folder/diabetes_training.py
import os 
import argparse
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#Get paramaters
parser=argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder reference')
args = parser.parse_args()
reg = args.reg_rate

#Get the experiment run context
run=Run.get_context()

#Load diabetes data from data reference
data_folder=args.data_folder
print('Lodingd data from', data_folder)
#Load all files and concatenate contents into a single dataframe
all_files=os.listdir(data_folder)
diabetes=pd.concat((pd.read_csv(os.path.join(data_folder,csv_file)) for csv_file in all_files))

#Seperate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

#Split data in to training data and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=0)

#Train a logistic regression model
print('Training logistic regression model with a regularization rate of ', reg)
run.log('Regularization Rate',np.float(reg))
model=LogisticRegression(C=1/reg,solver='liblinear').fit(X_train,y_train)

#Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

#Calculate auc
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

# Work with dataset
Although you can read data directly from datastores, Azure ML provides a further abstraction: datasets. A dataset is a reference to a specific set of data. Datasets can be
tabular or file-based.

## Create a tabular dataset

In [None]:
from azureml.core import Dataset

# Fetch the workspace's default datastore
default_ds = ws.get_default_datastore()

# Build a tabular dataset from the CSV files matched by the path pattern on
# the datastore (may take a while)
csv_location = (default_ds, 'diabetes-data/*.csv')
tab_data_set = Dataset.Tabular.from_delimited_files(path=csv_location)

#Display the first 20 rows as a Pandas dataframe
