# Tutorial: Training a diabetes model from parquet files using Dask DataFrame

In this tutorial we explore the Dask DataFrame API and use it to train a diabetes model.

This tutorial includes the following:
- Convert a csv file into parquet files, partitioned on a column.
- Train a model from the parquet files.

## Install DASK dataframe using pip

In [None]:
pip install dask

## Create a folder to store the experiment files 

In [None]:
import os

# Local directory that holds every file belonging to this experiment
training_folder = 'diabetes-training'

# Create the directory (and any missing parents); an existing one is left as is
os.makedirs(name=training_folder, exist_ok=True)

## Copy the csv file into the training folder and convert it into parquet files

Note: You can skip the copy step and convert the csv file into parquet files directly; just make sure to provide the correct paths for the files.

In [None]:
# Convert a csv file into parquet files partitioned on a column

import os
import shutil

import dask.dataframe as dd

# Copy the data file into the experiment folder. shutil.copy returns the path
# of the new copy, so the exact same location is used for reading below.
csv_path = os.path.join(os.getcwd(), training_folder)
csv_file = shutil.copy('diabetes.csv', os.path.join(csv_path, "diabetes.csv"))

# Read the csv file into a Dask dataframe and write it back as parquet files
# partitioned on the 'Diabetic' column (partition_on takes a list of columns)
diabetes_ddf = dd.read_csv(csv_file)
diabetes_ddf.to_parquet(csv_path, write_index=False, partition_on=['Diabetic'])

## Create a workspace
If the workspace already exists, connect to it instead.

In [None]:
# Workspace must be imported in this cell: it is the first cell that uses it
from azureml.core import Workspace

# Create the workspace; exist_ok=True lets the call succeed when a workspace
# with this name already exists. Replace the placeholder values with your own.
ws = Workspace.create(
    name = "Your Workspace Name",
    subscription_id = "Your Subsription Id",
    resource_group = "Your Resource Group", 
    location = "Your location",  # e.g "westus"
    exist_ok = True,
    show_output = True)

# Save the workspace details to a config file so that later cells can
# reconnect with Workspace.from_config()
ws.write_config()

In [None]:
import azureml.core
from azureml.core import Workspace

# Reconnect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the installed SDK version and the workspace we are connected to
ready_message = 'Ready to use Azure ML {} to work with {}'
print(ready_message.format(azureml.core.VERSION, ws.name))

## Upload the training folder to the default datastore

In [None]:
# Fetch the workspace's default datastore and remember its name
default_ds = ws.get_default_datastore()
default_ds_name = default_ds.name

# Print every datastore name together with a flag telling whether it is the default
for ds_name in ws.datastores:
    is_default = ds_name == default_ds_name
    print(ds_name, "- Default =", is_default)

In [None]:
# Upload the training folder to the workspace's default datastore

from azureml.core import Dataset
from azureml.data.datapath import DataPath

# Use the training_folder variable rather than repeating the literal folder
# name, so this cell stays in sync with the folder created earlier
Dataset.File.upload_directory(src_dir=training_folder,
                              target=DataPath(default_ds, 'dask-diabetes-training/')
                              )

## Create a parquet file dataset and register it in the workspace

In [None]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath

# Build a file dataset from the parquet files that sit one folder level below
# the upload target on the datastore (this may take a short while)
parquet_pattern = 'dask-diabetes-training/*/*.parquet'
file_data_set = Dataset.File.from_files(path=(default_ds, parquet_pattern))

# Show the path of every file the dataset contains
for file_path in file_data_set.to_path():
    print(file_path)

In [None]:
# Register the file dataset (a new version is created if the name already exists)
try:
    file_data_set = file_data_set.register(workspace=ws,
                                            name='parquet diabetes file dataset',
                                            description='parquet files',
                                            tags = {'format':'parquet'},
                                            create_new_version=True)
except Exception as ex:
    # Report the failure and stop here: the training cells below look the
    # dataset up by name, so continuing after a failed registration would
    # only fail later with a less helpful error (or use a stale version).
    print(ex)
    raise

Print all workspace datasets with the following command:

In [None]:
# Print the name and version of every dataset registered in the workspace
print("Datasets:")
registered_names = list(ws.datasets.keys())
for dataset_name in registered_names:
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

## Train a model from a parquet file dataset
- A script that trains a classification model by using a parquet file dataset that is passed to it as an input.

In [None]:
%%writefile $training_folder/diabetes_parquet_training.py
# Import libraries
from azureml.core import Run
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import dask.dataframe as dd
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input-data', type=str, dest='dataset_folder', help='data mount point')
args = parser.parse_args()

run = Run.get_context()
# load the diabetes dataset
print("Loading Data...")

data_path = args.dataset_folder
diabetes = dd.read_parquet(path=[data_path+'/Diabetic=*/*.parquet'], engine='pyarrow')

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].map_partitions(np.asarray)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X.compute(), y.compute(), test_size=0.30, random_state=0)

# Set regularization hyperparameter
reg = 0.01

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_parquet_model.pkl')

run.complete()

## Run the training script as an experiment
The conda environment is built on-demand the first time the experiment is run, and cached for future runs that use the same configuration; so the first run will take a little longer.

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails

# Build the Python environment for the experiment from the conda .yml file
env = Environment.from_conda_specification("dask_frame_env", "environment.yml")

# Look up the registered parquet file dataset and ask for it to be downloaded
# to a fixed folder; that folder is what the script receives as --input-data
diabetes_parquet_ds = ws.datasets.get("parquet diabetes file dataset")
dataset_input = diabetes_parquet_ds.as_download(path_on_compute="/tmp/training_files")

# Describe which script to run, with which arguments, in which environment
script_config = ScriptRunConfig(
    source_directory=training_folder,
    script='diabetes_parquet_training.py',
    arguments=['--input-data', dataset_input],
    environment=env,
)


In [None]:
# Remove the download folder if it already exists
import shutil

download_file_path = "/tmp/training_files"
if os.path.exists(download_file_path):
    shutil.rmtree(download_file_path)

# Create (or reuse) the experiment and submit the configured script run
experiment_name = 'parquet-train-diabetes'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)

# Display the notebook widget that tracks the submitted run
run_widget = RunDetails(run)
run_widget.show()

# Wait here until the experiment run has completed
run.wait_for_completion()

## Register the trained model
Note that the outputs of the experiment include the trained model file (**diabetes_parquet_model.pkl**). You can register this model in your AML workspace, making it possible to track model versions and retrieve them later.

In [None]:
# Register the model
from azureml.core import Model

# Fetch the logged metrics once instead of calling get_metrics() per property
metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_parquet_model.pkl', model_name='diabetes_parquet_model',
                   properties={'AUC': metrics['AUC'], 'Accuracy': metrics['Accuracy']})

# List registered models together with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')