
## Train Model with Titanic Dataset using Hyperdrive experiment

# Connect to your workspace

Let's get started by connecting to the AML workspace leveraging the Azure ML SDK 

In [2]:
import azureml.core
from azureml.core import Workspace
import pandas as pd

# Load the workspace from the saved config file
# (from_config() is called without arguments, so no explicit config path is given here)
ws = Workspace.from_config()
# Confirm the connection by printing the SDK version and the workspace name
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with mm-hackathon-prep


# Explore Dataset

You're going to use a Python script to train a machine learning model based on the Titanic dataset found in your data folder.  

In [11]:
# Load the two halves of the training data and join them on the passenger key.
df1 = pd.read_csv('./Data/Train1.csv')
df2 = pd.read_csv('./Data/Train2.csv')
print(df1.shape)
print(df2.shape)
df = df1.merge(df2, on = 'passenger_id', how = 'inner')

# Rows without a label are treated as "did not survive".
df['survived'] = df['survived'].fillna(0)
# 'loc' is the first character of the cabin value; 'X' marks a missing cabin.
df['loc']= df['cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'X')
# Fill missing ages with the median age of the passenger's class.
# transform('median') returns a Series aligned to df's own index, so the result does not
# depend on how groupby.apply() shapes its output index, and rows whose pclass is missing
# keep their original age instead of being dropped from the grouped result.
df['age'] = df['age'].fillna(df.groupby('pclass')['age'].transform('median'))
df = df.drop(['name', 'ticket', 'home.dest', 'cabin', 'passenger_id'], axis = 1)


# Feature names = every remaining column except the label.
df_features = list(df.columns)
df_features.remove("survived")


from sklearn.preprocessing import LabelEncoder
# The same encoder object is re-fitted for each column, so after this cell it only
# remembers the classes of the last column ('loc').
label_encoder = LabelEncoder()
df['embarked'] = df['embarked'].fillna('S')
df['embarked'] = label_encoder.fit_transform(df['embarked'])
df['sex'] = label_encoder.fit_transform(df['sex'])
df['loc'] = label_encoder.fit_transform(df['loc'])

print(df.shape)
df.head()

(917, 6)
(917, 8)
(917, 9)


Unnamed: 0,fare,embarked,survived,pclass,sex,age,sibsp,parch,loc
0,8.05,2,0.0,3.0,1,24.0,0.0,0.0,8
1,21.0,2,0.0,2.0,1,43.0,0.0,1.0,8
2,24.15,2,0.0,3.0,0,10.0,0.0,2.0,8
3,15.5,1,0.0,3.0,1,24.0,0.0,0.0,8
4,211.3375,2,1.0,1.0,0,43.0,0.0,1.0,1


In [12]:
import os

# Folder under the current working directory that holds the training script and its assets.
script_folder = os.path.join(os.getcwd(), "hyperparameter_train")
os.makedirs(script_folder, exist_ok=True)  # no error if the folder already exists
print(script_folder)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/titantic-dataset-private/Supplemental_Folders/Additional_Notebooks/hyperparameter_train


In [13]:
# Write the prepared frame next to the training script.
# index=False keeps the positional row index out of the file; otherwise it is written as an
# unnamed first column and is picked up as a numeric feature when the dataset is loaded
# for training.
df.to_csv('./hyperparameter_train/titanic.csv', index=False)
df.head(2)

Unnamed: 0,fare,embarked,survived,pclass,sex,age,sibsp,parch,loc
0,8.05,2,0.0,3.0,1,24.0,0.0,0.0,8
1,21.0,2,0.0,2.0,1,43.0,0.0,1.0,8


In [14]:
from azureml.core import Dataset

# Use the default datastore retrieved from the workspace through the AML SDK
default_ds = ws.get_default_datastore()


default_ds.upload_files(files=['./hyperparameter_train/titanic.csv'], # Upload the prepared Titanic csv written by the previous cell
                        target_path= 'hyper-drive-data', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)
# Create a tabular dataset from the uploaded file's path on the datastore
dataset = Dataset.Tabular.from_delimited_files(default_ds.path('hyper-drive-data/titanic.csv'))

# Register the dataset (create_new_version=True registers a new version under the same name)
# NOTE(review): if registration raises, the error is only printed and `tab_data_set` is not
# assigned by this cell; the later cell that calls tab_data_set.as_named_input(...) depends on it.
try:
    tab_data_set = dataset.register(workspace=ws, 
                                name= 'hyper-drive-data-titanic-tabular-dataset',
                                description='Tintanic data',
                                tags = {'format':'csv'},
                                create_new_version=True)
    print('Dataset registered.')
except Exception as ex:
        print(ex)


"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 1 files
Uploading ./hyperparameter_train/titanic.csv
Uploaded ./hyperparameter_train/titanic.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Dataset registered.


In [15]:
from azureml.core.experiment import Experiment
# Handle to the 'titanic_hyperdrive_exp' experiment in the workspace.
# NOTE(review): the cell that submits the HyperDrive run rebinds `experiment` to
# Experiment(workspace=ws, name='titanic-hyperdrive'), so the run is not submitted to this one.
experiment = Experiment(ws, 'titanic_hyperdrive_exp')

## Create Training Script

Create a folder to hold your training script. Running a cell that starts with the cell magic **%%writefile** generates a file and places it in **$script_folder**.

In [16]:
import os

# Same folder as created earlier in the notebook: <current working directory>/hyperparameter_train.
script_folder = os.path.join(os.getcwd(), "hyperparameter_train")
os.makedirs(script_folder, exist_ok=True)  # no error if the folder already exists
print(script_folder)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/titantic-dataset-private/Supplemental_Folders/Additional_Notebooks/hyperparameter_train


In [17]:
%%writefile $script_folder/training.py

import os
import sys
import argparse
import joblib
import pandas as pd
import numpy as np

from azureml.core import Run, Dataset, Workspace, Experiment

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score,roc_curve

# Calculate model performance metrics
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

def getRuntimeArgs():
    """Parse the command-line arguments passed to the training script.

    Returns:
        argparse.Namespace with the attributes
          input_data   -- value of --input-data (str, None when not supplied)
          penalty_term -- value of --penalty_term (str, default 'l2')
          C            -- value of --C (float, default 0.1)
    """
    # A single parser is enough; the original built one and immediately discarded it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str)
    parser.add_argument("--penalty_term", type=str, dest = 'penalty_term', default = 'l2', help = 'penalty term')
    # C is handed to LogisticRegression(C=...), i.e. it is the inverse regularization
    # strength, not a learning rate.
    parser.add_argument('--C', type=float, dest='C', default=0.1, help='inverse of regularization strength')
    args = parser.parse_args()
    return args

def buildpreprocessorpipeline(X_raw):
    """Build the (unfitted) column preprocessor for the given feature frame.

    Columns are chosen by dtype from ``X_raw``:
      * 'float' / 'int64' columns are standard-scaled;
      * 'object' columns are imputed with the constant "missing" and one-hot encoded
        (categories unseen at fit time are ignored at transform time).
    Any other column is dropped.

    Args:
        X_raw: pandas DataFrame of features, used only to read column dtypes.

    Returns:
        A sklearn ColumnTransformer with 'numeric' and 'categorical' transformers.
    """
    object_columns = X_raw.select_dtypes(include=['object']).columns
    number_columns = X_raw.select_dtypes(include=['float','int64']).columns

    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')),
    ])

    return ColumnTransformer(
        transformers=[
            ('numeric', scale_numeric, number_columns),
            ('categorical', encode_categorical, object_columns),
        ],
        remainder="drop",
    )

def model_train(LABEL, df, run, penalty_term, C_value):
    """Train a logistic-regression pipeline and log its test metrics to the run.

    The frame is split 70/30 (random_state=0), a preprocessing + LogisticRegression
    pipeline is fitted on the training part, and AUC, accuracy, a ROC plot and a
    confusion matrix computed on the test part are logged through ``run``.

    Args:
        LABEL: name of the target column in ``df``.
        df: pandas DataFrame holding the features and the target column.
        run: object used for logging; ``log``, ``log_image`` and
            ``log_confusion_matrix`` are called on it.
        penalty_term: value passed as ``penalty`` to LogisticRegression.
        C_value: value passed as ``C`` to LogisticRegression.

    Returns:
        Tuple ``(model, auc, acc)``: the fitted pipeline, test AUC and test accuracy.
    """
    y_raw = df[LABEL]
    X_raw = df.drop([LABEL], axis=1)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.3, random_state=0)

    lg = LogisticRegression(penalty=penalty_term, C=C_value, solver='liblinear')
    preprocessor = buildpreprocessorpipeline(X_train)

    # Estimator instance. The step name 'regressor' is kept as-is so the structure of the
    # saved pipeline does not change, even though the final step is a classifier.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', lg)])

    model = clf.fit(X_train, y_train)

    # calculate AUC from the predicted probability of the second class
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test,y_scores[:,1])
    print('AUC: ' + str(auc))
    # Built-in float() replaces the np.float alias, which newer NumPy releases no longer provide.
    run.log('AUC', float(auc))

    # calculate test accuracy
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)
    run.log('Accuracy', float(acc))

    # plot ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
    fig = plt.figure(figsize=(6, 4))
    # Plot the diagonal 50% line
    plt.plot([0, 1], [0, 1], 'k--')
    # Plot the FPR and TPR achieved by our model
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    run.log_image(name = "ROC", plot = fig)
    plt.show()
    # Release the figure once it has been logged so it does not stay open in the process.
    plt.close(fig)

    # Generate the confusion matrix and wrap it in the JSON layout passed to log_confusion_matrix
    cmatrix = confusion_matrix(y_test, y_hat)
    cmatrix_json = {
        "schema_type": "confusion_matrix",
           "schema_version": "v1",
           "data": {
               "class_labels": ["0", "1"],
               "matrix": [
                   [int(x) for x in cmatrix[0]],
                   [int(x) for x in cmatrix[1]]
               ]
           }
    }

    run.log_confusion_matrix('ConfusionMatrix_Test', cmatrix_json)

    return model, auc, acc
    
    
def main():
    """Entry point of the training script.

    Reads the runtime arguments, loads the dataset attached to the run under the name
    'titanic', trains the model, saves it to outputs/titanic_model.pkl and registers it
    in the run's workspace as 'titanic_model'.
    """
    args = getRuntimeArgs()

    # Log the hyperparameters of this run so they appear next to the metrics.
    run = Run.get_context()
    run.log('penalty_term', args.penalty_term)
    # Built-in float() replaces the np.float alias, which newer NumPy releases no longer provide.
    run.log('C', float(args.C))
    dataset_dir = './dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    ws = run.experiment.workspace
    print(ws)

    print("Loading Data...")
    # The dataset is looked up by the name it was attached with ('titanic'),
    # not through the --input-data argument.
    dataset = run.input_datasets['titanic']
    # Load a TabularDataset & save into pandas DataFrame
    df = dataset.to_pandas_dataframe()

    print(df.head(5))

    # The prepared csv names the label column 'survived' (lower case); passing 'Survived'
    # would raise a KeyError inside model_train.
    model, auc, acc = model_train('survived', df, run, args.penalty_term, args.C)

    # Save the trained model under outputs/
    os.makedirs('outputs', exist_ok=True)
    model_file = os.path.join('outputs', 'titanic_model.pkl')
    joblib.dump(value=model, filename=model_file)

    # Register the model
    print('Registering model...')
    Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'titanic_model',
               tags={'Training context':'Compute'},
               properties={'AUC': float(auc), 'Accuracy': float(acc)})

    run.complete()

if __name__ == "__main__":
    main()

Writing /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/titantic-dataset-private/Supplemental_Folders/Additional_Notebooks/hyperparameter_train/training.py


## Define an Environment

In [18]:
%%writefile $script_folder/experiment_env.yml
# Conda specification for the environment the training script runs in.
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
  # Conda packages. training.py also imports joblib and numpy, which are not listed
  # explicitly here - presumably they arrive as dependencies of scikit-learn; confirm.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
  # Packages installed with pip inside the conda environment.
- pip:
  - azureml-defaults
  - pyarrow

Writing /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/titantic-dataset-private/Supplemental_Folders/Additional_Notebooks/hyperparameter_train/experiment_env.yml


Create an environment from the conda specification file

In [19]:
from azureml.core import Environment

# Create a Python environment for the experiment from the conda .yml file written above
# ('experiment-env' is the environment name; the second argument is the path to the spec)
experiment_env = Environment.from_conda_specification('experiment-env', script_folder + "/experiment_env.yml")

# Let Azure ML manage dependencies
experiment_env.python.user_managed_dependencies = False 

# Print the environment details (name, then the conda dependencies as text)
print(experiment_env.name, 'defined.')
print(experiment_env.python.conda_dependencies.serialize_to_string())

experiment-env defined.
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow



## Create a compute cluster

Previously, the model was trained on a compute instance. By creating a compute cluster, we can submit the job to the cluster instead.

In [20]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster to reuse or create
cluster_name = "aml-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    # NOTE(review): if creation fails, the error is only printed and `training_cluster`
    # is not assigned by this cell; the later training_cluster.get_status() call depends on it.
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        # Block until provisioning finishes, printing progress
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [23]:
import azureml.core.runconfig
from azureml.core import Environment, Experiment
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Get the training dataset
# NOTE(review): `titanic_ds` is not used below - the run config passes `tab_data_set`, which was
# registered earlier under the name 'hyper-drive-data-titanic-tabular-dataset', not
# 'Titanic-tabular-dataset'. Confirm whether this lookup is still needed.
titanic_ds = ws.datasets.get('Titanic-tabular-dataset')

# Create a script config: run training.py from script_folder on the cluster, in experiment_env
script_config = ScriptRunConfig(source_directory=script_folder,
                                script='training.py',
                                arguments=['--input-data', tab_data_set.as_named_input('titanic')], # Reference to dataset, read in the script via the name 'titanic'
                                environment=experiment_env,
                                compute_target=cluster_name)

params = GridParameterSampling(
    {
        # Hyperdrive will try 8 combinations (2 penalties x 4 values of C), adding these as script arguments
        '--penalty_term': choice('l2', 'l1'),
        '--C': choice(.01, .1, 1,10)
    }
)

hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, # No early stopping policy
                          primary_metric_name='AUC', # Must match the metric name logged by training.py ('AUC')
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=8, # Cap the experiment at 8 runs, enough for the full grid
                          max_concurrent_runs=2) # Run up to 2 iterations in parallel

# Run the experiment
experiment = Experiment(workspace=ws, name='titanic-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [24]:
# Query the cluster and print its allocation state and current node count
cluster_state = training_cluster.get_status()
print(cluster_state.allocation_state, cluster_state.current_node_count)

Steady 1


In [None]:
# Block until the submitted HyperDrive run has finished; the cells below read its results
run.wait_for_completion()

## Find the best run

In [None]:
# Set to 1 to also print every child run, sorted by the primary metric
printchild_run = 0
if printchild_run == 1:
    print("child run information")
    for child_run in run.get_children_sorted_by_primary_metric():
        print(child_run)

# Get the best run (by the primary metric configured for the HyperDrive run)
best_run = run.get_best_run_by_primary_metric()
# Get the metrics for the best run
best_run_metrics = best_run.get_metrics()

# Review the metrics and the script arguments of the best run
script_arguments = best_run.get_details() ['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)

## Register the Best Model

In [None]:
from azureml.core import Model

# Register the model file saved by the best run, tagging it with the run's metrics
best_run.register_model(
    model_path='outputs/titanic_model.pkl',
    model_name='titanic_model',
    tags={'Training context':'Hyperdrive'},
    properties={'AUC': best_run_metrics['AUC'], 'Accuracy': best_run_metrics['Accuracy']},
)

# List every registered model in the workspace with its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')