# 熱中症患者数予測
#### 気象データから熱中症患者数を予測する回帰モデルを作成する

### 01.training.ipynb
学習データの読み込み・加工・モデルの学習

### 【実行必須】 前章の復習`00.setup.ipynb`

In [2]:
import azureml
from azureml.core import Workspace, Run
from azureml.core import Experiment
import os
import pandas as pd

print("Azure ML SDK Version: ", azureml.core.VERSION)

# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()
# Print each identifying attribute once (location used to be listed twice).
print(ws.name, ws.location, ws.resource_group, sep = '\t')

experiment_name = 'Heatstroke_patient_prediction'
exp = Experiment(workspace=ws, name=experiment_name)

project_folder = './sample_projects/Heatstroke_patient_prediction'

# Collect the session settings in one table so they can be checked at a glance.
# NOTE: the table includes the subscription ID; clear the cell output before sharing.
output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace Name'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
output['Experiment Name'] = exp.name

# Show full cell contents without truncation. `None` is the value accepted by
# pandas >= 1.0; older releases only accept the legacy `-1` sentinel, so fall
# back to it when `None` is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Azure ML SDK Version:  1.0.57
azmls	japaneast	data-hack	japaneast


Unnamed: 0,Unnamed: 1
SDK version,1.0.57
Subscription ID,f8e38435-13b5-4b97-96c4-39b3bc918c54
Workspace Name,azmls
Resource Group,data-hack
Location,japaneast
Project Directory,./sample_projects/Heatstroke_patient_prediction
Experiment Name,Heatstroke_patient_prediction


## 1. トレーニング実行用の ML Compute 作成

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpucluster"

try:
    # Look the compute target up by name; this raises if it does not exist yet.
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No such cluster: provision one that can scale between 0 and 8 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D4_V2',  # alternative size noted by the author: 'Standard_NC6'
        min_nodes=0,
        max_nodes=8,
    )
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait until the cluster reports a terminal provisioning state.
cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


## 2. トレーニングデータをAzure Blob Storageにアップロード

In [4]:
import pandas as pd
# Load the local training CSV and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='./data/Heatstroke_patient_prediction_train_data.csv')
df.head(n=10)

Unnamed: 0,年月日,搬送人員（計）,最高気温(℃),平均気温(℃),最低気温(℃),日照時間(時間),平均風速(m/s),平均雲量(10分比),平均湿度(％),降水量の合計(mm),...,m前日最高気温との差,m前日平均気温との差,m前日最低気温との差,m最高気温移動平均(5日間),m平均気温移動平均(5日間),m体感温度移動平均(5日間),m不快指数移動平均(5日間),m前日の搬送人数,m搬送人数移動平均(5日間),年
0,7/2/2008 12:00:00 AM,4,26.9,23.7,21.1,8.3,3.1,9.5,70,0.0,...,1.4,1.9,2.5,24.82,21.9,19.750477,69.1543,0,0.0,2008
1,7/3/2008 12:00:00 AM,1,27.3,24.3,22.1,1.4,5.0,7.8,75,0.0,...,0.4,0.6,1.0,25.14,22.32,20.025609,69.813896,4,2.0,2008
2,7/4/2008 12:00:00 AM,14,31.6,26.5,23.1,8.7,3.7,7.8,77,16.0,...,4.3,2.2,1.0,25.38,22.5,20.171603,70.182006,1,1.666667,2008
3,7/5/2008 12:00:00 AM,26,31.4,27.5,24.9,8.0,2.3,7.8,72,0.0,...,-0.2,1.0,1.8,27.2,23.6,22.279985,71.768756,14,4.75,2008
4,7/6/2008 12:00:00 AM,15,30.7,27.0,24.7,4.2,2.5,10.0,76,0.0,...,-0.7,-0.5,-0.2,28.54,24.76,23.812901,73.535204,26,9.0,2008
5,7/7/2008 12:00:00 AM,1,26.9,25.1,24.0,0.0,2.1,10.0,81,15.0,...,-3.8,-1.9,-0.7,29.58,25.8,25.12746,75.23292,15,12.0,2008
6,7/8/2008 12:00:00 AM,1,26.5,24.1,22.8,0.3,3.0,10.0,80,1.0,...,-0.4,-1.0,-1.2,29.58,26.08,25.375582,75.885838,1,11.4,2008
7,7/9/2008 12:00:00 AM,2,26.1,24.3,22.7,1.2,2.5,10.0,68,0.0,...,-0.4,0.2,-0.1,29.42,26.04,25.393262,75.919328,1,11.4,2008
8,7/10/2008 12:00:00 AM,2,27.8,24.6,22.1,2.5,2.3,9.0,70,0.0,...,1.7,0.3,-0.6,28.32,25.6,24.053976,75.05189,2,9.0,2008
9,7/11/2008 12:00:00 AM,6,29.4,26.0,23.5,5.5,3.1,8.8,71,0.0,...,1.6,1.4,1.4,27.6,25.02,23.196735,74.12845,2,4.2,2008


In [5]:
# Use the workspace's default datastore as the upload destination.
ds = ws.get_default_datastore()
print(*[ds.name, ds.datastore_type, ds.account_name, ds.container_name])

# Upload the training CSV, replacing any copy that is already stored there.
ds.upload_files(
    files=['./data/Heatstroke_patient_prediction_train_data.csv'],
    overwrite=True,
)

workspaceblobstore AzureBlob azmls1405322897 azureml-blobstore-5e66db71-88c8-45ad-9f28-9a6883032c4e
Uploading an estimated of 1 files
Uploading ./data/Heatstroke_patient_prediction_train_data.csv
Uploaded ./data/Heatstroke_patient_prediction_train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_workspaceblobstore

## 3. Automated ML 機能を使って予測モデルを作成

### モデル学習用スクリプト作成

In [6]:
%%writefile $project_folder/get_data.py

import numpy as np
from azure.storage.blob import BlockBlobService
import pandas as pd
import os.path

def get_data():
    file_name = "Heatstroke_patient_prediction_train_data.csv"
    if not os.path.exists('./' + file_name) :
        account_name='azmls1405322897'
        account_key='0oGnArgZlzIs+YvO8bUoWFH61GXexPTFJoCWshh9xhdbGnLT0Khk6wZ5OpuZDkRe/BFDddcBK2rhVtkcTLnFHw=='
        container_name='azureml-blobstore-5e66db71-88c8-45ad-9f28-9a6883032c4e'
        blob_name=file_name
        
        service = BlockBlobService(account_name=account_name, account_key=account_key)
        service.get_blob_to_path(container_name, blob_name, file_name)
    
    df = pd.read_csv('./' + file_name)
    X_train = df.drop(columns=["年月日","年","月","搬送人員（計）"],axis=1)
    y_df = df["搬送人員（計）"]
    y_train = y_df.values

    return { "X" : X_train, "y" : y_train }

Overwriting ./sample_projects/Heatstroke_patient_prediction/get_data.py


### モデル学習実行環境定義

In [7]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.automl import AutoMLConfig
import logging

# Execution environment for the remote run: Python inside Docker on the cluster.
run_config = RunConfiguration(framework="python")
run_config.target = cluster.name

docker_settings = run_config.environment.docker
docker_settings.enabled = True
docker_settings.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE

# Packages installed into the remote conda environment.
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['numpy', 'pandas'],
    pip_packages=['azureml-sdk[automl]', 'azure-storage'],
)

# Search/budget settings for the AutoML experiment.
automl_settings = {
    'primary_metric': 'normalized_mean_absolute_error',
    'iteration_timeout_minutes': 10,
    'iterations': 30,
    'n_cross_validations': 3,
    'max_concurrent_iterations': 4,
    'experiment_exit_score': 0,
    'preprocess': True,
    'verbosity': logging.INFO,
}

automated_ml_config = AutoMLConfig(
    task='regression',
    path=project_folder,
    data_script=f"{project_folder}/get_data.py",
    run_configuration=run_config,
    debug_log='automated_ml_errors.log',
    **automl_settings,
)

### Azure上に学習環境を構築

In [8]:
# Submit the AutoML configuration to the experiment without streaming logs here.
run_automl = exp.submit(config=automated_ml_config, show_output=False)
run_automl

Experiment,Id,Type,Status,Details Page,Docs Page
Heatstroke_patient_prediction,AutoML_2c1fff4b-703f-4351-bc91-a81a2d203f92,automl,Starting,Link to Azure Portal,Link to Documentation


## 4. 学習を実行

In [9]:
from azureml.widgets import RunDetails
# Render the monitoring widget for the submitted AutoML run.
run_widget = RunDetails(run_automl)
run_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [10]:
%%time
# Wait for the AutoML run to finish, echoing its per-iteration progress to stdout;
# the %%time magic reports how long the cell took.
run_automl.wait_for_completion(show_output=True)


****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         3   StandardScalerWrapper ExtremeRandomTrees       0:01:03       0.0271    0.0271
         2   StandardScalerWrapper RandomForest             0:02:57       0.0359    0.0271
         1   StandardScalerWrapper ElasticNet               0:03:50       0.0388    0.0271
         0   StandardScalerWrapper ElasticNet               0:02:00       0.0375    0.0271
         7   MinMaxScaler RandomForest                      0:02:37       0.0264    

{'runId': 'AutoML_2c1fff4b-703f-4351-bc91-a81a2d203f92',
 'target': 'cpucluster',
 'status': 'Completed',
 'startTimeUtc': '2019-09-27T04:26:26.679921Z',
 'endTimeUtc': '2019-09-27T04:41:02.476043Z',
 'properties': {'num_iterations': '30',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_mean_absolute_error',
  'train_split': '0',
  'MaxTimeSeconds': '600',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'cpucluster',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'regression',
  'dependencies_versions': '{"azureml-widgets": "1.0.57", "azureml-train": "1.0.57", "azureml-train-restclients-hyperdrive": "1.0.57", "azureml-train-core": "1.0.57", "azureml-train-automl": "1.0.57", "azureml-telemetry": "1.0.57", "azureml-sdk": "1.0.57", "azureml-pipeline": "1.0.57", "azureml-pipeline-steps": "1.0.57", "azureml-pipeline-core

In [11]:
# Gather the float-valued metrics of every child run, keyed by iteration number.
children = list(run_automl.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    # Keep scalar float metrics only; anything else cannot go in the table.
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

# One column per iteration, ordered by iteration number. `axis` is passed by
# keyword because newer pandas releases no longer accept it positionally.
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
rundata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
explained_variance,0.8,0.79,0.75,0.86,0.77,0.81,0.83,0.87,0.79,0.74,...,0.86,0.8,0.79,0.79,0.76,0.82,0.78,0.85,0.88,0.88
mean_absolute_error,11.37,11.76,10.87,8.22,9.96,9.44,10.82,8.0,11.61,10.74,...,8.05,11.39,11.7,11.7,10.63,9.2,11.76,8.4,7.47,7.91
mean_absolute_percentage_error,176.84,179.92,98.58,65.51,71.3,71.42,165.89,61.33,173.47,79.33,...,61.91,177.38,181.74,178.5,92.58,72.58,170.77,69.55,61.26,75.67
median_absolute_error,7.88,8.17,5.07,3.63,4.1,4.01,7.47,3.54,7.8,4.42,...,3.48,7.99,8.01,8.14,4.63,4.03,8.04,3.75,3.2,3.81
normalized_mean_absolute_error,0.04,0.04,0.04,0.03,0.03,0.03,0.04,0.03,0.04,0.04,...,0.03,0.04,0.04,0.04,0.04,0.03,0.04,0.03,0.02,0.03
normalized_median_absolute_error,0.03,0.03,0.02,0.01,0.01,0.01,0.02,0.01,0.03,0.01,...,0.01,0.03,0.03,0.03,0.02,0.01,0.03,0.01,0.01,0.01
normalized_root_mean_squared_error,0.06,0.06,0.07,0.05,0.06,0.06,0.05,0.05,0.06,0.07,...,0.05,0.06,0.06,0.06,0.06,0.06,0.06,0.05,0.05,0.05
normalized_root_mean_squared_log_error,,,0.13,0.11,0.12,0.11,,0.1,,0.13,...,0.1,,,,0.13,0.11,,0.11,,
r2_score,0.8,0.79,0.75,0.86,0.77,0.81,0.83,0.87,0.79,0.74,...,0.86,0.8,0.79,0.79,0.76,0.82,0.78,0.85,0.88,0.88
root_mean_squared_error,17.65,18.27,20.1,15.02,18.9,17.29,16.52,14.61,18.22,19.74,...,14.72,17.68,18.22,18.21,19.67,16.97,18.59,15.31,13.9,14.0


## 5. 学習済みモデルの登録

In [12]:
# Retrieve the best child run together with its fitted pipeline and show both.
best_run, fitted_model = run_automl.get_output()
print(best_run, fitted_model, sep='\n')

Run(Experiment: Heatstroke_patient_prediction,
Id: AutoML_2c1fff4b-703f-4351-bc91-a81a2d203f92_28,
Type: azureml.scriptrun,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_feature_sweeping=None, feature_sweeping_timeout=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('prefittedsoftvotingregressor', PreFittedSoftVotingRegressor(estimators=[('7', Pipeline(memory=None,
     steps=[('minmax...333333333333, 0.06666666666666667, 0.13333333333333333, 0.06666666666666667, 0.06666666666666667]))]),
          stddev=None)


In [14]:
name = "heatstroke_model"
description = 'Heatstroke patient prediction Model'

# Register the pickled pipeline produced by the best run under the given name.
model = best_run.register_model(
    model_name=name,
    model_path="outputs/model.pkl",
    description=description,
)
print('\t'.join(str(field) for field in (model.name, model.id, model.version)))

heatstroke_model	heatstroke_model:3	3
