Run this notebook after completing the configuration notebook.

#### Prepare data for regression modeling


In [None]:
# Use Azure Open Datasets to get the taxi data.
# Install the package first if it is missing: pip install azureml-opendatasets

In [None]:
import azureml.core

# Report the installed Azure ML SDK version up front so that version-related
# problems in later cells are easy to diagnose.
sdk_version = azureml.core.VERSION
print("SDK version is :", sdk_version)



In [None]:
# Load a sample of NYC green-taxi trip records from Azure Open Datasets.

# The Open Datasets package contains a class representing each data source
# (NycTlcGreen and NycTlcYellow) to easily filter date parameters before downloading.
from azureml.opendatasets import NycTlcGreen, NycTlcYellow

import pandas as pd
from datetime import datetime
# The module and function are both named "relativedelta" (not "relativedata").
from dateutil.relativedelta import relativedelta

# First and last day of the first month to download.
start = datetime.strptime("1/1/2016","%m/%d/%Y")
end = datetime.strptime("1/31/2016","%m/%d/%Y")

# Number of consecutive months to download, starting at `start`.
number_of_months = 1
sample_size = 5000  # rows sampled from each month; kept small to avoid memory issues

# Collect one sampled frame per month and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
green_monthly_samples = []
for sample_month in range(number_of_months):
    temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
        .to_pandas_dataframe()
    green_monthly_samples.append(temp_df_green.sample(sample_size))

# Fall back to an empty frame when no month was requested, as the original initialisation did.
green_df_raw = pd.concat(green_monthly_samples) if green_monthly_samples else pd.DataFrame([])

In [None]:
# Load a sample of NYC yellow-taxi trip records for the same date range as the green data.
start = datetime.strptime("1/1/2016","%m/%d/%Y")
end = datetime.strptime("1/31/2016","%m/%d/%Y")

sample_size = 500  # rows sampled from each month

# Collect one sampled frame per month and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
yellow_monthly_samples = []
for sample_month in range(number_of_months):
    temp_df_yellow = NycTlcYellow(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
        .to_pandas_dataframe()
    yellow_monthly_samples.append(temp_df_yellow.sample(sample_size))

# Fall back to an empty frame when no month was requested, as the original initialisation did.
yellow_df_raw = pd.concat(yellow_monthly_samples) if yellow_monthly_samples else pd.DataFrame([])

In [None]:
# Preview the first 20 rows of each raw sample (green first, then yellow).

from IPython.display import display

for raw_frame in (green_df_raw, yellow_df_raw):
    display(raw_frame.head(20))

Download the data locally and then upload it to Azure Blob storage, to save the data in the default datastore.

In [None]:
import os

# Local folder layout: data/green and data/yellow.
dataDir = "data"
greenDir = dataDir + "/green"
yellowDir = dataDir + "/yellow"

# makedirs also creates the parent "data" folder and does nothing when a folder already exists,
# so this cell can be re-run safely.
for taxi_dir in (greenDir, yellowDir):
    os.makedirs(taxi_dir, exist_ok=True)

greenTaxiData = greenDir + "/unprepared.parquet"
yellowTaxiData = yellowDir + "/unprepared.parquet"

# NOTE: despite the ".parquet" extension these files are written as CSV. The names are kept
# because the dataset-creation cell reads these exact paths with from_delimited_files.
green_df_raw.to_csv(greenTaxiData, index=False)
yellow_df_raw.to_csv(yellowTaxiData, index=False)

print("Data written to local folder.")



Create workspace and datastore

In [None]:

from azureml.core import Workspace

# Connect to the workspace described by the local configuration file.
ws = Workspace.from_config()
print("Workspace: " + ws.name, "Region: " + ws.location, sep = '\n')

# Default datastore
default_store = ws.get_default_datastore()

# Upload each local file into its own folder of the datastore (green first, then yellow).
for local_file, datastore_folder in ((greenTaxiData, 'green'), (yellowTaxiData, 'yellow')):
    default_store.upload_files([local_file],
                               target_path=datastore_folder,
                               overwrite=True,
                               show_progress=True)

print("Upload calls completed.")

Create dataset

By creating a dataset, you create a reference to the data source location. If you applied any subsetting transformations to the dataset, they will be stored in the dataset as well.

In [None]:
from azureml.core import Dataset

# Build a tabular dataset over each uploaded file; the files are read as delimited text.
green_raw_path = default_store.path('green/unprepared.parquet')
green_taxi_data = Dataset.Tabular.from_delimited_files(green_raw_path)

yellow_raw_path = default_store.path('yellow/unprepared.parquet')
yellow_taxi_data = Dataset.Tabular.from_delimited_files(yellow_raw_path)

Register dataset

Register the taxi datasets with the workspace so that you can reuse them in other experiments.

In [None]:
# Register both datasets in the workspace under fixed names; the returned
# registered datasets replace the unregistered references.
green_taxi_data = green_taxi_data.register(workspace=ws, name='green_taxi_data')
yellow_taxi_data = yellow_taxi_data.register(workspace=ws, name='yellow_taxi_data')

Create Compute target

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or create
amlcompute_cluster_name = "cpu-cluster"

# Look the cluster up first; create it only when the lookup fails.
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(
        workspace=ws,
        name=amlcompute_cluster_name,
        provisioning_configuration=compute_config,
    )

# Runs in both cases, whether the cluster was found or just created.
aml_compute.wait_for_completion(show_output=True)

Define RunConfig for the compute
This is to load all dependencies as well.

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration shared by every pipeline step; it targets the cluster created above.
aml_run_config = RunConfiguration()
aml_run_config.target = aml_compute

run_environment = aml_run_config.environment

# Run inside Docker, and let Azure ML build the conda environment from the
# dependencies declared below instead of using a user-managed one.
run_environment.docker.enabled = True
run_environment.python.user_managed_dependencies = False

# Packages needed by the data-preparation and training scripts
run_environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn'],
    pip_packages=['azureml-sdk[automl]', 'pyarrow'],
)

print ("Run configuration created.")

Prepare data
Using pandas, run data transformations to combine the two datasets created above (yellow and green).

We create a separate step for each transformation, as this allows us to reuse the steps and saves us from running everything again when something changes. We will keep data preparation scripts in one subfolder and training scripts in another.

In [None]:
# Show the raw column names of both samples for reference.
display(green_df_raw.columns)
display(yellow_df_raw.columns)

# useful columns needed for the Azure Machine Learning NYC Taxi tutorial
useful_column_names = [
    "cost", "distance", "dropoff_datetime", "dropoff_latitude",
    "dropoff_longitude", "passengers", "pickup_datetime",
    "pickup_latitude", "pickup_longitude", "store_forward", "vendor",
]

# The list is handed to the scripts as one string argument, with "," replaced by ";".
useful_columns = str(useful_column_names).replace(",", ";")

print("Useful columns defined.")

Cleanse Green taxi data

In [None]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Folder holding the data-preparation scripts
prepare_data_folder = './scripts/prepdata'

# Mapping from raw green-taxi column names to the common names shared with the yellow data
green_column_map = {
    "vendorID": "vendor",
    "lpepPickupDatetime": "pickup_datetime",
    "lpepDropoffDatetime": "dropoff_datetime",
    "storeAndFwdFlag": "store_forward",
    "pickupLongitude": "pickup_longitude",
    "pickupLatitude": "pickup_latitude",
    "dropoffLongitude": "dropoff_longitude",
    "dropoffLatitude": "dropoff_latitude",
    "passengerCount": "passengers",
    "fareAmount": "cost",
    "tripDistance": "distance",
}
# Passed to the script as one string argument, with "," replaced by ";"
green_columns = str(green_column_map).replace(",", ";")

# Output of the green cleansing step
cleansed_green_data = PipelineData("cleansed_green_data", datastore=default_store).as_dataset()

cleanse_script_dir = os.path.realpath(prepare_data_folder)
print('Cleanse script is in {}.'.format(cleanse_script_dir))

# Cleansing step for the green data; see cleanse.py for details about input and output.
green_cleanse_arguments = [
    "--useful_columns", useful_columns,
    "--columns", green_columns,
    "--output_cleanse", cleansed_green_data,
]

cleansingStepGreen = PythonScriptStep(
    name="Cleanse Green Taxi Data",
    script_name="cleanse.py",
    source_directory=prepare_data_folder,
    arguments=green_cleanse_arguments,
    inputs=[green_taxi_data.as_named_input('raw_data')],
    outputs=[cleansed_green_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("cleansingStepGreen created.")

Cleanse Yellow taxi data

In [None]:
# Mapping from raw yellow-taxi column names to the common names shared with the green data
yellow_column_map = {
    "vendorID": "vendor",
    "tpepPickupDateTime": "pickup_datetime",
    "tpepDropoffDateTime": "dropoff_datetime",
    "storeAndFwdFlag": "store_forward",
    "startLon": "pickup_longitude",
    "startLat": "pickup_latitude",
    "endLon": "dropoff_longitude",
    "endLat": "dropoff_latitude",
    "passengerCount": "passengers",
    "fareAmount": "cost",
    "tripDistance": "distance",
}
# Passed to the script as one string argument, with "," replaced by ";"
yellow_columns = str(yellow_column_map).replace(",", ";")

# Output of the yellow cleansing step
cleansed_yellow_data = PipelineData("cleansed_yellow_data", datastore=default_store).as_dataset()

cleanse_script_dir = os.path.realpath(prepare_data_folder)
print('Cleanse script is in {}.'.format(cleanse_script_dir))

# Cleansing step for the yellow data; see cleanse.py for details about input and output.
yellow_cleanse_arguments = [
    "--useful_columns", useful_columns,
    "--columns", yellow_columns,
    "--output_cleanse", cleansed_yellow_data,
]

cleansingStepYellow = PythonScriptStep(
    name="Cleanse Yellow Taxi Data",
    script_name="cleanse.py",
    source_directory=prepare_data_folder,
    arguments=yellow_cleanse_arguments,
    inputs=[yellow_taxi_data.as_named_input('raw_data')],
    outputs=[cleansed_yellow_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("cleansingStepYellow created.")

### Merge cleansed Green and Yellow datasets

In [None]:
# Output of the merge step
merged_data = PipelineData("merged_data", datastore=default_store).as_dataset()

merge_script_dir = os.path.realpath(prepare_data_folder)
print('Merge script is in {}.'.format(merge_script_dir))

# The merge step consumes both cleansed outputs, read as parquet files;
# see merge.py for details about input and output.
merge_inputs = [
    cleansed_green_data.parse_parquet_files(),
    cleansed_yellow_data.parse_parquet_files(),
]

mergingStep = PythonScriptStep(
    name="Merge Taxi Data",
    script_name="merge.py",
    source_directory=prepare_data_folder,
    arguments=["--output_merge", merged_data],
    inputs=merge_inputs,
    outputs=[merged_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("mergingStep created.")

**Filter data**
This step filters out coordinates for locations that are outside the city border. We use a TypeConverter object to change the latitude and longitude fields to decimal type.

In [None]:
# Output of the filter step
filtered_data = PipelineData("filtered_data", datastore=default_store).as_dataset()

filter_script_dir = os.path.realpath(prepare_data_folder)
print('Filter script is in {}.'.format(filter_script_dir))

# The filter step consumes the merged output, read as parquet files;
# see filter.py for details about input and output.
filter_inputs = [merged_data.parse_parquet_files()]

filterStep = PythonScriptStep(
    name="Filter Taxi Data",
    script_name="filter.py",
    source_directory=prepare_data_folder,
    arguments=["--output_filter", filtered_data],
    inputs=filter_inputs,
    outputs=[filtered_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("FilterStep created.")

**Normalize data**
In this step, we split the pickup and dropoff datetime values into the respective date and time columns and then we rename the columns to use meaningful names.

In [None]:
# Output of the normalize step
normalized_data = PipelineData("normalized_data", datastore=default_store).as_dataset()

normalize_script_dir = os.path.realpath(prepare_data_folder)
print('Normalize script is in {}.'.format(normalize_script_dir))

# The normalize step consumes the filtered output, read as parquet files;
# see normalize.py for details about input and output.
normalize_inputs = [filtered_data.parse_parquet_files()]

normalizeStep = PythonScriptStep(
    name="Normalize Taxi Data",
    script_name="normalize.py",
    source_directory=prepare_data_folder,
    arguments=["--output_normalize", normalized_data],
    inputs=normalize_inputs,
    outputs=[normalized_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("normalizeStep created.")

**Transform data**

Split the pickup and dropoff date further into the day of the week, day of the month, and month values.
To get the day of the week value, use the derive_column_by_example() function. The function takes an array parameter of example objects that define the input data and the preferred output, and it automatically determines the preferred transformation. For the pickup and dropoff time columns, split the time into the hour, minute, and second by using the split_column_by_example() function with no example parameter.
After new features are generated, use the drop_columns() function to delete the original fields as the newly generated features are preferred.
Rename the rest of the fields to use meaningful descriptions.

In [None]:
# Output of the transform step
transformed_data = PipelineData("transformed_data", datastore=default_store).as_dataset()

transform_script_dir = os.path.realpath(prepare_data_folder)
print('Transform script is in {}.'.format(transform_script_dir))

# The transform step consumes the normalized output, read as parquet files;
# see transform.py for details about input and output.
transform_inputs = [normalized_data.parse_parquet_files()]

transformStep = PythonScriptStep(
    name="Transform Taxi Data",
    script_name="transform.py",
    source_directory=prepare_data_folder,
    arguments=["--output_transform", transformed_data],
    inputs=transform_inputs,
    outputs=[transformed_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("transformStep created.")

Split the data into train and test sets

In [None]:
# Folder holding the training scripts
train_model_folder = './scripts/trainmodel'

# Outputs of the split step: one dataset for training, one for testing
output_split_train = PipelineData("output_split_train", datastore=default_store).as_dataset()
output_split_test = PipelineData("output_split_test", datastore=default_store).as_dataset()

split_script_dir = os.path.realpath(train_model_folder)
print('Data spilt script is in {}.'.format(split_script_dir))

# The split step consumes the transformed output, read as parquet files;
# see train_test_split.py for details about input and output.
split_arguments = [
    "--output_split_train", output_split_train,
    "--output_split_test", output_split_test,
]

testTrainSplitStep = PythonScriptStep(
    name="Train Test Data Split",
    script_name="train_test_split.py",
    source_directory=train_model_folder,
    arguments=split_arguments,
    inputs=[transformed_data.parse_parquet_files()],
    outputs=[output_split_train, output_split_test],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=True,
)

print("testTrainSplitStep created.")

This completes the data preparation part.
Next is model training. For this we use AutoML to train the model with different algorithms (including voting and stacking ensembles).

For AutoML inside a pipeline, we use the AutoMLStep class.

**pip install azureml-sdk[automl]**

### Automatically train a model

Step 1: Create experiment

In [None]:
from azureml.core import Experiment

# The experiment under which the pipeline run is submitted and tracked.
experiment = Experiment(workspace=ws, name='NYCTaxi_Tutorial_Pipelines')

print("Experiment created")

Step 2 : Define settings for autogeneration and tuning

In [None]:
# Configure automated ML: training data, task type, and tuning limits.

from azureml.train.automl import AutoMLConfig

# Limits and metric for the AutoML search
automl_settings = dict(
    iteration_timeout_minutes=10,
    iterations=25,
    primary_metric='spearman_correlation',
    n_cross_validations=5,
)

# Train only on the selected feature columns plus the label column 'cost'.
training_columns = ['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor', 'cost']
training_dataset = output_split_train.parse_parquet_files().keep_columns(training_columns)

automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    path=train_model_folder,
    compute_target=aml_compute,
    featurization='auto',
    training_data=training_dataset,
    label_column_name='cost',
    **automl_settings,
)

print("AutoML config created.")

Step 3 : Define AutoMLStep

In [None]:
from azureml.pipeline.steps import AutoMLStep

# Wrap the AutoML configuration in a pipeline step so that training runs as part of the pipeline.
trainWithAutomlStep = AutoMLStep(
    name='AutoML_Regression',
    automl_config=automl_config,
    allow_reuse=True,
)
print("trainWithAutomlStep created.")

Step 4 : Build and run the pipeline

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Only the final (training) step is listed explicitly when building the pipeline.
pipeline_steps = [trainWithAutomlStep]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under the experiment created earlier.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)
print("Pipeline submitted for execution.")

In [None]:
# Show pipeline progress for the submitted run.
# RunDetails is imported from azureml.widgets in the cell that builds and submits the pipeline.
RunDetails(pipeline_run).show()

Step 5: Explore the results

In [None]:
# Before we proceed we need to wait for the run to complete.
# The cells below download step outputs, which presumably only exist once the run has finished.
pipeline_run.wait_for_completion(show_output=False)

# functions to download output to local and fetch as dataframe
def get_download_path(download_path, output_name):
    """Return the path of a downloaded step output below ``download_path``.

    The result has the form ``<download_path>/azureml/<entry>/<output_name>``,
    where ``<entry>`` is the first name returned by ``os.listdir`` for
    ``<download_path>/azureml``.

    Raises whatever ``os.listdir`` raises if that folder is missing, and
    ``IndexError`` if it is empty.
    """
    azureml_root = download_path + '/azureml'
    first_entry = os.listdir(azureml_root)[0]
    return '/'.join([azureml_root, first_entry, output_name])

def fetch_df(current_step, output_name):
    """Download one output of a step run and load it as a pandas DataFrame.

    The output named ``output_name`` is downloaded to ``./outputs/<output_name>``
    (overwriting earlier downloads), and the ``processed.parquet`` file found
    below the path returned by ``get_download_path`` is read with pandas.
    """
    local_dir = './outputs/' + output_name
    current_step.get_output_data(output_name).download(local_dir, overwrite=True)
    parquet_file = get_download_path(local_dir, output_name) + '/processed.parquet'
    return pd.read_parquet(parquet_file)

Step 6: View cleansed taxi data

In [None]:
# Locate the step run of each cleansing step, then download and load its output.
green_cleanse_step = pipeline_run.find_step_run(cleansingStepGreen.name)[0]
yellow_cleanse_step = pipeline_run.find_step_run(cleansingStepYellow.name)[0]

cleansed_green_df = fetch_df(green_cleanse_step, cleansed_green_data.name)
cleansed_yellow_df = fetch_df(yellow_cleanse_step, cleansed_yellow_data.name)

# Preview the first five rows of each cleansed frame.
for cleansed_frame in (cleansed_green_df, cleansed_yellow_df):
    display(cleansed_frame.head(5))

Step 7: View combined taxi data profile

In [None]:
# Download the merge step's output and show summary statistics.
merge_step_runs = pipeline_run.find_step_run(mergingStep.name)
merge_step = merge_step_runs[0]
combined_df = fetch_df(merge_step, merged_data.name)

combined_profile = combined_df.describe()
display(combined_profile)

In [None]:
# Download the filter step's output and show summary statistics.
filter_step_runs = pipeline_run.find_step_run(filterStep.name)
filter_step = filter_step_runs[0]
filtered_df = fetch_df(filter_step, filtered_data.name)

filtered_profile = filtered_df.describe()
display(filtered_profile)

In [None]:
# Download the normalize step's output and preview its first five rows.

normalize_step_runs = pipeline_run.find_step_run(normalizeStep.name)
normalize_step = normalize_step_runs[0]
normalized_df = fetch_df(normalize_step, normalized_data.name)

normalized_preview = normalized_df.head(5)
display(normalized_preview)

In [None]:
# Download the transform step's output; show summary statistics, then the first five rows.

transform_step_runs = pipeline_run.find_step_run(transformStep.name)
transform_step = transform_step_runs[0]
transformed_df = fetch_df(transform_step, transformed_data.name)

for transformed_view in (transformed_df.describe(), transformed_df.head(5)):
    display(transformed_view)

Step 8 : View training data used by AutoML


In [None]:
# Download the training split; show summary statistics, then the first five rows.
split_step_runs = pipeline_run.find_step_run(testTrainSplitStep.name)
split_step = split_step_runs[0]
train_split = fetch_df(split_step, output_split_train.name)

for train_view in (train_split.describe(), train_split.head(5)):
    display(train_view)

Step 9 : View the details of the AutoML run

In [None]:
from azureml.train.automl.run import AutoMLRun
#from azureml.widgets import RunDetails

# Workaround for locating the AutoML run: it is the last step of the pipeline,
# and get_steps() yields steps from latest to first, so only the first
# yielded step is inspected before leaving the loop.
for latest_step in pipeline_run.get_steps():
    automl_step_run_id = latest_step.id
    print(latest_step.name)
    print(automl_step_run_id)
    break

automl_run = AutoMLRun(experiment=experiment, run_id=automl_step_run_id)
#RunDetails(automl_run).show()

Step 10 : Get the best model

In [None]:
# Retrieve the best child run of the AutoML run together with its fitted model, and print both.
best_run, fitted_model = automl_run.get_output()
for automl_result in (best_run, fitted_model):
    print(automl_result)

### Test the model


In [None]:
# Get test data

split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]

# Download and parse the test split once, then slice features and label from the same frame
# (previously the output was downloaded and read twice).
test_split = fetch_df(split_step, output_split_test.name)

x_test = test_split[['distance','passengers', 'vendor','pickup_weekday','pickup_hour']]
y_test = test_split[['cost']]

display(x_test.head(5))
display(y_test.head(5))


Test the best fitted model

In [None]:
# Predict the cost for every test row with the best fitted model.
y_predict = fitted_model.predict(x_test)

# y_test is a one-column DataFrame; take the 'cost' column so y_actual is a flat list of
# numbers rather than a list of one-element lists (which would display as [x] per row).
y_actual = y_test['cost'].tolist()

# Compare the first five actual and predicted values side by side.
display(pd.DataFrame({'Actual':y_actual, 'Predicted':y_predict}).head(5))

In [None]:
# Plot predicted and actual cost against trip distance for the first 100 test rows.

import matplotlib.pyplot as plt

# Set the font size BEFORE creating the figure: rcParams only affect text created
# afterwards, so updating them after plotting had no effect on a fresh kernel.
plt.rcParams.update({'font.size': 14})

fig = plt.figure(figsize=(14, 10))
ax1 = fig.add_subplot(111)

# 'distance' is the first column selected into x_test.
distance_vals = [x[0] for x in x_test.values]

ax1.scatter(distance_vals[:100], y_predict[:100], s=18, c='b', marker="s", label='Predicted')
ax1.scatter(distance_vals[:100], y_actual[:100], s=18, c='r', marker="o", label='Actual')
ax1.set_xlabel('distance (mi)')
ax1.set_title('Predicted and Actual Cost/Distance')
ax1.set_ylabel('Cost ($)')

plt.legend(loc='upper left', prop={'size': 12})
plt.show()