# Final project - Udacity

In [3]:
%%writefile train.py
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.core import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory, DataType

# Handle to the current Azure ML run; used below to reach the workspace and to log metrics.
run = Run.get_context()

def clean_data(df):
    """Split the raw video-game frame into feature columns and the rating label.

    Only rows whose ``no_descriptors`` value equals 0 are kept. The input
    frame itself is not modified.

    Args:
        df: pandas DataFrame that contains at least the columns
            ``no_descriptors``, ``title``, ``console`` and ``esrb_rating``.

    Returns:
        Tuple ``(x_df, y_df)``: ``x_df`` is the filtered frame without
        ``no_descriptors``, ``title``, ``console`` and ``esrb_rating``;
        ``y_df`` is the ``esrb_rating`` column of the same rows.
    """
    # Keep only the rows where the "no descriptors" flag is not set.
    kept_rows = df[df["no_descriptors"] == 0]

    # Each label is listed exactly once; 'esrb_rating' is the target and must not leak into the features.
    non_feature_columns = ["no_descriptors", "title", "console", "esrb_rating"]
    x_df = kept_rows.drop(non_feature_columns, axis=1)
    y_df = kept_rows["esrb_rating"]
    return x_df, y_df


# Workspace of the current run and its default datastore, from which the CSV is read.
ws = run.experiment.workspace
datastore = ws.get_default_datastore()
csv_location = [(datastore, ('data/videogame.csv'))]
tabular_ds = Dataset.Tabular.from_delimited_files(path=csv_location)
ds = tabular_ds.to_pandas_dataframe()

# Features and label, followed by a fixed-seed 80/20 train/test split.
x, y = clean_data(ds)
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

def main():
    """Train a logistic-regression model and write it to ./outputs/model.pkl.

    Command-line arguments:
        --C         float, inverse of regularization strength (default 1.0).
        --max_iter  int, maximum number of solver iterations (default 1000).

    The hyperparameters and the test-set accuracy are logged to the current
    run. The metric name "Accuracy" is the one the HyperDrive configuration
    in the notebook uses as its primary metric, so it must not be renamed.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=1000, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # Built-in float()/int() instead of np.float/np.int: those aliases were
    # deprecated in NumPy 1.20 and removed in 1.24, where they raise AttributeError.
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # Mean accuracy on the held-out split.
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    print("Exporting the model as pickle file...")
    outputs_folder = './outputs'
    os.makedirs(outputs_folder, exist_ok=True)

    model_filename = "model.pkl"
    model_path = os.path.join(outputs_folder, model_filename)
    joblib.dump(model, model_path)
    
# Call main() only when this file is executed as a script.
if __name__ == '__main__':
    main()    

Overwriting train.py


# Project: workspace and experiment setup

In [1]:
from azureml.core import Workspace, Experiment

# Connect to the workspace and open the experiment used for the HyperDrive runs.
ws = Workspace.get(name="udacity-proj")
exp = Experiment(workspace=ws, name="final-project")

# One output line per workspace property.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive run inside the experiment.
run = exp.start_logging()

Workspace name: udacity-proj
Azure region: brazilsouth
Subscription id: c7d37146-6800-47a5-90fa-d4c5514db228
Resource group: rg-udacity


# Register Dataset

In [2]:
from azureml.core import Workspace, Dataset
# Default datastore of the workspace; the prepared data is uploaded here.
# (Fetched once; the original cell ran this lookup twice.)
datastore = ws.get_default_datastore()

# Upload the local 'data' directory to the 'data' path in the datastore.
datastore.upload(src_dir='data', target_path='data')

# Create a tabular dataset that references the uploaded CSV in the cloud.
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('data/videogame.csv'))])

Uploading an estimated of 1 files
Target already exists. Skipping upload for data/videogame.csv
Uploaded 0 files


# Create cluster

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster used for training
cpu_cluster_name = "train-cluster"

# Reuse the cluster if it is already there; otherwise provision a new one.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS3_v2',
        max_nodes=2,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster and echo its provisioning status.
cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [6]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
#from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import uniform, choice
from azureml.core import ScriptRunConfig
from azureml.train.estimator import Estimator
from shutil import copyfile

import os

# Random sampling over the two command-line hyperparameters that train.py accepts.
ps = RandomParameterSampling({
    "--C": uniform(1.0, 2.0),
    "--max_iter": choice(1000, 3000, 5000)
})

# Early-termination policy.
# NOTE(review): train.py logs 'Accuracy' only once per run, so an interval-based
# policy probably has little to act on here - confirm whether it is needed.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Stage train.py in the 'training' source directory used by the estimator.
# makedirs(exist_ok=True) replaces the listdir()-then-mkdir() check and is safe to re-run.
os.makedirs("./training", exist_ok=True)
copyfile('train.py', "./training/train.py")

# Generic Estimator (not the SKLearn estimator) that runs train.py on the CPU cluster.
est = Estimator(source_directory='training',
                conda_packages=['scikit-learn'],
                entry_script='train.py',
                compute_target=cpu_cluster)

# HyperDrive configuration combining the estimator, the sampler and the policy.
# 'Accuracy' must match the metric name logged by train.py.
hdc = HyperDriveConfig(estimator=est,
                       hyperparameter_sampling=ps,
                       policy=policy,
                       primary_metric_name='Accuracy',
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                       max_total_runs=40,
                       max_concurrent_runs=4)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [7]:
# Submit the HyperDrive configuration to the experiment; `hdr` is the resulting parent run.
hdr = exp.submit(config=hdc)



In [8]:
# Show the progress of the HyperDrive run in the notebook widget.
RunDetails(hdr).show()

# The widget does not block the notebook. Wait here so that the following cells,
# which query the best child run, only execute after the sweep has finished
# (otherwise "Restart & Run All" would read results from an unfinished run).
hdr.wait_for_completion(show_output=False)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

# Best model

In [47]:
import joblib
# Look up the HyperDrive child run with the best primary metric and report
# its id, accuracy and the command-line arguments it was started with.
best_run = hdr.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameters_values = best_run.get_details()['runDefinition']['arguments']

# Metrics are fetched once and reused; the stray trailing commas after print(...) are gone.
print("Best run id: " , best_run.id)
print("Accuracy : " , best_run_metrics['Accuracy'])
print("Params : " , parameters_values)

Best run id:  HD_0e7e2589-f199-45eb-ac5a-9c8475f4b8f9_2
Accuracy :  0.8193069306930693
Params :  ['--C', '1.0658189926584296', '--max_iter', '3000']


# Automated ML

In [None]:
#ds = pd.read_csv('videogame.csv')

In [10]:
# Read the uploaded CSV from the default datastore into a pandas DataFrame.
datastore = ws.get_default_datastore()
videogame_csv = [(datastore, ('data/videogame.csv'))]
ds = Dataset.Tabular.from_delimited_files(path=videogame_csv).to_pandas_dataframe()

In [11]:
def clean_data(df):
    """Split the raw video-game frame into feature columns and the rating label.

    Only rows whose ``no_descriptors`` value equals 0 are kept. The input
    frame itself is not modified.

    Args:
        df: pandas DataFrame that contains at least the columns
            ``no_descriptors``, ``title``, ``console`` and ``esrb_rating``.

    Returns:
        Tuple ``(x_df, y_df)``: ``x_df`` is the filtered frame without
        ``no_descriptors``, ``title``, ``console`` and ``esrb_rating``;
        ``y_df`` is the ``esrb_rating`` column of the same rows.
    """
    # Keep only the rows where the "no descriptors" flag is not set.
    kept_rows = df[df["no_descriptors"] == 0]

    # Each label is listed exactly once; 'esrb_rating' is the target and must not leak into the features.
    non_feature_columns = ["no_descriptors", "title", "console", "esrb_rating"]
    x_df = kept_rows.drop(non_feature_columns, axis=1)
    y_df = kept_rows["esrb_rating"]
    return x_df, y_df

In [12]:
# Feature frame and label series for the AutoML experiment.
x, y = clean_data(ds)

In [13]:
# Attach the label as column 'y'; the AutoMLConfig below receives `x` as training data and reads the label from that column.
x['y'] = y.values

In [14]:
from azureml.train.automl import AutoMLConfig

# Settings of the AutoML classification experiment, collected in one place.
automl_settings = dict(
    experiment_timeout_minutes=60,
    task="classification",
    primary_metric="accuracy",
    training_data=x,
    label_column_name='y',
    n_cross_validations=5,
)
automl_config = AutoMLConfig(**automl_settings)

In [15]:
#submit automl run
from azureml.core.experiment import Experiment

# Open the AutoML experiment and submit the configuration, streaming progress to the cell output.
experiment = Experiment(workspace=ws, name="final-project-automl")
run_auto = experiment.submit(automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running on local machine
Parent Run ID: AutoML_083e61e1-db1c-4a36-b268-8331be7f8cfd

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

**************************************************

        64   StandardScalerWrapper LogisticRegression       0:00:29       0.8209    0.8557
        65   StandardScalerWrapper XGBoostClassifier        0:00:54       0.8542    0.8557
        66   StandardScalerWrapper XGBoostClassifier        0:00:39       0.8304    0.8557
        67   StandardScalerWrapper LogisticRegression       0:00:28       0.8358    0.8557
        68   SparseNormalizer XGBoostClassifier             0:00:32       0.8274    0.8557
        69   StandardScalerWrapper XGBoostClassifier        0:00:31       0.8284    0.8557
        70   SparseNormalizer XGBoostClassifier             0:00:39       0.8368    0.8557
        71   MaxAbsScaler LogisticRegression                0:00:28       0.8338    0.8557
        72   StandardScalerWrapper XGBoostClassifier        0:00:55       0.8427    0.8557
        73   StandardScalerWrapper XGBoostClassifier        0:00:33       0.8457    0.8557
        74   StandardScalerWrapper XGBoostClassifier        0:01:01       0.8527    0.8557

In [46]:
# Show the details of the AutoML run in the notebook widget.
RunDetails(run_auto).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

# Deploy the best model

In [17]:
# Best child run of the AutoML parent run; its artifacts are downloaded in the cells below.
best_run_automl = run_auto.get_best_child()

In [51]:
# Display the best AutoML child run.
best_run_automl

Experiment,Id,Type,Status,Details Page,Docs Page
final-project-automl,AutoML_083e61e1-db1c-4a36-b268-8331be7f8cfd_96,,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
import joblib
# Look up the HyperDrive child run with the best primary metric and report
# its id, accuracy and the command-line arguments it was started with.
# NOTE(review): this repeats the "Best model" cell and rebinds `best_run` to the HyperDrive run.
best_run = hdr.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameters_values = best_run.get_details()['runDefinition']['arguments']

# Metrics are fetched once and reused; the stray trailing commas after print(...) are gone.
print("Best run id: " , best_run.id)
print("Accuracy : " , best_run_metrics['Accuracy'])
print("Params : " , parameters_values)

In [24]:
# Download the conda environment file and the scoring script from the best AutoML run.
for remote_name, local_path in (
    ('outputs/conda_env_v_1_0_0.yml', 'best_model/conda_env_v_1_0_0.yml'),
    ('outputs/scoring_file_v_1_0_0.py', 'best_model/score.py'),
):
    best_run_automl.download_file(name=remote_name, output_file_path=local_path)

In [19]:
from azureml.core.environment import Environment

In [22]:
# Build the inference environment from the conda specification downloaded from the best AutoML run.
myenv = Environment.from_conda_specification(
    name='env',
    file_path='best_model/conda_env_v_1_0_0.yml',
)

In [31]:
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice, Webservice

# Register the model of the best AutoML child run.
# It must come from `best_run_automl`, not from the HyperDrive `best_run`: the
# scoring script and the conda environment used below were downloaded from
# `best_run_automl`, and the model is registered with model_framework='automl'.
model = best_run_automl.register_model(model_name='outoml',
                                       model_framework='automl',
                                       model_path='outputs/model.pkl')

# Combine scoring script & environment in the inference configuration
inference_config = InferenceConfig(entry_script="best_model/score.py",
                                   environment=myenv)

# Azure Container Instance sizing for the web service
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                       memory_gb=1)

# Deploy the registered model as a web service in the workspace
service = Model.deploy(workspace=ws,
                       name="my-automl-service",
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config)

# Wait for the deployment to finish so that later cells see a ready endpoint
# and a failed deployment surfaces here.
service.wait_for_deployment(show_output=True)

# Endpoint is active

In [56]:
# Display the deployed web service object.
service

AciWebservice(workspace=Workspace.create(name='udacity-proj', subscription_id='c7d37146-6800-47a5-90fa-d4c5514db228', resource_group='rg-udacity'), name=my-automl-service, image_id=None, compute_type=None, state=ACI, scoring_uri=Healthy, tags=http://b6574cd6-6b08-4e37-bd7b-81d228a0f0c9.brazilsouth.azurecontainer.io/score, properties={}, created_by={'hasInferenceSchema': 'True', 'hasHttps': 'False'})

# Test inference

In [34]:
import pandas as pd
# Feature columns of the sample request, in the order used for the DataFrame.
# The names are runtime column keys and are kept exactly as spelled (including 'strong_janguage').
feature_columns = [
    "alcohol_reference",
    "animated_blood",
    "blood",
    "blood_and_gore",
    "cartoon_violence",
    "crude_humor",
    "drug_reference",
    "fantasy_violence",
    "intense_violence",
    "language",
    "lyrics",
    "mature_humor",
    "mild_blood",
    "mild_cartoon_violence",
    "mild_fantasy_violence",
    "mild_language",
    "mild_lyrics",
    "mild_suggestive_themes",
    "mild_violence",
    "nudity",
    "partial_nudity",
    "sexual_content",
    "sexual_themes",
    "simulated_gambling",
    "strong_janguage",
    "strong_sexual_content",
    "suggestive_themes",
    "use_of_alcohol",
    "use_of_drugs_and_alcohol",
    "violence",
]

# Single-row frame with every descriptor set to 0 (int64).
df_infer = pd.DataFrame(
    {column: pd.Series([0], dtype="int64") for column in feature_columns}
)

In [39]:
# Wrap the record-oriented JSON of the sample frame in a {"data": ...} envelope; this string is the request body sent to the endpoint.
records_json = df_infer.to_json(orient='records')
input_data = '{"data": %s}' % records_json

In [42]:
import requests
headers = {'Content-Type': 'application/json'}

# Take the endpoint from the deployed service object instead of a pasted URL,
# which may go stale when the service is redeployed.
scoring_uri = service.scoring_uri

# A timeout keeps the cell from hanging forever on an unresponsive endpoint.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)

# Fail on an HTTP error instead of printing the error body as if it were a prediction.
resp.raise_for_status()

print("prediction:", resp.text)

prediction: "{\"result\": [\"E\"]}"


# Enable App Insights

In [43]:
from azureml.core import Workspace
from azureml.core.webservice import Webservice

# Workspace loaded from the config file, which must be present in the current working directory.
ws = Workspace.from_config()

# Name of the deployed web service
name = "my-automl-service"

# Look up the existing web service and turn on Application Insights for it.
service = Webservice(name=name, workspace=ws)
service.update(enable_app_insights=True)

# Print the service logs, one log line per output line.
logs = service.get_logs()
print(*logs.split('\n'), sep='\n')

2021-01-31T15:38:41,981755419+00:00 - rsyslog/run 
2021-01-31T15:38:41,985776005+00:00 - iot-server/run 
2021-01-31T15:38:41,988859794+00:00 - gunicorn/run 
rsyslogd: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libuuid.so.1: no version information available (required by rsyslogd)
2021-01-31T15:38:42,008645425+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml