# Capstone project 2
## Image classification model
### CIFAR-10 data - DenseNet-161 pretrained model - tuning & training

In [2]:
%%capture
# TODO: Install any packages that you might need
# For instance, you will need the smdebug package
! pip install smdebug
! pip install torch --no-cache-dir
! pip install torchvision

In [3]:
# Import packages 
from __future__ import print_function, division
import sagemaker
import boto3
import os
from sagemaker.pytorch import PyTorch
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy

from PIL import ImageFile

# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Turn on matplotlib's interactive mode.
plt.ion()

# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Dataset
#### The local `data` folder will be copied to an S3 bucket for training.

In [4]:
# SageMaker session and execution role shared by the jobs in this notebook
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
# Read the region from the session instead of hard-coding it, so the value
# always agrees with the region the session (and its default bucket) uses.
region = sagemaker_session.boto_region_name
# bucket = 'img-ngandn18'
data_bucket = 'cap2-ngandn18'  # bucket the training data is uploaded to
bucket = sagemaker_session.default_bucket()  # default bucket of the session
os.environ["DEFAULT_S3_BUCKET"] = bucket
# datapath = 'minidogs'
datapath = 'data'  # local data folder; also used as the S3 key prefix
print(sagemaker_session, bucket, region, datapath)

<sagemaker.session.Session object at 0x7f75a286e9d0> sagemaker-us-east-1-503563512855 us-east-1 data


In [6]:
# Copy the local data folder into the data bucket, using the folder name as
# the key prefix, and show the resulting S3 URI.
inputs = sagemaker_session.upload_data(
    path=datapath,
    bucket=data_bucket,
    key_prefix=datapath,
)
print(inputs)

s3://cap2-ngandn18/data


## Hyperparameter Tuning
**TODO:** This is the part where you will finetune a pretrained model with hyperparameter tuning. Remember that you have to tune a minimum of two hyperparameters. However you are encouraged to tune more. You are also encouraged to explain why you chose to tune those particular hyperparameters and the ranges.

**Note:** You will need to use the `hpo.py` script to perform hyperparameter tuning.

In [7]:
# Search space explored by the hyperparameter tuning job
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)

hyperparameter_ranges = dict(
    # learning rate, sampled from a continuous interval
    lr=ContinuousParameter(0.0004, 0.001),
    # batch size, chosen from a fixed set of values
    batch_size=CategoricalParameter([16, 32, 64]),
)

In [8]:
# Estimator that runs hpo.py, wrapped in a tuner that minimizes the test loss
from sagemaker.pytorch import PyTorch

instance_type = 'ml.g4dn.4xlarge'

estimator = PyTorch(
    entry_point="hpo.py",
    role=role,
    framework_version="1.9",
    py_version='py38',
    instance_type=instance_type,
    instance_count=1,
)

# The objective metric is extracted from the training logs with this regex.
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": "average test loss",
        "Regex": "Test set: Average loss: ([0-9\\.]+)",
    }
]

# At most 4 training jobs in total, 2 of them running at the same time.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=4,
    max_parallel_jobs=2,
)

In [5]:
# S3 locations for the training job: input data, output and model artifacts
s3_data_dir = "s3://{}/{}/".format(data_bucket, datapath)
s3_output_dir = "s3://{}/{}/".format(bucket, "output")
s3_model_dir = "s3://{}/{}/".format(bucket, "model")

# NOTE(review): these SM_* variables are set in the notebook process only;
# presumably they are meant to mirror the training container's variables.
# TODO confirm they are needed, since only s3_data_dir is passed to fit().
os.environ.update({
    'SM_CHANNEL_TRAIN': s3_data_dir,
    'SM_OUTPUT_DATA_DIR': s3_output_dir,
    'SM_MODEL_DIR': s3_model_dir,
})
print(s3_data_dir)


s3://cap2-ngandn18/data/


In [None]:
# Fit the HP tuner: the S3 data directory is supplied as the 'train' channel,
# and the call blocks until the tuning job has finished.
tuner.fit(
    inputs={'train': s3_data_dir},
    wait=True,
)


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [None]:
# Retrieve the estimator that the tuner reports as its best one.
best_estimator = tuner.best_estimator()

# Get the hyperparameters of the best trained model.
best_hypers = best_estimator.hyperparameters()
# Print them to see exactly what best_estimator.hyperparameters() returns.
print(best_hypers)

## Model Profiling and Debugging
TODO: Using the best hyperparameters, create and finetune a new model

**Note:** You will need to use the `train_model.py` script to perform model profiling and debugging.

In [6]:
# Debugging and profiling rules, profiler configuration and debugger hook
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import (
    DebuggerHookConfig,
    FrameworkProfile,
    ProfilerConfig,
    ProfilerRule,
    Rule,
    rule_configs,
)

# Built-in debugger rules that are used with their default configuration.
default_debugger_rule_configs = [
    rule_configs.vanishing_gradient(),
    rule_configs.overfit(),
    rule_configs.overtraining(),
    rule_configs.poor_weight_initialization(),
]

# Profiler report first, then the default debugger rules, then the class
# imbalance rule, which is given an explicit labels regex.
rules = [ProfilerRule.sagemaker(rule_configs.ProfilerReport())]
rules.extend(Rule.sagemaker(config) for config in default_debugger_rule_configs)
rules.append(
    Rule.sagemaker(
        base_config=rule_configs.class_imbalance(),
        rule_parameters={"labels_regex": "CrossEntropyLoss_input_1"},
    )
)

# System metrics every 500 ms; framework profiling configured with num_steps=10.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10),
)

# Save intervals for the train and eval modes of the debugger hook.
debugger_hook_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "100",
        "eval.save_interval": "10",
    }
)

In [7]:
# The notebook timed out, so the best hyperparameters are copied by hand from
# the best training job shown in SageMaker. Both values are kept as strings.
batch_size = '16'
lr = '0.0008486603267461696'

In [None]:
# Create and fit an estimator that trains train_model.py with the best
# hyperparameters, with the profiler and debugger attached.
# Import the module itself: `from time import time` would rebind the name
# `time` to a function and shadow the `time` module for every later cell.
import time

begin = time.time()
hyperparameters = {'batch_size': batch_size,
                   'lr': lr} # Training with best parameters

instance_type='ml.g4dn.12xlarge'

estimator = PyTorch(
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type=instance_type,
    entry_point="train_model.py",
    framework_version="1.9",
    py_version="py38",
    hyperparameters=hyperparameters,
    profiler_config=profiler_config, # include the profiler hook
    debugger_hook_config=debugger_hook_config, # include the debugger hook
    rules=rules,
)

# Block until the training job finishes, then report the elapsed wall-clock time.
estimator.fit({'train': s3_data_dir},wait = True)
train_time=time.time() - begin
print(f'Creating Time: {train_time: .01f} s')

2022-01-19 11:46:11 Starting - Starting the training job...
2022-01-19 11:46:35 Starting - Launching requested ML instancesVanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
ClassImbalance: InProgress
ProfilerReport: InProgress
......
2022-01-19 11:47:37 Starting - Preparing the instances for training......
2022-01-19 11:48:40 Downloading - Downloading input data................................................
2022-01-19 11:56:40 Training - Downloading the training image...
2022-01-19 11:57:09 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-01-19 11:57:00,133 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-01-19 11:57:00,172 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[

#### Deploy Model: Because of a timeout problem, the connection to the notebook was lost. Once the training job finishes, we will open a new notebook to deploy the endpoint.

In [None]:
# # Deploy your model to an endpoint
# # Add your deployment configuration like instance type and number of instances
# training_job_name = estimator.latest_training_job.name
# print(f'training_job_name = {training_job_name}')
# # No result: the connection was lost before the training job finished.