In [None]:
import json
from pathlib import Path

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
# S3 locations shared by the rest of the notebook.
bucket_name = 'my_bucket_name'
data_set = 'jpeg/train'
region = boto3.Session().region_name
# HTTPS form of the same location, kept for reference:
# s3train_path = 'https://s3-{}.amazonaws.com/{}/{}'.format(region, bucket_name, data_set)
s3train_path = f's3://{bucket_name}/{data_set}/'

In [None]:
# Create the SageMaker session, look up the execution role, and resolve the
# container image URI for the 'image-classification' algorithm in the
# session's region.
sess = sagemaker.Session()
role = get_execution_role()
training_image = get_image_uri(
    sess.boto_region_name,
    'image-classification',
    repo_version='latest',
)

In [None]:
# Training output is written below the bucket's 'jpeg' prefix.
s3_output_location = f's3://{bucket_name}/jpeg/output'

The recommended input format for the Amazon SageMaker image classification algorithm is Apache MXNet RecordIO. With that format, the training process can use the data in pipe mode rather than file mode. Pipe mode is faster because it creates a stream of data instead of copying batches of data one by one. In pipe mode the training job does not have to wait for the data to arrive before it starts processing each batch. However, we can also keep JPEG or PNG files and still use pipe mode by supplying an augmented manifest file. Find more information here: https://docs.aws.amazon.com/sagemaker/latest/dg/image-classification.html

# make augmented manifest file for train and validation sets

In [None]:
# Load the training metadata and derive each image's file name by appending
# the '.jpg' extension to its 'image_name'.
path = 'path_to_train.csv'
df = pd.read_csv(path).assign(
    file_name=lambda frame: frame['image_name'] + '.jpg'
)

In [None]:
# Make the train and validation sets: shuffle the rows, then hold out the
# first tenth for validation and keep the remainder for training.
# The shuffle is seeded so that Restart & Run All reproduces the same split
# (and therefore the same manifests and training-set size).
from sklearn.utils import shuffle

SPLIT_SEED = 42
df = shuffle(df, random_state=SPLIT_SEED)
val_size = len(df) // 10
df_val = df.iloc[:val_size]
df = df.iloc[val_size:]

In [None]:
# Oversample the malignant class (target == 1): add 55 extra copies of every
# malignant training row to make the data more balanced, then reshuffle.
# A single pd.concat replaces the DataFrame.append loop: append was removed in
# pandas 2.0 and the loop re-copied the growing frame on every iteration.
# NOTE: this cell is not idempotent - re-running it oversamples the already
# oversampled frame again.
N_MALIGNANT_COPIES = 55
df_mallignant = df[df['target'] == 1]
df = pd.concat([df] + [df_mallignant] * N_MALIGNANT_COPIES)
# Seeded so the row order of the training manifest is reproducible.
df = shuffle(df, random_state=0)

In [None]:
# Make the augmented manifest file for the train dataset: one JSON object per
# line holding the image's S3 location ('source-ref') and its label ('class').
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df (SettingWithCopyWarning).
df_manifest = df[['file_name', 'target']].copy()
df_manifest['file_name'] = 's3://{}/jpeg/train/'.format(bucket_name) + df_manifest['file_name']
df_manifest.columns = ['source-ref', 'class']
# Serialise row by row rather than patching the text of one big JSON array;
# int() makes the label a plain Python int that json.dumps accepts.
out = '\n'.join(
    json.dumps({'source-ref': source_ref, 'class': int(label)})
    for source_ref, label in zip(df_manifest['source-ref'], df_manifest['class'])
)
with open('../data/train_manifest', 'w') as f:
    f.write(out)
s3_client = boto3.client('s3')
s3_client.upload_file('../data/train_manifest', Bucket=bucket_name, Key='jpeg/train_manifest')

In [None]:
# Make the augmented manifest file for the validation dataset: one JSON object
# per line holding the image's S3 location ('source-ref') and its label
# ('class'). The validation images live under the same 'jpeg/train/' prefix.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df_val (SettingWithCopyWarning).
df_val_manifest = df_val[['file_name', 'target']].copy()
df_val_manifest['file_name'] = 's3://{}/jpeg/train/'.format(bucket_name) + df_val_manifest['file_name']
df_val_manifest.columns = ['source-ref', 'class']
# Serialise row by row rather than patching the text of one big JSON array;
# int() makes the label a plain Python int that json.dumps accepts.
out = '\n'.join(
    json.dumps({'source-ref': source_ref, 'class': int(label)})
    for source_ref, label in zip(df_val_manifest['source-ref'], df_val_manifest['class'])
)
with open('../data/validation_manifest', 'w') as f:
    f.write(out)
s3_client = boto3.client('s3')
s3_client.upload_file('../data/validation_manifest', Bucket=bucket_name, Key='jpeg/validation_manifest')

# train in pipe mode with augmented manifest files

In [None]:
# Estimator for the training image resolved earlier, with its input mode set
# to "Pipe".
# To start from an existing model, additionally pass
#   model_uri='s3-address-of-an-existing-model'
estimator_options = dict(
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    train_volume_size=20,
    train_max_run=3600 * 5,
    input_mode='Pipe',
    output_path=s3_output_location,
    sagemaker_session=sess,
)
model = sagemaker.estimator.Estimator(training_image, role, **estimator_options)

In [None]:
# Hyperparameters that describe the data and the starting point of the model.
base_hyperparameters = dict(
    use_pretrained_model=1,
    image_shape='3,224,224',
    num_classes=2,
    # Taken from the training frame that was written to the train manifest,
    # so the value stays correct whenever the split or oversampling changes
    # (it used to be the hard-coded 58084 of one particular run).
    num_training_samples=len(df),
)

# Settings that are tuned from run to run, layered on top of the base set.
hyperparameters = {
    **base_hyperparameters,
    **dict(
        learning_rate=0.001,
        mini_batch_size=32,
        num_layers=101,
        epochs=4,
        top_k=2,
    ),
}

model.set_hyperparameters(**hyperparameters)

In [None]:
# Both channels read an augmented manifest file in Pipe mode, so they share
# every option except the manifest location.
# (If RecordIO files were used instead, s3_data_type would be 'S3Prefix'.)
manifest_channel_options = dict(
    distribution='FullyReplicated',
    content_type='application/x-recordio',
    s3_data_type='AugmentedManifestFile',
    attribute_names=['source-ref', 'class'],
    input_mode='Pipe',
    record_wrapping='RecordIO',
)

train_data = sagemaker.session.s3_input(
    f's3://{bucket_name}/jpeg/train_manifest',
    **manifest_channel_options,
)

validation_data = sagemaker.session.s3_input(
    f's3://{bucket_name}/jpeg/validation_manifest',
    **manifest_channel_options,
)

data_channels = dict(train=train_data, validation=validation_data)

In [None]:
# Start the training job on both channels under a fixed job name, with logs on.
training_job_name = 'my_training_job_name'
model.fit(inputs=data_channels, job_name=training_job_name, logs=True)

In [None]:
# Report where the latest training job's output can be found.
job = model.latest_training_job
job_name = job.name
print("available for download at: {}/{}/".format(model.output_path, job_name))

# batch inference

In [None]:
# S3 location of the model archive written by the training job.
model_addr = f's3://{bucket_name}/jpeg/output/my_training_job_name/output/model.tar.gz'

In [None]:
# Build a Model from the stored artefact, reusing the training image.
# NOTE: this rebinds `model`, which until here referred to the Estimator.
model_options = dict(
    model_data=model_addr,
    image=training_image,
    role=role,
    sagemaker_session=sess,
)
model = sagemaker.model.Model(**model_options)


In [None]:
# Input: the test images. Output: an 'inference_output' folder below the
# training output location, where the batch transform job stores its results.
batch_input = f's3://{bucket_name}/jpeg/test'
batch_output = f'{s3_output_location}/inference_output'

In [None]:
# Batch inference in file mode.
# I tried doing it in pipe mode (by providing manifest files), but that does
# not work: it raises no error and produces no output files.
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=batch_output,
)
transformer.transform(
    data=batch_input,
    data_type='S3Prefix',
    content_type='image/jpeg',
    split_type=None,
)
transformer.wait()

In [None]:
# This is how I tried doing the batch inference in pipe mode, driven by a
# manifest file instead of an S3 prefix. Kept for reference: for me this
# attempt raised no error but produced no output files.
test_manifest = 's3://{}/jpeg/test_manifest'.format(bucket_name)
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.4xlarge',
    assemble_with='Line',
    output_path=s3_output_location,
)
# The transform call was previously fused onto the end of the statement above,
# which is a SyntaxError; it is now a statement of its own.
transformer.transform(
    data=test_manifest,
    data_type='ManifestFile',
    # If no name is given, one is generated automatically from the date and time.
    job_name='melanoma-first-transform-job-7',
    content_type='application/x-recordio',
    split_type='Line',
)
transformer.wait()

## make submission file on cloud

In [None]:
from io import BytesIO
import os, boto3
import json

# Build the submission CSV from the batch-transform output objects in S3:
# one "image_name,target" row per object, where target is the rounded value at
# index 1 of the object's 'prediction' list. The CSV is then uploaded to S3.
SUBMISSION_BUCKET = 'mykagglebuckettobeusedbyfastai'
OUTPUT_PREFIX = 'jpeg/outputbatch_inference_34_6epochs/ISIC_'

client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=SUBMISSION_BUCKET, Prefix=OUTPUT_PREFIX)

# Collect rows in a list and join once instead of growing a string in the loop.
rows = ["image_name,target"]
for page in page_iterator:
    # A page has no 'Contents' entry when nothing matches the prefix.
    for content in page.get('Contents', []):
        client.download_file(Bucket=SUBMISSION_BUCKET, Key=content['Key'], Filename='tempFile')
        with open('tempFile') as f:
            data = json.load(f)
        value = round(data['prediction'][1], 4)
        # Image name = last path component of the key with its two extensions
        # removed. (Previously the key was sliced by the length of a literal
        # that differed from the listing prefix and only lined up by chance.)
        object_name = content['Key'].rsplit('/', 1)[-1]
        image_name = os.path.splitext(os.path.splitext(object_name)[0])[0]
        rows.append(image_name + "," + str(value))
output = "\n".join(rows) + "\n"

with open('submission.csv', 'w') as sub:
    sub.write(output)
client.upload_file('submission.csv', SUBMISSION_BUCKET, 'jpeg/submissions/submission_34_6epochs')

## make submission file on my local computer

Just keeping this code for reference

Before trying to make the submission file on cloud, I downloaded the whole folder (made a zip file) that contains the output of batch transform to my own computer. Then I made the submission file as follows:

In [None]:
import os;
import json

# Local variant of the submission builder: reads every batch-transform output
# file from 'output_folder' and writes 'my_submission.csv' next to it.
# `path` cannot be used here: in this notebook it is the string
# 'path_to_train.csv', so `path / 'output_folder'` would raise TypeError.
local_dir = Path('.')  # placeholder: folder that contains the unzipped 'output_folder'
output_dir = local_dir / 'output_folder'

# Collect rows in a list and join once instead of growing a string in the loop.
rows = ["image_name,target"]
# sorted() makes the row order deterministic; os.listdir order is arbitrary.
for file in sorted(os.listdir(output_dir)):
    # Strip the two extensions of the output file name to get the image name.
    jpg_name = os.path.splitext(os.path.splitext(file)[0])[0]
    with open(output_dir / file) as f:
        data = json.load(f)
    value = round(data['prediction'][1], 4)
    rows.append(jpg_name + "," + str(value))
output = "\n".join(rows) + "\n"

with open(local_dir / 'my_submission.csv', 'w') as sub:
    sub.write(output)

# Image Classification Hyperparameters

augmentation_type : crop. Randomly crop the image and flip the image horizontally.

image_shape : string. Typical image dimensions for image classification are '3, 224, 224'. This is similar to the ImageNet dataset.

lr_scheduler_factor : float. The ratio to reduce learning rate used in conjunction with the lr_scheduler_step parameter, defined as lr_new = lr_old * lr_scheduler_factor.

lr_scheduler_step : string. The epochs at which to reduce the learning rate. As explained in the lr_scheduler_factor parameter, the learning rate is reduced by lr_scheduler_factor at these epochs. For example, if the value is set to "10, 20", then the learning rate is reduced by lr_scheduler_factor after 10th epoch and again by lr_scheduler_factor after 20th epoch. The epochs are delimited by ",".

momentum : float. The momentum for sgd and nag, ignored for other optimizers.

num_layers : Number of layers for the network. For data with large image size (for example, 224x224 - like ImageNet), we suggest selecting the number of layers from the set [18, 34, 50, 101, 152, 200]. For data with small image size (for example, 28x28 - like CIFAR), we suggest selecting the number of layers from the set [20, 32, 44, 56, 110]. The number of layers in each set is based on the ResNet paper. For transfer learning, the number of layers defines the architecture of base network and hence can only be selected from the set [18, 34, 50, 101, 152, 200].

optimizer : default sgd, adam, rmsprop, nag

precision_dtype : Default value: float32. The precision of the weights used for training. The algorithm can use either single precision (float32) or half precision (float16) for the weights. Using half-precision for weights results in reduced memory consumption. Valid values: float32 or float16

resize : positive integer. Resizes the image before using it for training. The images are resized so that the shortest side has the number of pixels specified by this parameter. If the parameter is not set, then the training data is used without resizing.

use_weighted_loss : Flag to use weighted cross-entropy loss for multi-label classification (used only when multi_label = 1), where the weights are calculated based on the distribution of classes.

use_pretrained_model : Valid values: 0 or 1

mini_batch_size : integer, default 32

learning_rate : default 0.1

# Questions to look into when the need arises

### how to do image normalization with sagemaker image classification?
fastai does some image preprocessing (resize and transforms), and normalization is one of them. I cannot find how to do normalization in SageMaker image classification.

I got AUCROC of 0.85 with resnet34 and only 2 epochs when I used fastai. With sagemaker, I got 0.60. I managed to improve it to 0.81 by doing some preprocessing: 

fastai does some preprocessing, like random horizontal flip or crop centre, and it does the resizing to 224*224
These can be done in sagemaker by
```
        augmentation_type= 'crop',
        resize= 224
```
in the hyperparameters, but these are done by the CPU. So CPU usage goes up (close to maximum) and GPU usage stays low at 15-20%.

### GPU usage
When doing preprocessing, my CPU usage is close to maximum and my GPU usage is minimal. That is because the preprocessing tasks are done by the CPU. I do not know what we can do about it. Things like resizing might be better done once and saved with the data. But random crop and horizontal flip, I believe, should be done on each epoch separately.

For fastai I used file mode for transferring the data from S3 for the training process. I read that pipe mode is faster because the GPU does not have to wait for files to be downloaded as in file mode. Instead, we have a stream of data that makes the data available to the GPU before it needs it. So I used pipe mode when I started with SageMaker image classification.

I think it did speed up the process compared to the time that fastai needed, but since fastai did normalization as well, I cannot be sure. And I used a smaller batch size with fastai.

Nevertheless, the GPU usage does not go over 40%, even if I do no preprocessing. I assume the GPU is still waiting for the data to arrive (even in pipe mode). I just read that if the data is not too big, such that it can fit in CPU memory, then it is better to use file mode, because it downloads the files once and uses them for several epochs, while in pipe mode the data is transferred from S3 for every epoch. I checked the memory sizes and it looks like there is enough space for my training data if I use an ml.p3.2xlarge instance, as it has 61G of memory. I am still not sure whether I have to do something to let the algorithm know that I want the data to be kept in memory or whether it will do so automatically.

### one annoying issue:
I want to start training immediately to compare the utilizations.
I wish I could set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 so that it does not spend a lot of time running performance tests to find the best convolution algorithm. That is not an easy job if you do not go into the details of the Docker container. I'm not even sure what this autotune does.