# Imports 

In [9]:
%%capture
# Use the %pip magic rather than a "!pip" shell call so the package is installed
# into the environment of the running kernel.
%pip install pandas-profiling[notebook]

In [1]:
%%capture
import pandas as pd
from pandas_profiling import ProfileReport

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
from sagemaker.huggingface import HuggingFace

from sagemaker.debugger import (
    Rule, ProfilerRule, rule_configs, 
    DebuggerHookConfig, ProfilerConfig, FrameworkProfile,
)

# Dataset 

In [67]:
# read the three raw splits from the working directory
train, valid, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "validation.csv", "test.csv")
)

# stack the splits into one frame and renumber the rows from 0
data = pd.concat([train, valid, test]).reset_index(drop=True)

In [None]:
# show the concatenated frame as the cell output
data

In [None]:
# perform a profiler/EDA report on the data
# NOTE: only the `train` split is profiled here; per the original note the whole
# dataset could not fit in memory for profiling. (A `minimal=True` argument was
# left disabled in the original cell.)
profile = ProfileReport(train, title="Quadrant Lyrics Dataset", html={"style": {"full_width": True}})
profile.to_file("EDA.html")

In [6]:
# preprocess the lyrics column
# Lookup table: contraction -> expanded form.
# It contains both "I..." and "i..." variants of the first-person entries.
contraction_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}


def _get_contractions(contraction_dict):
    """
    Build the regex that finds every key of a contraction table.
    :param contraction_dict: mapping of contraction -> expansion
    :return: tuple (the same mapping, compiled regex matching any of its keys)
    """
    # Regex alternation takes the FIRST alternative that matches, not the longest.
    # Longest keys must therefore come first, otherwise "i'd" shadows "i'd've"
    # and the text ends up as "i would've".
    keys_longest_first = sorted(contraction_dict, key=len, reverse=True)
    # Keys are escaped so they are always matched literally.
    alternatives = '|'.join(re.escape(key) for key in keys_longest_first)
    contraction_re = re.compile('(%s)' % alternatives)
    return contraction_dict, contraction_re


contractions, contractions_re = _get_contractions(contraction_dict)


def replace_contractions(text):
    """
    Expand every contraction of the table that occurs in the text.
    Matching is case-sensitive and literal; text without contractions is
    returned unchanged.
    :param text: input string
    :return: the string with each matched contraction replaced by its expansion
    """
    def replace(match):
        # the matched text is always a key of the table
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

# Characters that are replaced by a space: / ( ) { } [ ] | @ , ;
# Raw string: "\[", "\]" and "\|" are regex escapes, not valid Python string
# escapes, and a non-raw literal triggers an invalid-escape warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything that is NOT a digit, a lowercase ASCII letter, a space, '#', '+' or '_'
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK English stopword list, held as a set for the membership test in text_preprocessing
STOPWORDS = set(stopwords.words('english'))

def text_preprocessing(text):
    """
    Normalise one lyric: lowercase, expand contractions, flatten line breaks,
    strip punctuation and digits, and drop English stopwords.
    :param text: raw lyric; anything that is not a str (e.g. a missing value
                 read from CSV) is treated as an empty lyric
    :return: cleaned lyric as a single space-separated string ("" if nothing is left)
    """
    if not isinstance(text, str):
        # missing values arrive as float NaN and would crash on .lower()
        return ""

    text = text.lower() # lower text
    # typographic apostrophes would otherwise bypass the contraction table and be
    # deleted by BAD_SYMBOLS_RE, turning e.g. "i’m" into the junk token "im"
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    text = replace_contractions(text) # expand contractions
    text = text.replace("\n", " ").replace("\r", " ") # remove \n and \r
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace symbols with space
    text = BAD_SYMBOLS_RE.sub('', text) # replace bad characters with nothing
    text = re.sub(r'[0-9]', '', text) # remove residual numbers
    # split() discards leading/trailing/repeated whitespace, so no strip() is needed
    return " ".join(word for word in text.split() if word not in STOPWORDS) # remove stopwords

In [None]:
# clean every lyric; the "lyrics" column is overwritten with the preprocessed text
data["lyrics"] = data["lyrics"].apply(text_preprocessing)

In [None]:
# store the character length of each cleaned lyric in a helper column "len";
# rows shorter than 64 are removed in the following cell
data["len"] = data["lyrics"].apply(len)
data.describe()

In [None]:
# keep only lyrics with at least 64 characters after cleaning
data = data.loc[data["len"] >= 64]
data.describe()

In [None]:
# The helper column is no longer needed. Reassign instead of dropping in place:
# `data` is the result of a boolean filter, and an in-place drop on such a frame
# can raise SettingWithCopyWarning.
data = data.drop(columns=["len"])

In [None]:
# split the preprocessed data
# train_test: fraction of all rows held out as the test set
# train_valid: fraction of the remaining rows kept for training (the rest is validation)
train_test = 0.1
train_valid = 0.8

# fixed random_state so both splits are reproducible
train, test = train_test_split(data, test_size=train_test, random_state=0)
train, valid = train_test_split(train, train_size=train_valid, random_state=0)

print(train.shape, test.shape, valid.shape)

In [None]:
# persist each split next to the notebook, without the pandas index column
for split_frame, csv_name in (
    (train, "preprocessed_train.csv"),
    (test, "preprocessed_test.csv"),
    (valid, "preprocessed_valid.csv"),
):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# generate lyrics.txt and labels.txt for all datasets
def aggregate_lyrics(dataset, lyrics_path, labels_path):
    """
    Aggregate lyrics and their respective labels / quadrant into two parallel
    text files: line i of the lyrics file belongs to line i of the labels file.
    Rows whose lyric is missing or cannot be encoded are skipped in BOTH files
    so the two files stay aligned.
    :param dataset: path to data (CSV with "lyrics" and "quadrant" columns)
    :param lyrics_path: path to lyrics .txt file
    :param labels_path: path to labels .txt file
    :return: None
    """
    import os  # local import keeps this cell self-contained

    data = pd.read_csv(dataset)

    # create the target folders (e.g. data/train/) if they do not exist yet
    for path in (lyrics_path, labels_path):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # explicit encoding so the result does not depend on the machine's locale
    with open(lyrics_path, "w", encoding="utf-8") as lyrics_file, \
            open(labels_path, "w", encoding="utf-8") as labels_file:
        # iterate positionally over both columns in a single pass
        for lyric, label in zip(data["lyrics"], data["quadrant"]):
            if not isinstance(lyric, str):
                # missing lyric (NaN after the CSV round trip): nothing to write
                continue
            line = lyric.replace("\n", " ").replace("\r", " ")
            try:
                # check encodability BEFORE writing so a bad row never leaves a
                # lyric line without its label (or the other way round)
                line.encode("utf-8")
            except UnicodeEncodeError:
                continue
            lyrics_file.write(line + "\n")
            labels_file.write(str(label) + "\n")

In [None]:
# three parallel lists: entry i of each list belongs to the same split
# (input CSV, output lyrics file, output labels file)
dataset_list = ["preprocessed_train.csv", "preprocessed_test.csv", "preprocessed_valid.csv"]
lyrics_path_list = ["data/train/lyrics.txt", "data/test/lyrics.txt", "data/valid/lyrics.txt"]
labels_path_list = ["data/train/labels.txt", "data/test/labels.txt", "data/valid/labels.txt"]

In [None]:
# write lyrics.txt / labels.txt for every split
for split_csv, split_lyrics, split_labels in zip(dataset_list, lyrics_path_list, labels_path_list):
    aggregate_lyrics(split_csv, split_lyrics, split_labels)

In [2]:
# upload data: SageMaker session, its default S3 bucket, and the execution role
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

print(f"Default Bucket: {bucket}")
print(f"RoleArn: {role}")

Default Bucket: sagemaker-us-east-1-019026610741
RoleArn: arn:aws:iam::019026610741:role/service-role/AmazonSageMaker-ExecutionRole-20220119T080026


In [None]:
# push the local "data" folder to the default bucket under the project prefix
local_dir = "data"
prefix = "emotion_recognition_music_lyrics"
inputs = session.upload_data(path=local_dir, bucket=bucket, key_prefix=prefix)
print(f"input spec (in this case, just an S3 path): {inputs}")

# Hyperparameter Tuning 

In [3]:
# hyperparameter ranges
# search space explored by the tuner: "batch-size" and "max-length" are picked
# from fixed choices, "epochs" is an integer range 2..4, "lr" is continuous in
# [2e-5, 1e-4]
hyperparameter_ranges = {
    "batch-size": CategoricalParameter([32, 64, 128]),
    "max-length": CategoricalParameter([64, 128]),
    "epochs": IntegerParameter(2, 4),
    "lr": ContinuousParameter(2e-5, 1e-4),
}

In [4]:
# objective metric definition: the tuner minimises the value that the regex
# captures from the training log
objective_metric_name = "average valid loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Val loss: ([+-]?[0-9\\.]+)",
    }
]

In [5]:
# estimator: runs hpo.py on a single ml.g4dn.xlarge instance
estimator = HuggingFace(
    entry_point="hpo.py",
    role=role,
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version="py36",
)

# tuner: at most 4 training jobs, 2 at a time, with automatic early stopping
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=4,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
)

In [10]:
# Build the channel URIs from the session's default bucket rather than
# hard-coding an account- and region-specific bucket name. The key prefix is the
# one the data was uploaded under.
s3_data_root = f"s3://{bucket}/emotion_recognition_music_lyrics"
input_channels = {
    "train": f"{s3_data_root}/train",
    "valid": f"{s3_data_root}/valid",
    "test": f"{s3_data_root}/test",
}

In [7]:
# fit your Hyperparameter Tuner with data channels included
# wait=True: the cell blocks until the tuning job has finished
tuner.fit(input_channels, wait=True)

....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


In [3]:
# describe the tuning job results
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# the tuning job is looked up by name, so this also works after a kernel restart
exp = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name="huggingface-pytorch--220121-0739"
)
jobs = exp.dataframe()

# highest objective value first; with a "Minimize" objective the best job is the LAST row
jobs.sort_values(by="FinalObjectiveValue", ascending=False)

Unnamed: 0,batch-size,epochs,lr,max-length,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,"""32""",2.0,2.1e-05,"""128""",huggingface-pytorch--220121-0739-003-f7b88d9d,Stopped,1.355,2022-01-21 08:06:57+00:00,2022-01-21 08:25:14+00:00,1097.0
2,"""64""",4.0,9.1e-05,"""64""",huggingface-pytorch--220121-0739-002-8785eb2d,Completed,1.303,2022-01-21 07:42:15+00:00,2022-01-21 08:02:05+00:00,1190.0
3,"""64""",4.0,5.3e-05,"""128""",huggingface-pytorch--220121-0739-001-ac61d889,Completed,1.271,2022-01-21 07:42:08+00:00,2022-01-21 08:10:18+00:00,1690.0
0,"""64""",4.0,2.5e-05,"""128""",huggingface-pytorch--220121-0739-004-facb7740,Completed,1.267,2022-01-21 08:13:21+00:00,2022-01-21 08:42:40+00:00,1759.0


In [4]:
# attaching since Kernel died
# job -004 is the row with the lowest FinalObjectiveValue (1.267) in the tuning results table
BetterTrainingJobName = "huggingface-pytorch--220121-0739-004-facb7740"
attached_estimator = sagemaker.estimator.Estimator.attach(BetterTrainingJobName)


2022-01-21 08:42:40 Starting - Preparing the instances for training
2022-01-21 08:42:40 Downloading - Downloading input data
2022-01-21 08:42:40 Training - Training image download completed. Training in progress.
2022-01-21 08:42:40 Uploading - Uploading generated training model
2022-01-21 08:42:40 Completed - Training job completed


In [5]:
# get best estimator
# the re-attached estimator is used here; the disabled line below is the
# alternative that reads it from the `tuner` object
best_estimator = attached_estimator
# best_estimator = tuner.best_estimator()
best_estimator.hyperparameters()

{'_tuning_objective_metric': 'average valid loss',
 'batch-size': '"64"',
 'epochs': '4',
 'lr': '2.4834388581766214e-05',
 'max-length': '"128"',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"HuggingFace"',
 'sagemaker_estimator_module': '"sagemaker.huggingface.estimator"',
 'sagemaker_job_name': '"huggingface-pytorch-training-2022-01-21-07-39-31-847"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-019026610741/huggingface-pytorch-training-2022-01-21-07-39-31-847/source/sourcedir.tar.gz"'}

# Model Training and Evaluation

In [6]:
# get best hyperparameters
tuned_values = best_estimator.hyperparameters()

# the categorical values come back wrapped in double quotes ('"64"'); unwrap them
best_hyperparameters = {
    key: tuned_values[key].replace('"', "") for key in ("batch-size", "max-length")
}
# the numeric values are passed on unchanged
best_hyperparameters.update({key: tuned_values[key] for key in ("epochs", "lr")})
best_hyperparameters

{'batch-size': '64',
 'max-length': '128',
 'epochs': '4',
 'lr': '2.4834388581766214e-05'}

In [7]:
# set up debugging and profiling rules and hooks
# built-in Debugger rules attached to the training job, plus the profiler report rule
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# system monitoring interval of 500 ms; framework profiling configured with num_steps=10
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, framework_profile_params=FrameworkProfile(num_steps=10)
)

# hook save intervals: 100 for train, 10 for eval (presumably measured in steps — TODO confirm)
debugger_config = DebuggerHookConfig(
    hook_parameters={"train.save_interval": "100", "eval.save_interval": "10"}
)

In [8]:
# create and fit estimator
# same instance type and framework versions as the tuning estimator, but running
# train_model.py with the tuned hyperparameters and the debugger/profiler setup
estimator = HuggingFace(
    entry_point="train_model.py",
    role=role,
    py_version="py36",
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    pytorch_version="1.7.1",
    transformers_version ="4.6.1",
    hyperparameters=best_hyperparameters,
    rules=rules,
    profiler_config=profiler_config,
    debugger_hook_config=debugger_config,
)

In [None]:
# start the training job on the three data channels; wait=False is passed so the call is non-blocking
estimator.fit(input_channels, wait=False)

2022-01-21 09:48:20 Starting - Starting the training job...
2022-01-21 09:48:39 Starting - Preparing the instances for trainingVanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
ProfilerReport: InProgress
.........
2022-01-21 09:50:12 Downloading - Downloading input data
2022-01-21 09:50:12 Training - Downloading the training image.....................
2022-01-21 09:53:43 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-01-21 09:53:43,223 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-01-21 09:53:43,244 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-01-21 09:53:43,251 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-0

In [3]:
# re-attach to the training job by its job name
attached_estimator_train = sagemaker.estimator.Estimator.attach("huggingface-pytorch-training-2022-01-21-09-48-20-112")


2022-01-21 10:51:19 Starting - Preparing the instances for training
2022-01-21 10:51:19 Downloading - Downloading input data
2022-01-21 10:51:19 Training - Training image download completed. Training in progress.
2022-01-21 10:51:19 Uploading - Uploading generated training model
2022-01-21 10:51:19 Completed - Training job completed


In [4]:
# from here on `estimator` refers to the re-attached training job
estimator = attached_estimator_train

In [15]:
# evaluation metrics from the training job (printed as part of the job's logs)
estimator.logs()

2022-01-21 10:51:19 Starting - Preparing the instances for training
2022-01-21 10:51:19 Downloading - Downloading input data
2022-01-21 10:51:19 Training - Training image download completed. Training in progress.
2022-01-21 10:51:19 Uploading - Uploading generated training model
2022-01-21 10:51:19 Completed - Training job completed[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-01-21 09:53:43,223 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-01-21 09:53:43,244 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-01-21 09:53:43,251 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-01-21 09:53:43,743 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_inpu

**Evaluation Metrics Classification Report**

               precision    recall  f1-score   support
           0       0.46      0.61      0.53       548
           1       0.51      0.07      0.12       332
           2       0.38      0.73      0.50       504
           3       1.00      0.00      0.01       352
    accuracy                           0.42      1736
    macro avg      0.59      0.35      0.29      1736
    weighted avg   0.56      0.42      0.34      1736

In [16]:
# Plot a debugging output.
import boto3

# Read the region from a throwaway boto3 session. The result is deliberately NOT
# bound to `session`: that name already holds the sagemaker.Session used by the
# upload cell, and rebinding it to a boto3 session would break `session.upload_data`.
region = boto3.session.Session().region_name

training_job_name = estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

Training jobname: huggingface-pytorch-training-2022-01-21-09-48-20-112
Region: us-east-1


In [17]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# wrap the job for profiler analysis and wait for its system profiling data
tj = TrainingJob(training_job_name, region)
tj.wait_for_sys_profiling_data_to_be_available()

[2022-01-21 11:03:02.280 ip-172-16-19-57:30631 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None


ProfilerConfig:{'S3OutputPath': 's3://sagemaker-us-east-1-019026610741/', 'ProfilingIntervalInMilliseconds': 500, 'ProfilingParameters': {'DataloaderProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "MetricsRegex": ".*", }', 'DetailedProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'FileOpenFailThreshold': '50', 'HorovodProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'LocalPath': '/opt/ml/output/profiler', 'PythonProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "ProfilerName": "cprofile", "cProfileTimer": "total_time", }', 'RotateFileCloseIntervalInSeconds': '60', 'RotateMaxFileSizeInBytes': '10485760', 'SMDataParallelProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }'}}
s3 path:s3://sagemaker-us-east-1-019026610741/huggingface-pytorch-training-2022-01-21-09-48-20-112/profiler-output


Profiler data from system is available


In [18]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

# reader over the job's system metrics; refresh its list of event files before use
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# timeline restricted to the "CPU" and "GPU" dimensions and the "total" event;
# no framework metrics reader is supplied
view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=["CPU", "GPU"],
    select_events=["total"],
)

[2022-01-21 11:03:06.789 ip-172-16-19-57:30631 INFO metrics_reader_base.py:134] Getting 61 event files
select events:['total']
select dimensions:['CPU', 'GPU']
filtered_events:{'total'}
filtered_dimensions:{'GPUMemoryUtilization-nodeid:algo-1', 'CPUUtilization-nodeid:algo-1', 'GPUUtilization-nodeid:algo-1'}


In [19]:
# S3 prefix holding the rule outputs of this training job
# (the job name is appended directly to estimator.output_path, with no separator added)
rule_output_path = f"{estimator.output_path}{estimator.latest_training_job.job_name}/rule-output"
print(f"You will find the profiler report in {rule_output_path}")

You will find the profiler report in s3://sagemaker-us-east-1-019026610741/huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output


In [20]:
# list the rule output objects in S3, then copy them recursively into the current directory
! aws s3 ls {rule_output_path} --recursive
! aws s3 cp {rule_output_path} ./ --recursive

2022-01-21 10:50:07     463016 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-report.html
2022-01-21 10:50:07     326678 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2022-01-21 10:50:00        543 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2022-01-21 10:50:00     185160 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2022-01-21 10:50:00       1992 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
2022-01-21 10:50:00        323 huggingface-pytorch-training-2022-01-21-09-48-20-112/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
2022-01-21 10:50:00       4824 huggingface-pytorch-tra

In [21]:
# Display the profiler output
import os

# get the autogenerated folder name of profiler report:
# the first rule whose configuration name contains "Profiler"
# (the trailing [0] raises IndexError if no such rule exists)
profiler_report_name = [
    rule["RuleConfigurationName"]
    for rule in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in rule["RuleConfigurationName"]
][0]

In [22]:
import IPython

# render the locally copied HTML profiler report inside the notebook
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")

Unnamed: 0,Description,Recommendation,Number of times rule triggered,Number of datapoints,Rule parameters
CPUBottleneck,"Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.",Consider increasing the number of data loaders or applying data pre-fetching.,51,7095,threshold:50  cpu_threshold:90  gpu_threshold:10  patience:1000
LowGPUUtilization,"Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.","Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.",38,7081,threshold_p95:70  threshold_p5:10  window:500  patience:1000
BatchSize,"Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.","The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",27,7080,cpu_threshold_p95:70  gpu_threshold_p95:70  gpu_memory_threshold_p95:70  patience:1000  window:500
GPUMemoryIncrease,Measures the average GPU memory footprint and triggers if there is a large increase.,Choose a larger instance type with more memory if footprint is close to maximum available memory.,22,7081,increase:5  patience:1000  window:10
StepOutlier,"Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.","Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",15,1010,threshold:3  mode:None  n_outliers:10  stddev:3
IOBottleneck,Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.,"Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.",0,7095,threshold:50  io_threshold:50  gpu_threshold:10  patience:1000
LoadBalancing,"Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.",Choose a different distributed training strategy or a different distributed training framework.,0,7081,threshold:0.2  patience:1000
MaxInitializationTime,Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.,"Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.",0,1010,threshold:20
Dataloader,"Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.",Change the number of data loader processes.,0,10,min_threshold:70  max_threshold:200

Unnamed: 0,mean,max,p99,p95,p50,min
Step Durations in [s],1.74,13.99,13.85,13.59,0.03,0.02


# Deployment

In [62]:
# get model data
# NOTE(review): the name `mode_data` looks like a typo for `model_data`; it is
# kept because the model-creation cell reads it under this name
mode_data = estimator.model_data

In [63]:
# create a model using inference.py file
from sagemaker.huggingface import HuggingFaceModel, HuggingFacePredictor
from sagemaker.predictor import json_deserializer, json_serializer

In [64]:
# model object built from the trained artifact, with inference.py as its entry
# point; framework versions match the training estimator
huggingface_model = HuggingFaceModel(
    model_data=mode_data,
    role=role,
    entry_point="inference.py",
    py_version="py36",
    pytorch_version="1.7.1",
    transformers_version ="4.6.1",
)

In [66]:
# deploy the model to one ml.m5.large instance under a fixed endpoint name
predictor = huggingface_model.deploy(initial_instance_count=1, instance_type='ml.m5.large', endpoint_name="lyrics-classifier-endpoint")
# alternative kept from the original: build a predictor for an already deployed endpoint
# predictor = HuggingFacePredictor("huggingface-pytorch-inference-2022-01-21-13-07-39-849")

-------!

In [68]:
# exchange JSON with the endpoint in both directions
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [72]:
# inferencing
test_data = test.head()[["quadrant", "lyrics"]]
test_data

Unnamed: 0,quadrant,lyrics
0,1,I let the beast in too soon I don't know how t...
1,1,"So be it, I’m your crowbar If that’s what I am..."
2,3,(Yeah) I can drink a whole Henessey fifth Some...
3,3,I certainly haven't been shopping for any new ...
4,2,Through the back window of our '59 wagon I wat...


In [79]:
# send each sampled lyric to the endpoint and print the prediction next to the true quadrant
# NOTE(review): the lyrics are sent exactly as stored in `test_data`; whether
# inference.py applies the same cleaning as training is not visible here — confirm
for lyric, quadrant in zip(test_data["lyrics"], test_data["quadrant"]):
    res = predictor.predict(lyric)
    print(f"Prediction: {res}, Actual: {quadrant}")

Prediction: 2, Actual: 1
Prediction: 2, Actual: 1
Prediction: 0, Actual: 3
Prediction: 0, Actual: 3
Prediction: 2, Actual: 2
