# Imports 

In [1]:
%%capture
!pip install pandas-profiling[notebook]

In [1]:
%%capture
import pandas as pd
from pandas_profiling import ProfileReport

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
from sagemaker.huggingface import HuggingFace

# Dataset 

In [None]:
# Load the three CSV splits from the working directory. Later cells rely on the
# "lyrics" and "quadrant" columns being present.
train = pd.read_csv("train.csv") #, header=None, names=["quadrant", "lyrics"], skiprows=1)
valid = pd.read_csv("validation.csv") #, header=None, names=["quadrant", "lyrics"], skiprows=1)
test = pd.read_csv("test.csv") #, header=None, names=["quadrant", "lyrics"], skiprows=1)

# Concatenate the splits into one frame with a fresh 0..n-1 index; the data is
# re-split further down after preprocessing.
data = pd.concat([train, valid, test]).reset_index(drop=True)
# data.drop("df_index", axis=1, inplace=True)

In [None]:
# Show the combined DataFrame as this cell's output
data

In [None]:
# Perform a profiler/EDA report on the data.
# Only the `train` split is profiled, since the whole data could not fit in memory for profiling.
profile = ProfileReport(train, title="Quadrant Lyrics Dataset", html={"style": {"full_width": True}}) #, minimal=True)
profile.to_file("EDA.html")

In [None]:
# preprocess the lyrics column
# Mapping from an English contraction to its expanded form.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


def _get_contractions(contraction_dict):
    """
    Build the lookup table and a compiled regex that matches any contraction.

    :param contraction_dict: mapping of contraction -> expanded form
    :return: tuple (contraction_dict, compiled regex with one capturing group)
    """
    # Regex alternation picks the FIRST alternative that matches, not the longest.
    # Sorting longest-first keeps "i'd" from shadowing "i'd've" (which would
    # otherwise come out as "i would've").
    ordered_keys = sorted(contraction_dict, key=len, reverse=True)
    # re.escape keeps the keys literal even if one ever contains a regex metacharacter.
    contraction_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))
    return contraction_dict, contraction_re


contractions, contractions_re = _get_contractions(contraction_dict)


def replace_contractions(text):
    """
    Expand every known contraction found in ``text``.

    Matching is case-sensitive and not anchored to word boundaries.

    :param text: input string
    :return: ``text`` with each matched contraction replaced by its expansion
    """
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

# Raw string: \[ \] \| are regex escapes, not Python string escapes (a non-raw
# literal triggers an "invalid escape sequence" warning on recent Python versions).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_preprocessing(text):
    """
    Normalise one lyric: lowercase, expand contractions, flatten line breaks,
    strip symbols and digits, and drop English stopwords.

    :param text: raw lyric; a non-string value (e.g. NaN for a missing lyric)
                 is treated as an empty lyric
    :return: cleaned, space-separated string (possibly empty)
    """
    if not isinstance(text, str):
        # Missing lyrics would otherwise crash on .lower(); an empty result has
        # length 0, which the length filter further down removes.
        return ""

    text = text.lower() # lower text
    text = replace_contractions(text) # expand contractions
    text = text.replace("\n", " ").replace("\r", " ") # remove \n and \r
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace symbols with space
    text = BAD_SYMBOLS_RE.sub('', text) # replace bad characters with nothing
    text = re.sub(r'[0-9]', '', text) # remove residual numbers
    text = text.strip()
    text = " ".join([word for word in text.split() if word not in STOPWORDS]) # remove stopwords

    return text

In [None]:
# Replace each raw lyric with its preprocessed version
cleaned_lyrics = data["lyrics"].apply(text_preprocessing)
data["lyrics"] = cleaned_lyrics

In [None]:
# Character count of every cleaned lyric, stored in a helper "len" column;
# lyrics with fewer than 64 characters are removed afterwards.
data["len"] = data["lyrics"].map(len)
data.describe()

In [None]:
# Keep only lyrics with at least 64 characters. `.copy()` makes the result an
# independent frame, so later in-place edits do not act on a slice of the
# unfiltered frame (SettingWithCopyWarning).
data = data.loc[data["len"] >= 64].copy()
data.describe()

In [None]:
# Drop the helper column by reassignment instead of in place: `data` is a filtered
# frame at this point, and errors="ignore" keeps the cell safe to re-run.
data = data.drop(columns=["len"], errors="ignore")

In [None]:
# split the preprocessed data
# train_test: fraction of all rows held out as the test set
# train_valid: fraction of the remaining rows kept for training (the rest is validation)
# => roughly 72% train / 18% valid / 10% test of the preprocessed data
train_test = 0.1
train_valid = 0.8

# random_state=0 is passed to both calls; `train` is rebound twice: first to the
# non-test rows, then to the training part of those rows.
train, test = train_test_split(data, test_size=train_test, random_state=0)
train, valid = train_test_split(train, train_size=train_valid, random_state=0)

print(train.shape, test.shape, valid.shape)

In [None]:
# Persist each split as CSV without the index column
for split_frame, split_csv in (
    (train, "preprocessed_train.csv"),
    (test, "preprocessed_test.csv"),
    (valid, "preprocessed_valid.csv"),
):
    split_frame.to_csv(split_csv, index=False)

In [None]:
# generate lyrics.txt and labels.txt for all datasets
def aggregate_lyrics(dataset, lyrics_path, labels_path):
    """
    Write the lyrics of a dataset and their labels (quadrants) to two aligned
    text files: line i of ``lyrics_path`` and line i of ``labels_path`` belong
    to the same row. Line breaks inside a lyric are replaced by spaces.

    Rows with faulty data (a lyric that is not a string or cannot be encoded)
    are skipped in both files so that the files stay aligned.

    :param dataset: path to a CSV file with "lyrics" and "quadrant" columns
    :param lyrics_path: path to lyrics .txt file (parent folders are created)
    :param labels_path: path to labels .txt file (parent folders are created)
    :return: None
    """
    import os  # local import so the function does not depend on the import cell

    data = pd.read_csv(dataset)

    # open(..., "w") does not create missing folders such as data/train/.
    for out_path in (lyrics_path, labels_path):
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # An explicit encoding makes the output independent of the platform locale.
    with open(lyrics_path, "w", encoding="utf-8") as lyrics_file, \
            open(labels_path, "w", encoding="utf-8") as labels_file:
        for lyric, label in zip(data["lyrics"], data["quadrant"]):
            if not isinstance(lyric, str):
                continue  # missing lyric: skip the row in both files
            line = lyric.replace("\n", " ").replace("\r", " ") + "\n"
            try:
                lyrics_file.write(line)
            except UnicodeEncodeError:
                continue  # lyric could not be encoded: skip its label as well
            labels_file.write(str(label) + "\n")

In [None]:
# Parallel lists: entry i of each list belongs to the same split (train, test, valid)
dataset_list = [
    "preprocessed_train.csv",
    "preprocessed_test.csv",
    "preprocessed_valid.csv",
]
lyrics_path_list = [
    "data/train/lyrics.txt",
    "data/test/lyrics.txt",
    "data/valid/lyrics.txt",
]
labels_path_list = [
    "data/train/labels.txt",
    "data/test/labels.txt",
    "data/valid/labels.txt",
]

In [None]:
# Write lyrics.txt / labels.txt for every split
for csv_path, lyrics_txt, labels_txt in zip(dataset_list, lyrics_path_list, labels_path_list):
    aggregate_lyrics(csv_path, lyrics_txt, labels_txt)

In [2]:
# SageMaker session, its default S3 bucket and the execution role
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

print(f"Default Bucket: {bucket}")
print(f"RoleArn: {role}")

Default Bucket: sagemaker-us-east-1-019026610741
RoleArn: arn:aws:iam::019026610741:role/service-role/AmazonSageMaker-ExecutionRole-20220119T080026


In [None]:
# Upload the local "data" folder to the bucket under the project prefix
local_dir = "data"
prefix = "emotion_recognition_music_lyrics"
inputs = session.upload_data(path=local_dir, bucket=bucket, key_prefix=prefix)
print(f"input spec (in this case, just an S3 path): {inputs}")

In [3]:
# Saved S3 path, kept here to avoid uploading the inputs again
# inputs = "s3://sagemaker-us-east-1-019026610741/emotion_recognition_music_lyric"
# print("input spec (in this case, just an S3 path): {}".format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-019026610741/emotion_recognition_music_lyric


# Hyperparameter Tuning 

In [3]:
# hyperparameter ranges
# Search space passed to the tuner: categorical choices for batch-size and
# max-length, an integer range for epochs, a continuous range for lr.
# NOTE(review): the key names presumably have to match the arguments parsed by hpo.py — confirm.
hyperparameter_ranges = {
    "batch-size": CategoricalParameter([32, 64, 128]),
    "max-length": CategoricalParameter([64, 128]),
    "epochs": IntegerParameter(2, 4),
    "lr": ContinuousParameter(2e-5, 1e-4),
}

In [4]:
# Objective of the tuning job: minimise the metric captured by the regex below
objective_metric_name = "average valid loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Val loss: ([+-]?[0-9\\.]+)",
    }
]

In [5]:
# Training job definition: runs hpo.py on a single ml.g4dn.xlarge instance
estimator = HuggingFace(
    entry_point="hpo.py",
    role=role,
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version="py36",
)

# Tuning job: at most 4 training jobs, 2 of them in parallel
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=4,
    max_parallel_jobs=2,
    early_stopping_type="Auto",
)

In [6]:
# S3 location of each uploaded split, one channel per split. The bucket comes from
# the session's default bucket instead of a hard-coded, account-specific name.
s3_data_root = "s3://{}/emotion_recognition_music_lyrics".format(bucket)
input_channels = {
    "train": s3_data_root + "/train",
    "valid": s3_data_root + "/valid",
    "test": s3_data_root + "/test",
}

In [7]:
# fit your Hyperparameter Tuner with data channels included
# NOTE(review): wait=True presumably blocks this cell until the tuning job ends — confirm.
tuner.fit(input_channels, wait=True)

.................................................................................................................................................................................__s

Job ended with status 'Stopped' rather than 'Completed'. This could mean the job timed out or stopped early for some other reason: Consider checking whether it completed as you expect.



