In [24]:
%%capture cap --no-stderr
!pip install pyarrow
!pip install spacy
!pip install pandarallel
!pip install fastparquet

In [17]:
%load_ext autoreload
%autoreload 2

## Load Data

In [128]:
import sagemaker
import pandas as pd
import numpy as np
import os

In [129]:
data_dir = "./data/"
data_path = os.path.join(data_dir, "commit_messages.parquet")
tagged_data_path = os.path.join(data_dir, "commit_messages_with_label.parquet")
tagged_trainset_path = os.path.join(data_dir, "labeled_commit_message_train.parquet")
tagged_devset_path = os.path.join(data_dir, "labeled_commit_message_dev.parquet")
tagged_testset_path = os.path.join(data_dir, "labeled_commit_message_test.parquet")
print(data_path)
print(tagged_data_path)
print("...", tagged_trainset_path)
print("...", tagged_devset_path)
print("...", tagged_testset_path)

./data/commit_messages.parquet
./data/commit_messages_with_label.parquet
... ./data/labeled_commit_message_train.parquet
... ./data/labeled_commit_message_dev.parquet
... ./data/labeled_commit_message_test.parquet


In [130]:
train_set = pd.read_parquet(tagged_trainset_path)
dev_set = pd.read_parquet(tagged_devset_path)
test_set = pd.read_parquet(tagged_testset_path)
print(train_set.shape, dev_set.shape, test_set.shape)
display(train_set.head())

(562500, 11) (187500, 11) (250000, 11)


Unnamed: 0,name,time_sec,subject,message,first_segment_message,identifier,length_ok,capital_first_token,not_period_end,imperative_mood,good_message
0,Magenik,1421858642,Timeline update,Timeline update\n,Timeline update\n,True,True,True,True,False,False
1,polytomous,1509246803,Extract parameter on breakCurrentBlock TF2John,Extract parameter on breakCurrentBlock TF2John\n,Extract parameter on breakCurrentBlock TF2John\n,False,True,True,True,True,False
2,mero,1348444108,updated changelog,updated changelog\n,updated changelog\n,True,True,False,True,True,False
3,YominCarr,1414094992,Cleanup,Cleanup\n,Cleanup\n,False,True,True,True,False,False
4,Anthony Fuentes,1556550569,[maint] Updating copyright,[maint] Updating copyright\n,[maint] Updating copyright\n,True,True,False,True,True,False


## Feature Engineering (Training Set)

In [126]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import os

In [119]:
data_dir = "./data/"
data_path = os.path.join(data_dir, "commit_messages.parquet")
tagged_data_path = os.path.join(data_dir, "commit_messages_with_label.parquet")
tagged_trainset_path = os.path.join(data_dir, "labeled_commit_message_train.parquet")
tagged_devset_path = os.path.join(data_dir, "labeled_commit_message_dev.parquet")
tagged_testset_path = os.path.join(data_dir, "labeled_commit_message_test.parquet")
print(data_path)
print(tagged_data_path)
print("...", tagged_trainset_path)
print("...", tagged_devset_path)
print("...", tagged_testset_path)

./data/commit_messages.parquet
./data/commit_messages_with_label.parquet
... ./data/labeled_commit_message_train.parquet
... ./data/labeled_commit_message_dev.parquet
... ./data/labeled_commit_message_test.parquet


In [121]:
train_set = pd.read_parquet(tagged_trainset_path)
dev_set = pd.read_parquet(tagged_devset_path)
test_set = pd.read_parquet(tagged_testset_path)
print(train_set.shape, dev_set.shape, test_set.shape)
display(train_set.head())

(562500, 11) (187500, 11) (250000, 11)


Unnamed: 0,name,time_sec,subject,message,first_segment_message,identifier,length_ok,capital_first_token,not_period_end,imperative_mood,good_message
0,Magenik,1421858642,Timeline update,Timeline update\n,Timeline update\n,True,True,True,True,False,False
1,polytomous,1509246803,Extract parameter on breakCurrentBlock TF2John,Extract parameter on breakCurrentBlock TF2John\n,Extract parameter on breakCurrentBlock TF2John\n,False,True,True,True,True,False
2,mero,1348444108,updated changelog,updated changelog\n,updated changelog\n,True,True,False,True,True,False
3,YominCarr,1414094992,Cleanup,Cleanup\n,Cleanup\n,False,True,True,True,False,False
4,Anthony Fuentes,1556550569,[maint] Updating copyright,[maint] Updating copyright\n,[maint] Updating copyright\n,True,True,False,True,True,False


In [122]:
train_set_text_features = train_set.loc[:, ["subject", "first_segment_message"]]
train_set_nontext_features = train_set.loc[:, ["length_ok", "capital_first_token", "not_period_end"]]
train_set_labels = train_set.loc[:, "good_message"]
print(train_set_text_features.shape, train_set_nontext_features.shape, train_set_labels.shape)

(562500, 2) (562500, 3) (562500,)


In [127]:
tokenizer = Tokenizer(num_words=100000, lower=True, oov_token="<unk>")

for col in train_set_text_features.columns:
    print(f"Tokenizer Learn to fit on feature col <{col}> ... ")
    focus_texts = train_set_text_features[col]
    tokenizer.fit_on_texts(focus_texts)

print(f"Tokenizer contains {tokenizer.word_counts.items().__len__()} unique tokens...")
_ = [print(f"[{idx}]: <Token: {kv[0]}> --> <ID: {kv[1]}>") for idx, kv in enumerate(tokenizer.word_counts.items()) if idx < 5]

print("Tokenizer Started to Convert Text Features to Sequences ...")

Tokenizer Learn to fit on feature col <subject> ... 
Tokenizer Learn to fit on feature col <first_segment_message> ... 
Tokenizer contains 243430 unique tokens...
[0]: <Token: timeline> --> <ID: 280>
[1]: <Token: update> --> <ID: 110756>
[2]: <Token: extract> --> <ID: 1075>
[3]: <Token: parameter> --> <ID: 2622>
[4]: <Token: on> --> <ID: 36728>
Tokenizer Started to Convert Text Features to Sequences ...


## Pipeline Feature Engineering (Transform Feature)

In [159]:
TEXT_FEATURE_COLUMNS = ["subject", "first_segment_message"]
NONTEXT_FEATURE_COLUMNS = ["length_ok", "capital_first_token", "not_period_end"]
LABEL_COLUMNS = "good_message"
PRESET_TOKENIZER = tokenizer
MAX_LEN = 200

for focus_set_tag, focus_set in zip(["train", "dev", "test"], [train_set, dev_set, test_set]):
    print(f"\n>>> Working on {focus_set_tag} set")
    
    focus_set_text_features = focus_set.loc[:, TEXT_FEATURE_COLUMNS]
    focus_set_nontext_features = focus_set.loc[:, NONTEXT_FEATURE_COLUMNS]
    focus_set_labels = focus_set.loc[:, LABEL_COLUMNS]
    print(focus_set_text_features.shape, focus_set_nontext_features.shape, focus_set_labels.shape)

    
    TRANSFORMED_FEATURES_DICT = {"subject": None, "first_segment_message": None, "nontext_features": None}

    for col in focus_set_text_features.columns:
        print(f"Transforming the text features <{col}> ... ")
        focus_texts = focus_set_text_features[col]
        TRANSFORMED_FEATURES_DICT[col] = pad_sequences(tokenizer.texts_to_sequences(focus_set_text_features["subject"]), maxlen=MAX_LEN, padding="post", truncating="post")

    focus_set_nontext_features = focus_set_nontext_features.astype("int32").values
    TRANSFORMED_FEATURES_DICT["nontext_features"] = focus_set_nontext_features

    _ = [print(f"[{idx}]: <Key: {kv[0]}> --> <Shape of Attributes: {kv[1].shape}>") for idx, kv in enumerate(TRANSFORMED_FEATURES_DICT.items()) if idx < 5]
    
    CONCATENATED_FEATURES = np.hstack([v for v in TRANSFORMED_FEATURES_DICT.values()])
    CONCATENATED_LABELS = focus_set_labels.astype("int32").values.reshape(-1, 1)
    CONCATENATED_DATASET = pd.DataFrame(np.hstack([CONCATENATED_LABELS, CONCATENATED_FEATURES]))
    print(CONCATENATED_FEATURES.shape, CONCATENATED_LABELS.shape, CONCATENATED_DATASET.shape)
    CONCATENATED_DATASET.to_csv(f"data/encoded/encoded_{focus_set_tag}_set.csv", index=False, header=False)


>>> Working on train set
(562500, 2) (562500, 3) (562500,)
Transforming the text features <subject> ... 
Transforming the text features <first_segment_message> ... 
[0]: <Key: subject> --> <Shape of Attributes: (562500, 200)>
[1]: <Key: first_segment_message> --> <Shape of Attributes: (562500, 200)>
[2]: <Key: nontext_features> --> <Shape of Attributes: (562500, 3)>
(562500, 403) (562500, 1) (562500, 404)

>>> Working on dev set
(187500, 2) (187500, 3) (187500,)
Transforming the text features <subject> ... 
Transforming the text features <first_segment_message> ... 
[0]: <Key: subject> --> <Shape of Attributes: (187500, 200)>
[1]: <Key: first_segment_message> --> <Shape of Attributes: (187500, 200)>
[2]: <Key: nontext_features> --> <Shape of Attributes: (187500, 3)>
(187500, 403) (187500, 1) (187500, 404)

>>> Working on test set
(250000, 2) (250000, 3) (250000,)
Transforming the text features <subject> ... 
Transforming the text features <first_segment_message> ... 
[0]: <Key: subjec

## Localized Model Training

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Lambda, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.metrics import Accuracy
import tensorflow as tf
import pandas as pd
import os

In [23]:
def SimpleNet(input_dim=403, embedding_vocab_size=100000, embedding_dim=32, sequence_size=200):
    input_layer = Input(shape=(input_dim,))

    text_x = Lambda(lambda x: x[:, : sequence_size * 2])(input_layer)
    rule_x = Lambda(lambda x: x[:, sequence_size * 2 - input_dim :])(input_layer)

    text_emb = Embedding(embedding_vocab_size, embedding_dim)(text_x)

    text_pool = GlobalMaxPooling1D()(text_emb)

    concat_x = Concatenate()([text_pool, rule_x])

    x = Dense(32, activation='relu')(concat_x)
    output_layer = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
        
    return model

In [11]:
def acquire_inputs(input_data_dict):
    """
    acquire the training set feature, label and validation set feature, label
    wrap them into the proper manner that fit into the model training procedure
    """
    
    train_data_path = os.path.join(input_data_dict['train'], "encoded_train_set.csv")
    validation_data_path = os.path.join(input_data_dict['validation'], "encoded_dev_set.csv")
    print("... Training Data Will be Acquired From: {}".format(train_data_path))
    print("... Validation Data Will be Acquired From: {}".format(validation_data_path))
    encoded_trainset = pd.read_csv(train_data_path, header=None)
    encoded_devset = pd.read_csv(validation_data_path, header=None)

    encoded_trainset_feature = encoded_trainset.iloc[:, 1:].values
    encoded_trainset_label = encoded_trainset.iloc[:, 0].values
    encoded_trainset = None
    print("Successfully Load Data, Shape of the features:", encoded_trainset_feature.shape)

    encoded_devset_feature = encoded_devset.iloc[:, 1:].values
    encoded_devset_label = encoded_devset.iloc[:, 0].values
    encoded_devset = None
    print("Successfully Load Data, Shape of the features:", encoded_devset_feature.shape)
    return encoded_trainset_feature, encoded_trainset_label, (encoded_devset_feature, encoded_devset_label)

encoded_data_dir = "./data/encoded"
channel_input_dirs = {"train": encoded_data_dir, "validation": encoded_data_dir, "test": encoded_data_dir}
x_train, y_train, validation_data = acquire_inputs(channel_input_dirs)
print(x_train[:5])
print(y_train[:5])
print(validation_data[0][:5])

... Training Data Will be Acquired From: ./data/encoded/encoded_train_set.csv
... Validation Data Will be Acquired From: ./data/encoded/encoded_dev_set.csv
Successfully Load Data, Shape of the features: (562500, 403)
Successfully Load Data, Shape of the features: (187500, 403)
[[2361    5    0 ...    1    1    1]
 [ 870  368   21 ...    1    1    1]
 [  29  187    0 ...    1    0    1]
 [ 109    0    0 ...    1    1    1]
 [2141  124  884 ...    1    0    1]]
[0 0 0 0 0]
[[   3   15   14 ...    1    1    1]
 [   7  186  743 ...    1    0    1]
 [ 445  338  276 ...    1    1    1]
 [   5  676    0 ...    1    1    1]
 [ 261    2 4321 ...    1    0    1]]


In [39]:
# Provided train function
def train(model, train_data_feature, train_data_label, validation_data, epochs=10, model_dir=None, verbose=1, batch_size=128):
    """
    This is the training method that is called by the tensorflow training script
    """
    model.summary()
    
    model_path = os.path.join(model_dir, "model.hdf5")
    print(f"Model will be saved to {model_path} ...")
    
    ckpt = tf.keras.callbacks.ModelCheckpoint(
        model_path, monitor='val_loss', verbose=1, save_best_only=True, 
        mode='auto', save_freq='epoch', options=None
    )
    
    model.fit(
        x=train_data_feature, y=train_data_label, 
        validation_data=validation_data,
        epochs=epochs, verbose=verbose,
        shuffle=True, batch_size=batch_size,
        callbacks = [ckpt]
    )
        
EMBEDDING_DIM = 32
EMBEDDING_VOCAB_SIZE = 100000
TOTAL_FEATURE_DIM = 403
SEQUENCE_SIZE = 200
BATCH_SIZE = 256
EPOCHS = 3

model = SimpleNet(input_dim = TOTAL_FEATURE_DIM, 
                  embedding_vocab_size = EMBEDDING_VOCAB_SIZE,
                  embedding_dim = EMBEDDING_DIM,
                  sequence_size = SEQUENCE_SIZE)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])

model_dir = "./"
train(model, x_train, y_train, validation_data, epochs=3, model_dir=model_dir, verbose=1, batch_size = BATCH_SIZE)

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 403)]        0                                            
__________________________________________________________________________________________________
lambda_22 (Lambda)              (None, 400)          0           input_12[0][0]                   
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 400, 32)      3200000     lambda_22[0][0]                  
__________________________________________________________________________________________________
global_max_pooling1d_11 (Global (None, 32)           0           embedding_11[0][0]               
___________________________________________________________________________________________

In [40]:
model_path = "model.hdf5"
model_reload = tf.keras.models.load_model(model_path)
model_reload.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 403)]        0                                            
__________________________________________________________________________________________________
lambda_22 (Lambda)              (None, 400)          0           input_12[0][0]                   
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 400, 32)      3200000     lambda_22[0][0]                  
__________________________________________________________________________________________________
global_max_pooling1d_11 (Global (None, 32)           0           embedding_11[0][0]               
___________________________________________________________________________________________

## Sagemaker Model Training

In [3]:
import sagemaker
import boto3
import os

In [4]:
data_dir = "./data/encoded"
encoded_trainset_path = os.path.join(data_dir, "encoded_train_set.csv")
encoded_devset_path = os.path.join(data_dir, "encoded_dev_set.csv")
encoded_testset_path = os.path.join(data_dir, "encoded_test_set.csv")
print("Trainset:", encoded_trainset_path)
print("Devset:", encoded_devset_path)
print("Testset:", encoded_testset_path)

Trainset: ./data/encoded/encoded_train_set.csv
Devset: ./data/encoded/encoded_dev_set.csv
Testset: ./data/encoded/encoded_test_set.csv


### Upload Data to S3

In [5]:
# SageMaker session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
print(f"Using role: {role}")

# default S3 bucket
bucket = sagemaker_session.default_bucket()
print(f"Using bucket: {bucket}")

Using role: arn:aws:iam::259046265119:role/service-role/AmazonSageMaker-ExecutionRole-20200813T153989
Using bucket: sagemaker-us-east-1-259046265119


In [6]:
# upload to S3
tags = ["train", "validation", "test"]
upload_paths = [encoded_trainset_path, encoded_devset_path, encoded_testset_path]
input_data = {}

for tag, upload_path in zip(tags, upload_paths):
    prefix = f'commit_msg_data/{tag}'
    input_data[tag] = sagemaker_session.upload_data(path=upload_path, bucket=bucket, key_prefix=prefix)
    print(f"[{tag}]: {upload_path} --> {input_data[tag]}")

print('')
display(input_data)

[train]: ./data/encoded/encoded_train_set.csv --> s3://sagemaker-us-east-1-259046265119/commit_msg_data/train/encoded_train_set.csv
[validation]: ./data/encoded/encoded_dev_set.csv --> s3://sagemaker-us-east-1-259046265119/commit_msg_data/validation/encoded_dev_set.csv
[test]: ./data/encoded/encoded_test_set.csv --> s3://sagemaker-us-east-1-259046265119/commit_msg_data/test/encoded_test_set.csv



{'train': 's3://sagemaker-us-east-1-259046265119/commit_msg_data/train/encoded_train_set.csv',
 'validation': 's3://sagemaker-us-east-1-259046265119/commit_msg_data/validation/encoded_dev_set.csv',
 'test': 's3://sagemaker-us-east-1-259046265119/commit_msg_data/test/encoded_test_set.csv'}

In [7]:
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    if any([identifier in obj.key for identifier in ["csv"]]):
         print(obj.key)

commit_msg_data/test/encoded_test_set.csv
commit_msg_data/train/encoded_train_set.csv
commit_msg_data/validation/encoded_dev_set.csv


### Modeling: Create Tensorflow Estimator

In [10]:
from sagemaker.tensorflow import TensorFlow

# specify an output path
# prefix is specified above

output_prefix="commit_msg_model"
output_path = 's3://{}/{}'.format(bucket, output_prefix)
print(f"Model artifact will be output to: {output_path}")

# instantiate a pytorch estimator
estimator = TensorFlow(entry_point='train.py',
                       source_dir='codes', # this should be just "source" for your code
                       role=role,
                       framework_version='2.1',
                       train_instance_count=1,
                       train_instance_type='ml.m5.xlarge',
#                        output_path=output_path,
                       sagemaker_session=sagemaker_session,
                       py_version="py3",
                       hyperparameters={
                           'epochs': 5, # could change to higher
                           'batch_size': 128
                       })

train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Model artifact will be output to: s3://sagemaker-us-east-1-259046265119/commit_msg_model


In [11]:
estimator.fit(inputs = input_data)

2020-11-27 17:21:13 Starting - Starting the training job...
2020-11-27 17:21:15 Starting - Launching requested ML instances......
2020-11-27 17:22:31 Starting - Preparing the instances for training...
2020-11-27 17:23:08 Downloading - Downloading input data...
2020-11-27 17:23:36 Training - Downloading the training image..[34m2020-11-27 17:23:54,837 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-11-27 17:23:54,845 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m

2020-11-27 17:23:51 Training - Training image download completed. Training in progress.[34m2020-11-27 17:24:05,626 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-27 17:24:05,642 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-27 17:24:05,656 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)

UnexpectedStatusException: Error for Training job tensorflow-training-2020-11-27-17-21-12-971: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/bin/python3 train.py --batch_size 128 --epochs 5 --model_dir s3://sagemaker-us-east-1-259046265119/tensorflow-training-2020-11-27-17-21-12-971/model"
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/callbacks.py", line 1029, in _save_model
    self.model.save(filepath, overwrite=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/network.py", line 1008, in save
    signatures, options)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/saving/save.py", line 112, in save_model
    model, filepath, overwrite, include_optimizer)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/saving/hdf5_format.py", line 92, in save_model_to_hdf5
    f = h5py.File(filepath, mode='w')
  File "/usr/local/lib/python3.6/dist-packages/h5py/_hl/files.py", line 408, in __init__
    swmr=swmr)
  File "/usr/local/lib/python3.6/dist-pac