In [3]:
import sagemaker

# A first session is created only so the default bucket can be looked up
# when no explicit bucket name has been chosen.
sess = sagemaker.Session()

# Set this to a bucket name to override the session default.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role for the jobs, then the session rebuilt on the chosen bucket.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [4]:
# Region of the boto session backing the SageMaker session.
region = sess.boto_session.region_name

# S3 bucket that holds the labelled NER dataset.
BUCKET_NAME = "nerdatabucket"
BUCKET_URI = "s3://nerdatabucket"

# Full S3 URI of the dataset file inside the bucket.
DATASET_PATH = BUCKET_URI + "/amazon_product_tag_dataset.json"

In [29]:
%%writefile scripts/train.py

import argparse
import json
import os

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

def create_tokenize_data(tag_dataset_json):
    """Convert the entity-annotated dataset into word-level BIO rows.

    Args:
        tag_dataset_json: Mapping of raw text -> annotation dict. Each
            annotation dict carries an ``'entities'`` list of
            ``(start, end, tag)`` character spans into that text.

    Returns:
        pandas.DataFrame with one row per text and the columns ``tokens``
        (whitespace-split words), ``ner_tags`` (one BIO label per token)
        and ``text`` (the original string).
    """
    # Collect rows in a list and build the frame once; concatenating inside
    # the loop copies the whole frame on every iteration.
    records = []
    for text, annotation in tag_dataset_json.items():
        tokens = text.split()
        labels = ['O'] * len(tokens)
        # The tag of each span is ignored: every entity is labelled as APP.
        for start, end, _tag in annotation['entities']:
            # Word index of the entity = number of spaces before its first
            # character.
            # NOTE(review): this assumes words are separated by single
            # spaces; repeated spaces would shift the index - confirm the
            # dataset is normalised.
            first_word = text[:start].count(" ")
            labels[first_word] = "B-APP"
            # Each space inside the span starts one more word of the entity.
            # NOTE(review): `end + 1` treats `end` as an inclusive offset -
            # confirm against the annotation format.
            inner_spaces = text[start:end + 1].count(" ")
            for offset in range(1, inner_spaces + 1):
                labels[first_word + offset] = 'I-APP'

        records.append({'tokens': tokens, 'ner_tags': labels, 'text': text})

    return pd.DataFrame(records, columns=['tokens', 'ner_tags', 'text'])

def tokenize_adjust_labels(all_samples_per_split, tokenizer):
    """Tokenize one sample into sub-words and align its BIO labels to them.

    Args:
        all_samples_per_split: One row of the BIO frame; must provide
            ``"tokens"`` (list of words) and ``"ner_tags"`` (one BIO label
            per word).
        tokenizer: Tokenizer callable whose result is dict-like and exposes
            ``word_ids()``.

    Returns:
        pandas.Series holding the tokenizer outputs plus a ``labels`` entry
        with exactly one label id per produced token.
    """
    label_names = {'O': 0, 'B-APP': 1, 'I-APP': 2}
    word_label_ids = [label_names[tag] for tag in all_samples_per_split["ner_tags"]]

    # Tokenize the pre-split words so that word_ids() indexes straight into
    # the word-level label list.
    tokenized_samples = tokenizer(
        list(all_samples_per_split["tokens"]),
        is_split_into_words=True,
        truncation=True,
    )

    adjusted_label_ids = []
    previous_word_id = None
    for word_id in tokenized_samples.word_ids():
        if word_id is None or word_id == previous_word_id:
            # Special tokens and continuation sub-words get -100, the value
            # the metric code filters out as "ignored".
            adjusted_label_ids.append(-100)
        else:
            # First sub-word of a word carries that word's label.
            adjusted_label_ids.append(word_label_ids[word_id])
        previous_word_id = word_id

    tokenized_samples['labels'] = adjusted_label_ids

    return pd.Series(dict(tokenized_samples))

def compute_metrics(p):
    """Turn raw token-classification predictions into a flat metrics dict.

    Relies on the module-level ``metric`` object (created in the
    ``__main__`` section) to do the scoring.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token
            class scores with the class axis last (argmax is taken over
            axis 2) and ``labels`` holds label ids, with -100 marking
            positions to ignore.

    Returns:
        dict with ``overall_precision``, ``overall_recall``, ``overall_f1``
        and ``overall_accuracy``, plus ``<key>_f1`` for every other key the
        metric reports.
    """
    id_to_label = {
        0: 'O', 1: 'B-APP', 2: 'I-APP'
    }
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label -100) from both sides before scoring.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([id_to_label[pred] for pred, _ in kept])
        true_labels.append([id_to_label[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    # Every remaining key is expected to map to a dict with an "f1" entry;
    # surface that score under "<key>_f1".
    for key, value in results.items():
        if key not in flattened_results:
            flattened_results[key + "_f1"] = value["f1"]

    return flattened_results


if __name__ == "__main__":
    # Training entry point: parse hyperparameters, build the BIO dataset,
    # fine-tune a token-classification model, evaluate it and save it.
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=16)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_name", type=str, default="bert-base-uncased")
    parser.add_argument("--learning_rate", type=float, default=5e-5)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data_file", type=str, required=True)
    args, _ = parser.parse_known_args()

    # Only files written under SM_MODEL_DIR are packaged as the model
    # artifact; an S3 URI is not a writable local directory.
    if args.model_dir.startswith("s3://"):
        print("ignoring S3 --model_dir, writing to " + os.environ["SM_MODEL_DIR"])
        args.model_dir = os.environ["SM_MODEL_DIR"]

    # S3 URIs cannot be opened from inside the container. When one is given,
    # read the file of the same name from the "train" input channel, which
    # SageMaker downloads to SM_CHANNEL_TRAIN.
    data_path = args.data_file
    if data_path.startswith("s3://"):
        channel_dir = os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")
        data_path = os.path.join(channel_dir, os.path.basename(data_path))

    # load json data
    with open(data_path, encoding="utf-8") as data_fh:
        tag_dataset = json.load(data_fh)

    # create tokenized data - converts JSON format to BIO format for NER
    tokenized_data = create_tokenize_data(tag_dataset)

    # Tokenizer and model must come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenized_data = tokenized_data.apply(lambda row: tokenize_adjust_labels(row, tokenizer), axis=1)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    print("building training and testing datasets")

    # split dataset
    train_df, test_df = train_test_split(tokenized_data, test_size=0.3, random_state=42)

    # load dataset from pandas to HF
    train_data = Dataset.from_pandas(train_df, preserve_index=False)
    test_data = Dataset.from_pandas(test_df, preserve_index=False)

    # metric to monitor; module-level on purpose, compute_metrics reads it.
    metric = evaluate.load("seqeval")

    # Same id <-> tag mapping the label-alignment and metric code use, so
    # the saved model reports real tag names instead of LABEL_n.
    id2label = {0: "O", 1: "B-APP", 2: "I-APP"}
    label2id = {tag: idx for idx, tag in id2label.items()}

    # initialize base model
    model = AutoModelForTokenClassification.from_pretrained(args.model_name, id2label=id2label, label2id=label2id)

    # set training arguments
    training_args = TrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy="steps",
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=args.learning_rate,
        remove_unused_columns=False
    )

    # initialize trainer job
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_data)

    # writes eval result to a file which can be accessed later in the S3 output
    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
        print("***** Eval results *****")
        for key, value in sorted(eval_result.items()):
            writer.write(f"{key} = {value}\n")

    # Save the model into the model directory so it ends up in the training
    # job's model artifact. A separate joblib dump is not needed.
    trainer.save_model(args.model_dir)
    print("model persisted at " + args.model_dir)

Overwriting scripts/train.py


In [30]:
from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
# `model_dir` is deliberately not set: the training script defaults it to
# SM_MODEL_DIR, the local directory SageMaker packages into model.tar.gz.
# Passing an S3 URI here would make the script write somewhere else.
hyperparameters = {
    'epochs': 3,
    'train_batch_size': 16,
    'eval_batch_size': 16,
    'learning_rate': 5e-5,
    'model_name': 'bert-base-uncased',
    'data_file': DATASET_PATH,
}

In [33]:
# Training job definition for scripts/train.py on a single GPU instance.
huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='./scripts',
                            instance_type='ml.p3.2xlarge',
                            instance_count=1,
                            # Managed spot training needs a wait budget;
                            # max_wait must be >= max_run (both in seconds).
                            use_spot_instances=True,
                            max_run=3600,
                            max_wait=7200,
                            # Store the job's model artifact in the project bucket.
                            output_path=BUCKET_URI,
                            role=role,
                            transformers_version='4.26.0',
                            pytorch_version='1.13.1',
                            py_version='py39',
                            hyperparameters = hyperparameters)

In [None]:
# Pass the dataset as the "train" input channel so SageMaker downloads it
# into the training container before the script starts.
huggingface_estimator.fit({'train': DATASET_PATH})

In [None]:
from sagemaker.huggingface import HuggingFaceModel

# create Hugging Face Model Class
# Use the artifact URI reported by the finished training job instead of a
# hand-built path: the job writes model.tar.gz under a job-specific prefix.
huggingface_model = HuggingFaceModel(
   model_data=huggingface_estimator.model_data,
   role=role,
   transformers_version="4.26.0",
   pytorch_version="1.13.1",  # same framework version as the training job
   py_version="py39",
)

In [None]:
# Deploy the model to a single-instance endpoint and keep its predictor.
predictor = huggingface_model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")