# Generative ChatBot using Amazon SageMaker Seq2Seq built-in algorithm

## Setup

Let's start by specifying:
- The S3 bucket and prefix that you want to use for training and model data. **This should be within the same region as the Notebook Instance, training, and hosting.**
- The IAM role arn used to give training and hosting access to your data. See the documentation for how to create these. Note, if more than one role is required for notebook instances, training, and/or hosting, please replace the boto regexp in the cell below with the appropriate full IAM role arn string(s).

In [2]:
# S3 bucket and prefix
# The bucket receives the uploaded training data and the training job output.
bucket = 'sagemaker-generative-chatbot-exp'
# The experiment name is also used to build the training job and endpoint names below.
experiment_name = 'GCB-BEST'
# All S3 keys written by this notebook live under this prefix.
prefix = experiment_name

In [2]:
import boto3
import re
from sagemaker import get_execution_role

# IAM role handed to the training job (RoleArn) and to the hosted model (ExecutionRoleArn) below
role = get_execution_role()

Install the `contractions` package, which expands words like "i'm" into "i am", "we're" into "we are", etc.

In [None]:
%%bash
# Install the `contractions` package that a later cell imports for text clean-up
pip install contractions

Next, we'll import the Python libraries we'll need for the remainder of the exercise.

In [None]:
from time import gmtime, strftime
from sentenceTokenizer import SentenceTokeniser
import contractions
import time
import numpy as np
import os
import json

# For plotting attention matrix later on
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

## Download dataset and preprocess

In [None]:
%%bash
# Recursively copy the raw corpus files from S3 into the current working directory
aws s3 cp s3://sagemaker-generative-chatbot-exp/raw-training-data/cornell-movie-corpus . --recursive     

Create conversation pairs from the raw movie dialog corpus.
1. We read all the chat lines into a dictionary
2. Stitch the chat line into conversation dialogs
3. Clean and expand the chat lines
4. Limit the vocabulary to the 15k most frequent words; dropping rarely used words makes it easier for the model to learn. Training pairs that contain OOV (Out of Vocabulary) words are removed as well
5. Generate list of conversation pairs
6. Clean and expand the sentences
7. Split into train/val files

In [None]:
# Sentences are truncated to this many words; the training job below uses the same
# value for max_seq_len_source / max_seq_len_target.
MAX_SENTENCE_LENGTH = 20    # 20 words max per sequence
# -1 disables the cap on the number of pairs.
MAX_TRAINING_SAMPLES = -1   # Take all the training samples with no OOV
# -1 disables vocabulary pruning.
MAX_VOCAB = 15000           # Cap vocab to 15k

# Fraction of the pairs (taken from the end of the list) written to the validation files
train_val_split = 0.1
# Field separator used when splitting lines of both corpus files
SPLIT_STRING = " +++$+++ "
# Corpus file read for the individual chat lines (line id -> text)
chat_lines_data_path = "movie_lines.txt"
# Corpus file read for the line-id sequences that make up each conversation
convo_data_path = "movie_conversations.txt"

# Splits a cleaned chat line into sentences (see extract_ngram_pair)
sentence_tokenizer = SentenceTokeniser()

def clean_text(text):
    """Normalise a raw chat line for training or inference.

    The text is lower-cased, html/xml tags and dashes/ellipses are removed,
    contractions are passed through ``contractions.fix``, the punctuation
    marks ``? ! . ,`` are split off with a leading space, a few dropped-letter
    spellings are expanded, and finally every character outside
    ``a-z 0-9 , . ? ! '`` and space is deleted.

    Args:
        text: The raw sentence or chat line.

    Returns:
        The cleaned, lower-case string.
    """
    text = text.lower()

    # Remove any html/xml tag
    text = re.sub(r'<.*?>', '', text)
    text = text.replace("...", " ")
    text = text.replace("..", " ")
    text = text.replace(" - ", '. ')
    text = text.replace("-", ' ')
    text = text.replace("  ", ' ')

    text = contractions.fix(text)

    # Separate punctuation marks from words to avoid unnecessary vocab entries
    text = text.replace("?", " ?")
    text = text.replace("!", " !")
    text = text.replace(".", " .")
    text = text.replace(",", " ,")

    text = re.sub(r"  +", " ", text)
    text = re.sub(r"didn'", "did not", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)

    # Replace tryin' with trying but leave man's (and any other n'<letter>) alone.
    # A lookahead is used so the character after the apostrophe is NOT consumed:
    # the previous pattern n'[^s] swallowed the following space ("tryin' to" ->
    # "tryingto") and never matched a trailing "tryin'" at the end of the text.
    text = re.sub(r"n'(?![a-z])", "ng", text)

    text = re.sub(r"[^a-z0-9,.?!' ]+", '', text)

    return text

def clean_final_sentences(sentences, max_length=None):
    """Trim trailing punctuation from each sentence and cap its word count.

    Trailing periods, commas and whitespace are stripped together. Stripping
    them one after another left the space that precedes a separated
    punctuation mark (e.g. ``"hello ."`` became ``"hello "``), which produced
    an empty-string token that counted toward the word limit and the vocab.
    Words are re-joined with single spaces.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Maximum number of words to keep per sentence. Defaults to
            the module-level ``MAX_SENTENCE_LENGTH`` when ``None``.

    Returns:
        A list with one cleaned string per input sentence, in the same order.
    """
    if max_length is None:
        max_length = MAX_SENTENCE_LENGTH

    cleaned_sentences = []
    for sentence in sentences:
        # split() without arguments also discards empty tokens from repeated spaces
        words = sentence.rstrip(" .,").split()
        cleaned_sentences.append(" ".join(words[:max_length]))

    return cleaned_sentences

def extract_ngram_pair(text_lines):
    """Turn the chat lines of one conversation into (question, answer) pairs.

    Every line is cleaned, split into sentences and post-processed; all
    resulting sentences are collected into one flat list. Each sentence is
    then paired with the sentence that directly follows it, so consecutive
    sentences from the same chat line are paired as well.

    Args:
        text_lines: The chat lines of a single conversation, in order.

    Returns:
        A list of ``(question, answer)`` tuples; empty when fewer than two
        sentences were produced.
    """
    sentences = []
    for chat_line in text_lines:
        tokenized = sentence_tokenizer.tokenizeSentence(clean_text(chat_line))
        sentences += clean_final_sentences(tokenized)

    # Pair each sentence with its successor
    return list(zip(sentences, sentences[1:]))

def collect_vocab(new_pairs, vocabs):
    """Add the word counts of the given pairs to ``vocabs`` in place.

    Args:
        new_pairs: Iterable of pairs; the first two items of each pair are
            sentences that get split on single spaces.
        vocabs: Dict mapping word -> occurrence count; updated in place.
    """
    for pair in new_pairs:
        for sentence in (pair[0], pair[1]):
            for token in sentence.split(" "):
                vocabs[token] = vocabs.get(token, 0) + 1

def write_to_file(combined_path, source_path, target_path, pairs):
    """Write the pairs to three text files, one line per pair.

    Args:
        combined_path: Receives ``"<question> <--> <answer>"`` lines.
        source_path: Receives the first item (question) of each pair.
        target_path: Receives the second item (answer) of each pair.
        pairs: Sequence of pairs; it is iterated once per output file.
    """
    with open(combined_path, "w") as combined_file:
        combined_file.writelines(f"{pair[0]} <--> {pair[1]}\n" for pair in pairs)

    with open(source_path, "w") as source_file:
        source_file.writelines(pair[0] + "\n" for pair in pairs)

    with open(target_path, "w") as target_file:
        target_file.writelines(pair[1] + "\n" for pair in pairs)

def sentence_in_vocab(sentence, vocabs):
    """Return True when every space-separated word of ``sentence`` is in ``vocabs``.

    Args:
        sentence: The sentence to check; it is split on single spaces.
        vocabs: Container supporting ``in`` lookups for words.
    """
    return all(token in vocabs for token in sentence.split(" "))

# Lookup table: line id (first field) -> utterance text (last field)
chat_lines = {}

with open(chat_lines_data_path, "r", errors='replace') as f:
    # Undecodable bytes are replaced instead of aborting the read
    for line in f:
        fields = line.split(SPLIT_STRING)
        chat_lines[fields[0]] = fields[-1]
        
# Each entry is the ordered list of utterance texts that make up one conversation
conversation_list = []

# errors='replace' matches how the chat-lines file is opened, so an undecodable
# byte cannot abort the read here either.
with open(convo_data_path, "r", errors='replace') as f:
    for line in f:
        tokens = line.split(SPLIT_STRING)
        if len(tokens) < 4:
            # Blank or malformed line: it has no line-id list to stitch together
            continue

        # The 4th field holds the line ids; keep only the id characters and commas
        convo_lines = re.sub(r"[^L0-9,]+", "", tokens[3]).split(",")

        # Resolve every line id to its utterance text
        conversation_list.append([chat_lines[convo_line] for convo_line in convo_lines])
        
pairs = []
vocabs = {}

# Generate the conversation pairs and count every word they contain
for conversation in conversation_list:
    conversation_pairs = extract_ngram_pair(conversation)
    collect_vocab(conversation_pairs, vocabs)
    pairs += conversation_pairs

# Keep only the most used vocabs (skipped entirely when MAX_VOCAB is -1)
if MAX_VOCAB != -1:
    # Most frequent first; the sort is stable, so ties keep their insertion order
    vocab_list = sorted(vocabs.items(), key=lambda entry: entry[1], reverse=True)[:MAX_VOCAB]

    vocabs = dict.fromkeys((entry[0] for entry in vocab_list), True)

    # Drop every pair that has a word outside the pruned vocab on either side
    used_pairs = [
        pair
        for pair in pairs
        if sentence_in_vocab(pair[0], vocabs) and sentence_in_vocab(pair[1], vocabs)
    ]

    print(f"Pairs len {len(pairs)} after vocab prunning {len(used_pairs)}")
    pairs = used_pairs

    print("Popular vocabs")
    for vocab in vocab_list[:10]:
        print(f"{vocab[0]} : Count={vocab[1]}\n")

# Optionally cap the number of pairs (-1 keeps them all)
if MAX_TRAINING_SAMPLES != -1:
    pairs = pairs[:MAX_TRAINING_SAMPLES]

# Dump the first 10 as a quick sanity check
for pair in pairs[:10]:
    print(f"Q: {pair[0]}\nA: {pair[1]}\n")

print(f"Vocab Len {len(vocabs)} Pairs Len {len(pairs)}")

# Number of pairs reserved for validation (taken from the end of the list)
num_val = int(len(pairs) * train_val_split)
# Slice with an explicit index: pairs[-num_val:] / pairs[:-num_val] invert when
# num_val is 0 (pairs[-0:] is the whole list and pairs[:-0] is empty), which would
# send every pair to validation and leave the training files empty.
split_index = len(pairs) - num_val

train_combined_path = "train-combined.txt"
val_combined_path = "val-combined.txt"

train_source_path = "train-source.txt"
val_source_path = "val-source.txt"
train_target_path = "train-target.txt"
val_target_path = "val-target.txt"

write_to_file(val_combined_path, val_source_path, val_target_path, pairs[split_index:])
write_to_file(train_combined_path, train_source_path, train_target_path, pairs[:split_index])

Now, let's use the preprocessing script `create_vocab_proto.py` (provided with this notebook) to create vocabulary mappings (strings to integers) and convert these files to x-recordio-protobuf as required for training by SageMaker Seq2Seq.

In [None]:
%%time
%%bash
# Feed the four text files written by the preprocessing cell to the conversion script
python3 create_vocab_proto.py \
        --train-source train-source.txt \
        --train-target train-target.txt \
        --val-source val-source.txt \
        --val-target val-target.txt

The script will output 4 files, namely:
- train.rec : Contains source and target sentences for training in protobuf format
- val.rec : Contains source and target sentences for validation in protobuf format
- vocab.src.json : Vocabulary mapping (string to int) for source sentences
- vocab.trg.json : Vocabulary mapping (string to int) for target sentences

Let's upload the pre-processed dataset and vocabularies to S3

In [None]:
def upload_to_s3(bucket, prefix, channel, file):
    """Upload a local file to ``<prefix>/<channel>/<file>`` in the given bucket.

    Args:
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix under which the object is stored.
        channel: Sub-folder name placed between the prefix and the file name.
        file: Path of the local file; it is also used as the last key segment.
    """
    s3 = boto3.resource('s3')
    key = prefix + "/" + channel + '/' + file
    # Open inside a context manager so the handle is closed even if the upload
    # raises; previously the file object was never closed.
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)

# Upload each generated artifact into its channel folder, in the original order
for channel, artifact in (
    ('train', 'train.rec'),
    ('validation', 'val.rec'),
    ('vocab', 'vocab.src.json'),
    ('vocab', 'vocab.trg.json'),
):
    upload_to_s3(bucket, prefix, channel, artifact)

In [None]:
# Region of the default boto3 session; used below to look up the Seq2Seq container image
region_name = boto3.Session().region_name

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Container image of the built-in seq2seq algorithm for the current region
# NOTE(review): newer SageMaker SDK releases may prefer a different lookup API - confirm
container = get_image_uri(region_name, 'seq2seq')

print(f'Using SageMaker Seq2Seq container: {container} ({region_name})')

## Training the Generation model

In [None]:
# The job name is the experiment name followed by a minute-resolution GMT timestamp
job_name = experiment_name + strftime("%Y-%m-%d-%H-%M", gmtime())
print("Training job", job_name)

# One fully replicated S3Prefix channel per uploaded folder: train, vocab, validation
input_channels = [
    {
        "ChannelName": channel_name,
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": "s3://{}/{}/{}/".format(bucket, prefix, channel_name),
                "S3DataDistributionType": "FullyReplicated"
            }
        },
    }
    for channel_name in ("train", "vocab", "validation")
]

# Please refer to the documentation for complete list of parameters
hyperparameters = {
    # Sequence lengths match the 20-word cap applied during preprocessing
    "max_seq_len_source": "20",
    "max_seq_len_target": "20",
    "optimized_metric": "bleu",
    "bleu_sample_size": "1000",
    "batch_size": "512",
    "checkpoint_frequency_num_batches": "1000",
    "rnn_num_hidden": "2048",
    "num_layers_encoder": "1",
    "num_layers_decoder": "1",
    "num_embed_source": "512",
    "num_embed_target": "512",
    # The original cell kept an alternative value of "2100" commented out
    "max_num_batches": "40100",
    "checkpoint_threshold": "3"
}

create_training_params = {
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "ResourceConfig": {
        # Seq2Seq does not support multiple machines. Currently, it only supports single machine, multiple GPUs
        "InstanceCount": 1,
        # Alternatives suggested by the original notebook: "ml.p2.16xlarge", "ml.p2.8xlarge", "ml.p2.xlarge"
        "InstanceType": "ml.p3.8xlarge",
        "VolumeSizeInGB": 50
    },
    "TrainingJobName": job_name,
    "HyperParameters": hyperparameters,
    "StoppingCondition": {
        # Run for maximum 5 hours first
        "MaxRuntimeInSeconds": 5 * 3600
    },
    "InputDataConfig": input_channels
}

sagemaker_client = boto3.Session().client(service_name='sagemaker')
sagemaker_client.create_training_job(**create_training_params)

# Report the status right after submission
status = sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)

In [None]:
# Describe the job once so that the status and the failure reason come from the
# same response (the second describe call was redundant and could see a newer state)
job_description = sagemaker_client.describe_training_job(TrainingJobName=job_name)
status = job_description['TrainingJobStatus']
print(status)
# if the job failed, determine why
if status == 'Failed':
    message = job_description['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

> Now wait for the training job to complete and proceed to the next step after you see model artifacts in your S3 bucket.

You can jump to [Use a pretrained model](#Use-a-pretrained-model) as training might take some time.

## Inference

A trained model does nothing on its own. We now want to use the model to perform inference.
This section involves several steps,
- Create model - Create a model using the artifact (model.tar.gz) produced by training
- Create Endpoint Configuration - Create a configuration defining an endpoint, using the above model
- Create Endpoint - Use the configuration to create an inference endpoint.
- Perform Inference - Perform inference on some input data using the endpoint.

### Create model
We now create a SageMaker Model from the training output. Using the model, we can then create an Endpoint Configuration.

In [None]:
# When False, the model-creation cell below takes the model name and artifact location
# from the training job started in this notebook.
use_pretrained_model = False

### Use a pretrained model
#### Please uncomment and run the cell below if you want to use a pretrained model, as training might take several hours/days to complete.

In [None]:
#use_pretrained_model = True
#model_name = "your-model-name-here"
#!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/model.tar.gz > model.tar.gz
#!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/vocab.src.json > vocab.src.json
#!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/vocab.trg.json > vocab.trg.json
#upload_to_s3(bucket, prefix, 'pretrained_model', 'model.tar.gz')
#model_data = "s3://{}/{}/pretrained_model/model.tar.gz".format(bucket, prefix)

In [None]:
%%time

sage = boto3.client('sagemaker')

if not use_pretrained_model:
    info = sage.describe_training_job(TrainingJobName=job_name)
    model_name=job_name
    model_data = info['ModelArtifacts']['S3ModelArtifacts']

print(model_name)
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = sage.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

### Create endpoint configuration
Use the model to create an endpoint configuration. The endpoint configuration also contains information about the type and number of EC2 instances to use when hosting the model.

Since SageMaker Seq2Seq is based on Neural Nets, we could use an ml.p2.xlarge (GPU) instance, but for this example we will use a free tier eligible ml.m4.xlarge.

In [None]:
from time import gmtime, strftime

# A second-resolution GMT timestamp gives each run its own endpoint config name
endpoint_config_name = f"GCB-EndpointConfig-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(endpoint_config_name)

# A single variant on one ml.m4.xlarge instance, backed by the model created above
production_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 1,
    'ModelName': model_name,
    'VariantName': 'AllTraffic',
}

create_endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")

### Create endpoint
Lastly, we create the endpoint that serves up the model, by specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 10-15 minutes to complete.

In [None]:
%%time
import time

# The endpoint name is the experiment name plus a second-resolution GMT timestamp
endpoint_name = f"{experiment_name}-Endpoint-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(endpoint_name)

create_endpoint_response = sage.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
print(create_endpoint_response['EndpointArn'])

# Report the status right after the create call
initial_description = sage.describe_endpoint(EndpointName=endpoint_name)
status = initial_description['EndpointStatus']
print(f"Status: {status}")

# wait until the status has changed
in_service_waiter = sage.get_waiter('endpoint_in_service')
in_service_waiter.wait(EndpointName=endpoint_name)

# print the status of the endpoint
endpoint_response = sage.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print(f'Endpoint creation ended with EndpointStatus = {status}')

if status != 'InService':
    raise Exception('Endpoint creation failed.')

If you see the message,
> Endpoint creation ended with EndpointStatus = InService

then congratulations! You now have a functioning inference endpoint. You can confirm the endpoint configuration and status by navigating to the "Endpoints" tab in the AWS SageMaker console.  

We will finally create a runtime object from which we can invoke the endpoint.

In [None]:
# Client used by the inference cell below to invoke the endpoint
runtime = boto3.client('runtime.sagemaker')

# Perform Inference

### Using JSON format for inference (Suggested for a single or small number of data instances)

In [None]:
# Sample prompts sent to the endpoint in one request
sentences = [
    "how are you?",
    "hello",
    "what is your name?",
    "where do you live?",
    "i am very sad right now",
    "what is your favourite song?",
    "i do not want to talk to you anymore",

    "can you swim?",
    "where do you come from?",
    "do you know albert einstein?",
    "you suck bad!",
    "i am going to kill you",
    "do you want to go out with me?",
    "let us talk about something fun",

    "what is the purpose of life?",
    "what time it is now?",
    "do you have a hobby?",
    "who is your best friend?",
    "i do not know what you are talking about",
    "are you alive?",
    "do you join a hackathon team?",

    "what is your favourite sport?",
    "you are nasty!",
    "here you go, you did it again",
    "do you know who i am?",
    "do you know what i am capable of?",
    "whats your sisters number?",
    "which team are you on?",

    "huh?",
    "why is that?",
    "why did you kill friedman?",
    "super awesome",
    "you do not listen to me",
    "unbelievable!",
    "where did you go last night?",

    "how old are you?",
    "why do we dream?",
    "can we go out today?",
    "can you please shut up!",
    "i am sick of you",
    "i am super hungry right now",
    "what did you say again?",
]

# Apply the same cleaning that was used on the training text to every prompt
payload = {"instances": [{"data": clean_text(sent)} for sent in sentences]}

raw_response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(payload),
)

# The body is read as UTF-8 text and parsed as JSON
response = json.loads(raw_response["Body"].read().decode("utf-8"))

# Predictions are matched to the prompts by position
for i, pred in enumerate(response['predictions']):
    print(f"Human: {sentences[i]}\nJarvis: {pred['target']}\n")

# Stop / Close the Endpoint (Optional)

Finally, we should delete the endpoint before we close the notebook.

In [None]:
# Delete the endpoint created above; only the endpoint itself is removed by this call
sage.delete_endpoint(EndpointName=endpoint_name)