# Alexa TaskBot Dialog State Tracking

## User Intent detection

In [2]:
import os
import numpy as np
import transformers
import json

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

model_finetuned = './twiz-intent-model'
with open(os.path.join(model_finetuned + '/all_intents.json'), 'r') as all_intents_json:
    all_intents = json.load(all_intents_json) # contains the written out names of intents. also implicitly

tokenizer_name = 'roberta-base' # try 'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) # loads a tokenizer

model = AutoModelForSequenceClassification.from_pretrained(model_finetuned, 
                                                           num_labels=len(all_intents)) # Loads the BERT model weights


In [3]:
all_intents

['GetCuriositiesIntent',
 'GreetingIntent',
 'AMAZON.SelectIntent',
 'ShowStepsIntent',
 'IdentifyRestrictionsIntent',
 'ProvideUserNameIntent',
 'MoreOptionsIntent',
 'AMAZON.RepeatIntent',
 'AMAZON.HelpIntent',
 'QuestionIntent',
 'MoreDetailIntent',
 'AdjustServingsIntent',
 'GoToStepIntent',
 'SetTimerIntent',
 'OutOfScopeIntent',
 'AMAZON.FallbackIntent',
 'PreviousStepIntent',
 'TerminateCurrentTaskIntent',
 'ChitChatIntent',
 'CompleteTaskIntent',
 'NoneOfTheseIntent',
 'ShoppingIntent',
 'AMAZON.PauseIntent',
 'AMAZON.CancelIntent',
 'StartStepsIntent',
 'InappropriateIntent',
 'AMAZON.NoIntent',
 'SuggestionsIntent',
 'ResumeTaskIntent',
 'IngredientsConfirmationIntent',
 'NextStepIntent',
 'IdentifyProcessIntent',
 'NoRestrictionsIntent',
 'AMAZON.YesIntent',
 'SubstitutionIntent',
 'AMAZON.StopIntent']

In [5]:
agent_u = "I can help you finding delicious recipes. What kind of recipe would you like to search for?"
user_u = "Can you find me a chicken recipe?"

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'IdentifyProcessIntent'

In [4]:
agent_u = "I can help you finding delicious recipes. What kind of recipe would you like to search for?"
user_u = "Show me the suggestions"

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'SuggestionsIntent'

In [5]:
agent_u = "I can help you finding delicious recipes. What kind of recipe would you like to search for?"
user_u = "Show me the second recipe."

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'AMAZON.SelectIntent'

In [17]:
agent_u = "I can help you finding delicious recipes. What kind of recipe would you like to search for?"
user_u = "Find me fish recipes."

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'IdentifyProcessIntent'

In [8]:
agent_u = "Here is some information about Bacon and Tomato Pasta. It has a 4.8 star rating.  It is estimated to take about 35 minutes. It serves 4. Its difficulty level is Easy.  If this is not quite what you are looking for say, go back. Otherwise I can show you the ingredients or we can start the cooking."
user_u = "What are the recipe ingredients?"

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'IngredientsConfirmationIntent'

In [9]:
agent_u = "Here is some information about Bacon and Tomato Pasta. It has a 4.8 star rating.  It is estimated to take about 35 minutes. It serves 4. Its difficulty level is Easy.  If this is not quite what you are looking for say, go back. Otherwise I can show you the ingredients or we can start the cooking."
user_u = "This looks great. Let's start the recipe."

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'StartStepsIntent'

In [4]:
agent_u = "Let me know when you're ready to move on the next step."
user_u = "Go on"

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'NextStepIntent'

In [5]:
agent_u = "Let me know when you're ready to move on the next step."
user_u = "go back."

input_encoding = tokenizer.encode_plus(agent_u, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)

logits = outputs.logits
idx = logits.argmax(-1).item()
all_intents[idx]

'PreviousStepIntent'

## Zero-Shot Slot Filling as QA

In [6]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch as torch

model_name = "deepset/roberta-base-squad2"

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [12]:
context = "I'd like a salad with tomatos, lettuce and strawberries."
question = "What are the ingredients?"

input_encoding = tokenizer.encode_plus(prompt, user_u, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
outputs = model(**input_encoding)


In [13]:
# Get the most likely beginning of answer with the argmax of the score
answer_start_scores = outputs.start_logits
answer_start = torch.argmax(answer_start_scores)

# Get the most likely end of answer with the argmax of the score
answer_end_scores = outputs.end_logits
answer_end = torch.argmax(answer_end_scores) + 1

print(answer_start)
print(answer_end)


tensor(17)
tensor(23)


In [15]:
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_encoding.input_ids[0][answer_start:answer_end]))

print(answer)


 tomatos, lettuce and strawberries


In [18]:
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

qa_input = {
  'context': 'Yes. No. I''d like a salad with tomatos, lettuce and strawberries.',
  'question': 'What are the ingredients?'
}

answer = qa_pipeline(qa_input)

print(answer)

{'score': 0.3748808801174164, 'start': 30, 'end': 63, 'answer': 'tomatos, lettuce and strawberries'}


# Multimodal Conversations

In [10]:
import json as json

with open("recipes_data.json", "r") as read_file:
    data = json.load(read_file)

imgA = data['0']['images'][0]['url']
titleA = data['0']['displayName']
propA = "Serves " + str(data['0']['servings'])


In [11]:
from IPython.display import Video, Image, HTML, display

def displayResults(titleA, imgA, propA, titleB, imgB, propB, titleC, imgC, propC):
    display(HTML(f"""
    <div class ="row" style="margin-left:100px">
       <div class="col-xs-6">
        <div class ="images" style="display:inline-block;">
            <img src="{imgA}" class="img-responsive" width="80px">
        </div>
        <div class ="images" style="display:inline-block;">
                      {titleA} <br>
                      {propA} <br>
        </div>
        <div class ="images" style="display:inline-block;">
            <img src="{imgB}" class="img-responsive" width="80">
        </div>
        <div class ="images" style="display:inline-block;">
                      {titleB} <br>
                      {propB} <br>
        </div>
        <div class ="images" style="display:inline-block;">
            <img src="{imgC}" class="img-responsive" width="80">
        </div>
                      {titleC} <br>
                      {propC} <br>
        </div>
       </div>
    </div>
    """))

def displayStep(text, img):
    display(HTML(f"""
    <div class ="row" style="margin-left:100px">
        <img src="{img}" class="img-responsive" width="80px">
        {text}<br>
    </div>
        """))


In [12]:
# Turn 1
print(" BOT: Hello, I am a TaskBot and I can help you with cooking tasks. Which recipe would you like to do?")
print()
val = input("USER:")

# Turn 2
print()
print(" BOT: Great! These are the results I found:")
print()
displayResults(titleA, imgA, propA, titleA, imgA, propA, titleA, imgA, propA)    
print("      Which recipe would you like to do? Or, would you like to search for something different?")
print()
val = input("USER:")

# Turn 3
print()
print(" BOT: That looks delicious! Let's start!")
print()
displayStep(data['0']['instructions'][0]['stepText'], imgA)    

# Turn 4
print(" BOT: Say next when you're done.")
print()
val = input("USER:")
print()
displayStep(data['0']['instructions'][1]['stepText'], imgA)    

# Turn 5
print(" BOT: Say next when you're done.")
print()
val = input("USER:")


 BOT: Hello, I am a TaskBot and I can help you with cooking tasks. Which recipe would you like to do?



USER: Chicken recipes



 BOT: Great! These are the results I found:



      Which recipe would you like to do? Or, would you like to search for something different?



USER: I love these ones lets start the first



 BOT: That looks delicious! Let's start!



 BOT: Say next when you're done.



USER: next





 BOT: Say next when you're done.



USER: next
