# Word-level Conversational Model

## 1. Data set

We will use an open dialogue data set, released by Microsoft, that is based on booking a vacation - specifically, finding flights and a hotel. The original source of the data can be found here: https://datasets.maluuba.com/Frames/dl

In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
import re
import json
import nltk
import string
from string import punctuation

# Directory holding the dialogue data set (frames.json is read from here).
DATA_PATH = 'data/chatbot'
# Presumably the directory for word embeddings - not used in this part of the notebook; confirm.
EMBEDDING_PATH = 'embedding/chatbot'
# Presumably the directory for trained models - not used in this part of the notebook; confirm.
MODEL_PATH = 'model/chatbot'

In [2]:
import _pickle as cPickle

# reading file in pickle format
def readPickle(pickleFilename):
    """Load and return the object stored in a pickle file.

    Args:
        pickleFilename: Path of the pickle file to read.

    Returns:
        The object deserialized from the file.

    Raises:
        OSError: If the file cannot be opened.
        EOFError: If the file is empty.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    # The context manager closes the file even when load() raises.
    with open(pickleFilename, 'rb') as f:
        return cPickle.load(f)

def savePickle(dataToWrite, pickleFilename):
    """Serialize an object to a pickle file, overwriting any existing file.

    Args:
        dataToWrite: The object to pickle.
        pickleFilename: Path of the file to write.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager flushes and closes the file even when dump() raises.
    with open(pickleFilename, 'wb') as f:
        cPickle.dump(dataToWrite, f)

In [3]:
# JSON text is UTF-8 by specification; name the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open(os.path.join(DATA_PATH, 'frames.json'), encoding='utf-8') as json_file:
    data = json.load(json_file)

In [4]:
# Per-dialogue accumulators; each list gets one entry per dialogue.
user_ids, ratings = [], []
# Binary task labels: True when the task succeeded.
task_labels = []
questions, answers = [], []

In [5]:
# Walk every dialogue once, recording its user id, its survey labels and the
# text of each turn split by author.
for dialogue in data:
    user_ids.append(dialogue['user_id'])
    labels = dialogue['labels']
    ratings.append(labels['userSurveyRating'])
    task_labels.append(labels['wizardSurveyTaskSuccessful'])

    # Turns authored by 'user' are collected as questions; turns by any
    # other author are collected as answers.
    question = []
    answer = []
    for turn in dialogue['turns']:
        if turn['author'] == 'user':
            question.append(turn['text'])
        else:
            answer.append(turn['text'])
    questions.append(question)
    answers.append(answer)

In [6]:
# Distinct user ids seen across all dialogues.
unique_userid = set(user_ids)

new_id = np.arange(len(unique_userid))
# Lookup tables between the original user id and its compact index.
# Indices are assigned in sorted order of the original ids: iterating the set
# directly yields an arbitrary order that can change between runs (string
# hashing is randomized per process), which would make saved data
# irreproducible. Assumes the ids are mutually comparable (e.g. all strings).
ind_userid = {}
userid_ind = {}

for ind, userid in zip(new_id, sorted(unique_userid)):
    ind_userid[ind] = userid
    userid_ind[userid] = ind

# Re-express each dialogue's user id as its compact index.
new_userids = [userid_ind[userid] for userid in user_ids]

In [7]:
# One record per dialogue: (user index, rating, task label, questions, answers).
user_chats = list(
    zip(new_userids, ratings, task_labels, questions, answers)
)

In [None]:
#UNCOMMENT TO STORE FILE
#savePickle(user_chats, os.path.join(DATA_PATH, 'user_chats'))
#savePickle(questions, os.path.join(DATA_PATH, 'questions'))
#savePickle(answers, os.path.join(DATA_PATH, 'answers'))

In [8]:
user_chats[0]

(5,
 4.0,
 True,
 ["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
  'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?',
  "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help"],
 ['Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?',
  'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?',
  'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?'])

In [10]:
questions[0]

["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
 'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?',
 "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help"]

In [11]:
answers[0]

['Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?',
 'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?',
 'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?']

In [12]:
# Flatten the dialogues into (question, answer) pairs: the k-th question of a
# dialogue is paired with its k-th answer.
qa = []
for list_q, list_a in zip(questions, answers):
    # zip() stops at the shorter of the two lists, so any unmatched trailing
    # question or answer is dropped without explicit slicing.
    qa.extend(zip(list_q, list_a))

In [14]:
qa[:3]

[("I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?'),
 ('Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
  'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?'),
 ('I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?',
  'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?')]

In [None]:
#UNCOMMENT TO STORE FILE
#savePickle(qa, os.path.join(DATA_PATH, 'qa_pair'))