In [1]:
#-*- coding: utf-8 -*-
import bert
from bert import tokenization
import os
import codecs
import pandas as pd

import copy

In [2]:
# Sequence-length setting; not referenced by the other cells visible in this notebook.
MAX_SEQ_LENGTH = 128

# Every path below is rooted at the current user's home directory.
ROOT_DIR = os.path.expanduser('~')
DATA_DIR = '{}/woz_data'.format(ROOT_DIR)
BERT_MODEL_DIR = '{}/bert/model/multi_cased_L-12_H-768_A-12'.format(ROOT_DIR)

# Vocabulary file for the cased tokenizer (alternative noted by the author: vocab_10000.txt).
vocab_file = '{}/vocab_20000.txt'.format(BERT_MODEL_DIR)
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=False)

# CSV of per-sentence embeddings read in the next section.
sentence_embedding_file = (
    '{}/origin_neural-belief-tracker/word-vectors/bert_user_sentences_embedding.csv'.format(ROOT_DIR)
)

## BERT - Sentence Embedding

In [3]:
# Load the embedding table. The CSV's unnamed first column ('Unnamed: 0')
# becomes the index, which is then renamed to 'sentence'.
sentence_embeddings = (
    pd.read_csv(sentence_embedding_file)
    .set_index('Unnamed: 0')
    .rename_axis('sentence')
)
sentence_embeddings.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Are there any eritrean restaurants in town?,0.445517,-0.543198,-0.425,-0.486601,0.403626,-0.339012,-0.448305,0.84282,-0.064032,-0.150931,...,-0.182311,-0.289471,-0.025483,-0.037476,0.343326,-0.109394,-0.335828,-0.649123,0.120226,-0.425649
How about Chinese food?,0.295119,-0.582565,-0.444374,0.019054,0.1645,-0.628176,-0.567032,0.582216,-0.217666,-0.001501,...,-0.339613,-0.049944,0.080028,0.327487,0.049683,0.451287,-0.077035,-0.31637,-0.206729,-0.204711
I would like the East part of town.,0.012217,-0.63441,0.239384,-0.248139,-0.14251,-0.245911,-0.660891,0.691453,-0.053906,-0.439329,...,0.031305,-0.198137,0.228118,0.178347,0.111536,0.254294,-0.490635,-0.356648,0.2472,-0.090004
"Could I get the address, phone number, and postcode of Yu Garden?",0.263734,-0.028428,0.647046,-0.178054,0.188165,0.008362,-0.064064,0.242883,0.285656,0.148032,...,-0.005035,-0.095302,0.256719,-0.050806,-0.291455,-0.03276,-0.493539,-0.681756,0.062754,0.340716
Thank you. That is all the information I needed. Bye bye!,-0.158158,-0.503958,0.752504,0.130475,-0.192017,-0.129413,0.020714,0.96822,0.069989,-0.105455,...,-0.049231,-0.55691,0.16161,-0.50421,0.009984,0.350107,-0.448609,0.154082,0.056703,0.027637


In [4]:
from sklearn.manifold import TSNE
import pickle

# Set to True to reuse the projection cached by an earlier run instead of recomputing it.
USE_PREMADE_TSNE = False

tsne_filepath = DATA_DIR + "/tsne_en_user.pkl"

if USE_PREMADE_TSNE:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that was produced by this notebook.
    with open(tsne_filepath, 'rb') as cache_file:
        tsne_points = pickle.load(cache_file)
else:
    # Fit t-SNE on the raw embedding matrix (fixed seed) and cache the result.
    tsne = TSNE(random_state=0)
    tsne_points = tsne.fit_transform(sentence_embeddings.values)
    with open(tsne_filepath, 'wb') as cache_file:
        pickle.dump(tsne_points, cache_file)

# One row per sentence, indexed like the embedding table.
tsne_bert = pd.DataFrame(
    tsne_points,
    index=sentence_embeddings.index,
    columns=['x_coord', 'y_coord'],
)

In [5]:
import json
import codecs

training_file = DATA_DIR +'/woz_train_ko_en.json'

# Read the dialogues inside a context manager so the file handle is closed.
# The parsed data is bound to `dialogues` rather than `list`, so the builtin
# `list` is no longer shadowed for the rest of the kernel session.
with codecs.open(training_file, 'r', 'utf-8', 'ignore') as training_fp:
    dialogues = json.load(training_fp)


def _turn_label_to_state(turn_label):
    """Serialise one turn's label into a comma-terminated 'action:slot,' string.

    Each entry whose first element is 'request' becomes 'request:<entry[1]>';
    every other entry becomes 'inform:<entry[0]>'. The pieces are concatenated
    in order, each followed by ','. An empty (falsy) label yields ''.
    """
    pieces = []
    for entry in turn_label or []:
        if entry[0] == 'request':
            pieces.append('request:' + entry[1])
        else:
            pieces.append('inform:' + entry[0])
    return ''.join(piece + ',' for piece in pieces)


# One state string per dialogue turn, in file order.
state_list = [
    _turn_label_to_state(turn['turn_label'])
    for dialogue in dialogues
    for turn in dialogue['dialogue']
]

# NOTE(review): this assumes the rows of tsne_bert are in the same order as the
# turns in the training file -- TODO confirm. The slice silently drops any
# states beyond len(tsne_bert).
tsne_bert['state'] = state_list[:len(tsne_bert)]
tsne_bert.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Are there any eritrean restaurants in town?,46.255379,-11.451684,"inform:food,"
How about Chinese food?,0.675646,10.311291,"inform:food,"
I would like the East part of town.,32.232765,6.058905,"inform:area,"
"Could I get the address, phone number, and postcode of Yu Garden?",4.746547,-38.593884,"request:postcode,request:phone,request:address,"
Thank you. That is all the information I needed. Bye bye!,-49.00201,-1.2656,


In [6]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Configure Bokeh to display subsequent show() output inline in the notebook.
output_notebook()

## Visualization Level 1

Sentence를 intent별(request/inform/joint/etc)로 구분한다.

In [7]:
# Rows whose state string contains 'inform' and does not contain 'request'.
state_col = tsne_bert['state']
inform_only = state_col.str.contains('inform') & ~state_col.str.contains('request')
inform_tsne = tsne_bert[inform_only]
inform_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Are there any eritrean restaurants in town?,46.255379,-11.451684,"inform:food,"
How about Chinese food?,0.675646,10.311291,"inform:food,"
I would like the East part of town.,32.232765,6.058905,"inform:area,"
"Hi, I'm looking for a nice German restaurant.",22.951103,41.992664,"inform:food,"
How about European yeah?,-5.65695,5.34225,"inform:food,"


In [8]:
# Rows whose state string contains 'request' and does not contain 'inform'.
state_col = tsne_bert['state']
request_only = state_col.str.contains('request') & ~state_col.str.contains('inform')
request_tsne = tsne_bert[request_only]
request_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Could I get the address, phone number, and postcode of Yu Garden?",4.746547,-38.593884,"request:postcode,request:phone,request:address,"
Great! May I have the phone number please?,-14.019636,-36.717781,"request:phone,"
can you please give me the address?,-9.162428,-41.409996,"request:address,"
What is the area?,35.105995,-46.336845,"request:area,"
Could you provide me with price range of Cotto? I also need their address and phone. Thank you.,-1.893271,-15.511796,"request:phone,request:address,request:price ra..."


In [9]:
# Rows whose state string contains both 'inform' and 'request'.
state_col = tsne_bert['state']
has_both = state_col.str.contains('inform') & state_col.str.contains('request')
joint_tsne = tsne_bert[has_both]
joint_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Can I have the address, phone number, and area for the most expensive one?",11.930347,-24.200754,"inform:price range,request:phone,request:area,..."
I'd like to find some Mediterranean food and need to know their address and phone number.,8.536683,-16.035639,"inform:food,request:phone,request:address,"
"Yes, please give me the one Turkish food restaurant's phone number and address please.",7.502439,-22.591951,"request:phone,request:address,inform:food,"
I don't care about the price range. Just recommend me a restaurant that serves European food. I would also like to get the phone number and directions.,13.331562,-2.218137,"inform:food,inform:price range,request:address..."
"Hi, could you tell me the phone number and address of the nearest Mexican restaurant?",9.291094,-27.68865,"inform:food,request:phone,request:address,"


In [10]:
# Rows whose state string contains neither 'request' nor 'inform'.
state_col = tsne_bert['state']
has_either = state_col.str.contains('request') | state_col.str.contains('inform')
etc_tsne = tsne_bert[~has_either]
etc_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thank you. That is all the information I needed. Bye bye!,-49.00201,-1.2656,
Thank you goodbye,-60.444393,34.956223,
Thank you!,-62.531506,1.757522,
thank you.,-68.350441,-13.813757,
"Thanks, good night!",-54.535778,14.131837,


In [11]:
from bokeh.palettes import Pastel1

# Build the Level-1 figure: one scatter layer per intent group.
tsne_plot = figure(
    title='t-SNE BERT Sentence Embeddings - Level1: intent',
    plot_width=900,
    plot_height=600,
)

colors = Pastel1[4]
# The hover tooltip shows the 'sentence' field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Strip axes, grid and outline so only the points remain.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# (legend label, frame) pairs, drawn in this order and coloured by position.
intent_groups = [
    ('inform', inform_tsne),
    ('request', request_tsne),
    ('joint', joint_tsne),
    ('etc', etc_tsne),
]
for (name, frame), color in zip(intent_groups, colors):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(frame),
        size=10, hover_line_color='black',
        legend=name, color=color,
    )

tsne_plot.legend.location = "top_left"
# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);

## Visualization Level 2

Sentence를 goal별(food/price/area/store/joint/etc)로 구분한다.  
store의 세부 goal은 phone, address, postcode, name이다.

In [12]:
# Rows whose whole state string is a single food slot (inform or request).
pattern = r'^((inform|request):food,){1}$'
is_food_only = tsne_bert['state'].str.match(pattern)
food_tsne = tsne_bert[is_food_only]
food_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Are there any eritrean restaurants in town?,46.255379,-11.451684,"inform:food,"
How about Chinese food?,0.675646,10.311291,"inform:food,"
"Hi, I'm looking for a nice German restaurant.",22.951103,41.992664,"inform:food,"
How about European yeah?,-5.65695,5.34225,"inform:food,"
Turkish food.,-10.910595,16.575588,"inform:food,"


In [13]:
# Rows whose whole state string is a single 'price range' slot (inform or request).
pattern = r'^((inform|request):price range,){1}$'
is_price_only = tsne_bert['state'].str.match(pattern)
price_tsne = tsne_bert[is_price_only]
price_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I am looking for a moderately priced restaurant.,28.188881,34.521042,"inform:price range,"
"Moderate, please.",-14.094714,9.021927,"inform:price range,"
I do not have a preference.,-4.01459,38.302021,"inform:price range,"
"hello, i'm looking for a restaurant with cheap prices",24.854366,40.694176,"inform:price range,"
What is the price range?,38.867592,-34.992409,"request:price range,"


In [14]:
# Rows whose whole state string is a single area slot (inform or request).
pattern = r'^((inform|request):area,){1}$'
is_area_only = tsne_bert['state'].str.match(pattern)
area_tsne = tsne_bert[is_area_only]
area_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I would like the East part of town.,32.232765,6.058905,"inform:area,"
I don't care.,-7.159046,46.985428,"inform:area,"
I don't care about the area. Anywhere.,0.137633,39.66991,"inform:area,"
What is the area?,35.105995,-46.336845,"request:area,"
anywhere will do.,-4.048484,29.51589,"inform:area,"


In [15]:
# Rows that mention any store-detail slot and contain no 'inform' entry.
pattern = r'phone|address|name|postcode'
state_col = tsne_bert['state']
mentions_store = state_col.str.contains(pattern)
has_inform = state_col.str.contains('inform')
store_tsne = tsne_bert[mentions_store & ~has_inform]
store_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Could I get the address, phone number, and postcode of Yu Garden?",4.746547,-38.593884,"request:postcode,request:phone,request:address,"
Great! May I have the phone number please?,-14.019636,-36.717781,"request:phone,"
can you please give me the address?,-9.162428,-41.409996,"request:address,"
Could you provide me with price range of Cotto? I also need their address and phone. Thank you.,-1.893271,-15.511796,"request:phone,request:address,request:price ra..."
I need the address and phone number please.,-4.406989,-27.780231,"request:phone,request:address,"


In [16]:
# Rows that mention a store-detail slot and also contain an 'inform' entry.
# This rebinds joint_tsne, replacing the Level-1 definition.
pattern = r'phone|address|name|postcode'
state_col = tsne_bert['state']
mentions_store = state_col.str.contains(pattern)
has_inform = state_col.str.contains('inform')
joint_tsne = tsne_bert[mentions_store & has_inform]
joint_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Can I have the address, phone number, and area for the most expensive one?",11.930347,-24.200754,"inform:price range,request:phone,request:area,..."
I'd like to find some Mediterranean food and need to know their address and phone number.,8.536683,-16.035639,"inform:food,request:phone,request:address,"
"Yes, please give me the one Turkish food restaurant's phone number and address please.",7.502439,-22.591951,"request:phone,request:address,inform:food,"
I don't care about the price range. Just recommend me a restaurant that serves European food. I would also like to get the phone number and directions.,13.331562,-2.218137,"inform:food,inform:price range,request:address..."
"Hi, could you tell me the phone number and address of the nearest Mexican restaurant?",9.291094,-27.68865,"inform:food,request:phone,request:address,"


In [17]:
# Rows whose state string contains neither 'request' nor 'inform'.
state_col = tsne_bert['state']
has_either = state_col.str.contains('request') | state_col.str.contains('inform')
etc_tsne = tsne_bert[~has_either]
etc_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thank you. That is all the information I needed. Bye bye!,-49.00201,-1.2656,
Thank you goodbye,-60.444393,34.956223,
Thank you!,-62.531506,1.757522,
thank you.,-68.350441,-13.813757,
"Thanks, good night!",-54.535778,14.131837,


In [18]:
from bokeh.palettes import Pastel1

# Build the Level-2 figure: one scatter layer per goal group.
tsne_plot = figure(
    title='t-SNE BERT Sentence Embeddings - Level2: goal',
    plot_width=900,
    plot_height=600,
)

colors = Pastel1[6]
# The hover tooltip shows the 'sentence' field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Strip axes, grid and outline so only the points remain.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# (legend label, frame) pairs, drawn in this order and coloured by position.
goal_groups = [
    ('food', food_tsne),
    ('price', price_tsne),
    ('area', area_tsne),
    ('store', store_tsne),
    ('joint', joint_tsne),
    ('etc', etc_tsne),
]
for (name, frame), color in zip(goal_groups, colors):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(frame),
        size=10, hover_line_color='black',
        legend=name, color=color,
    )

tsne_plot.legend.location = "top_left"
# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);

## Visualization Level 3

Sentence를 intent + goal별로 구분한다.  
request + food / area / price / joint  
inform + food / area / price / joint  
joint - request와 inform이 동시에 등장할 경우  
etc - request와 inform 둘 다 등장하지 않을 경우

In [19]:
# Request-only groups, selected by matching the whole state string.
# NOTE(review): a state made of a single request for phone/address/name/postcode
# matches none of these four patterns (the joint pattern needs two or more slots).
state_col = tsne_bert['state']

# Exactly one request slot: food.
food_pattern = r'^(request:food,){1}$' 
request_food = tsne_bert[state_col.str.match(food_pattern)]

# Exactly one request slot: area.
area_pattern = r'^(request:area,){1}$' 
request_area = tsne_bert[state_col.str.match(area_pattern)]

# Exactly one request slot: price range.
price_pattern = r'^(request:price range,){1}$' 
request_price = tsne_bert[state_col.str.match(price_pattern)]

# Two or more request slots, each drawn from the listed slot names.
joint_pattern = r'^(request:(food|area|price range|phone|address|name|postcode),){2,}$' 
request_joint = tsne_bert[state_col.str.match(joint_pattern)]


In [20]:
# Inform-only groups, selected by matching the whole state string.
# The *_pattern names are rebound here, replacing the request patterns.
state_col = tsne_bert['state']

# Exactly one inform slot: food.
food_pattern = r'^(inform:food,){1}$' 
inform_food = tsne_bert[state_col.str.match(food_pattern)]

# Exactly one inform slot: area.
area_pattern = r'^(inform:area,){1}$' 
inform_area = tsne_bert[state_col.str.match(area_pattern)]

# Exactly one inform slot: price range.
price_pattern = r'^(inform:price range,){1}$' 
inform_price = tsne_bert[state_col.str.match(price_pattern)]

# Two or more inform slots, each drawn from the listed slot names.
joint_pattern = r'^(inform:(food|area|price range|phone|address|name|postcode),){2,}$' 
inform_joint = tsne_bert[state_col.str.match(joint_pattern)]


In [21]:
# Level-3 'joint' (both inform and request present) and 'etc' (neither present).
# Both names are rebound here, replacing the Level-2 definitions.
state_col = tsne_bert['state']
has_inform = state_col.str.contains('inform')
has_request = state_col.str.contains('request')

joint_tsne = tsne_bert[has_inform & has_request]
etc_tsne = tsne_bert[~(has_request | has_inform)]

In [22]:
from bokeh.palettes import Paired

# Build the Level-3 figure: one scatter layer per intent + goal group.
tsne_plot = figure(
    title='t-SNE BERT Sentence Embeddings - Level3: intent + goal',
    plot_width=900,
    plot_height=600,
)

colors = Paired[10]
# The hover tooltip shows the 'sentence' field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Strip axes, grid and outline so only the points remain.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# (legend label, frame) pairs, drawn in this order and coloured by position.
intent_goal_groups = [
    ('request_food', request_food),
    ('request_area', request_area),
    ('request_price', request_price),
    ('request_joint', request_joint),
    ('inform_food', inform_food),
    ('inform_area', inform_area),
    ('inform_price', inform_price),
    ('inform_joint', inform_joint),
    ('joint', joint_tsne),
    ('etc', etc_tsne),
]
for (name, frame), color in zip(intent_goal_groups, colors):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(frame),
        size=10, hover_line_color='black',
        legend=name, color=color,
    )

tsne_plot.legend.location = "top_left"
# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);