## Initialize

### w2v sentence file load

In [1]:
import csv
import os

# Convert the tab-separated sentence/vector dump into a CSV file so that the
# next cell can load it with pandas.
txt_file = os.getcwd() + "/word-vectors/w2v_sentences.txt"

# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate "\n" end up with a blank line after
# every row.
with open(txt_file, 'r') as in_file, \
        open('word-vectors/re_w2v_sentence.csv', 'w', newline='') as out_file:
    # Lazily strip surrounding whitespace from each input line ...
    stripped = (line.strip() for line in in_file)
    # ... drop lines that are empty after stripping, and split the rest on tabs.
    lines = (line.split("\t") for line in stripped if line)
    writer = csv.writer(out_file)
    writer.writerows(lines)


In [4]:
import pandas as pd

# Read the sentence-vector CSV, use the "sentence" column as the row index,
# and keep only the first 250 rows for the steps that follow.
vector_filepath = "{}/word-vectors/re_w2v_sentence.csv".format(os.getcwd())
w2v_embeddings = (
    pd.read_csv(vector_filepath)
    .set_index("sentence")
    .iloc[:250, :]
)
w2v_embeddings.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,11990,11991,11992,11993,11994,11995,11996,11997,11998,11999
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
are there any eritrean restaurants in town,0.0076,-0.0623,0.0017,-0.0289,0.0707,-0.111401,0.051,0.0026,-0.088,0.290702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
how about chinese food,-0.053399,0.098998,-0.077899,-0.099398,-0.043699,0.0072,0.042599,0.095098,-0.042499,0.280595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i would like the east part of town,0.058301,0.081301,-0.090101,0.0067,-0.002,0.0219,0.0386,0.0446,0.0027,0.104101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
could i get the address phone number and postcode of yu garden,0.022301,0.067102,0.0034,-0.015201,-0.039901,0.015701,0.0103,-0.030901,-0.059902,0.369412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thank you that is all the information i needed bye bye,-0.0574,-0.0133,-0.0752,-0.0557,-0.0753,0.0249,0.0056,-0.0379,0.0131,0.183299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.manifold import TSNE
import pickle

# Set to True to reuse the pickled projection instead of fitting a new one.
USE_PREMADE_TSNE = False

tsne_filepath = os.getcwd() + "/data/w2v_sentence_tsne.pkl"

if USE_PREMADE_TSNE:
    # Load the points cached by an earlier run.
    # NOTE: pickle.load can execute code embedded in the file; only point this
    # at a cache you produced yourself.
    with open(tsne_filepath, 'rb') as f:
        tsne_points = pickle.load(f)
else:
    # Fit t-SNE on the raw embedding matrix (fixed random_state) and cache
    # the resulting points for later runs.
    tsne = TSNE(random_state=0)
    tsne_points = tsne.fit_transform(w2v_embeddings.values)
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne_points, f)

# Re-attach the sentence index so each projected point stays labelled.
tsne_w2v = pd.DataFrame(
    tsne_points,
    index=w2v_embeddings.index,
    columns=['x_coord', 'y_coord'],
)

### state tagging
각 문장에 state를 태깅하여 t-SNE 결과와 함께 확인한다.

In [8]:
import json
import codecs

training_file = os.getcwd() +'/data/woz/woz_train_en.json'

# Load the training dialogues. The result is bound to `dialogues` rather than
# `list`, so the builtin `list` stays usable in every later cell, and the
# `with` block closes the file handle once parsing is done.
with codecs.open(training_file, 'r', 'utf-8', 'ignore') as training_fp:
    dialogues = json.load(training_fp)


def _encode_turn_label(turn_label):
    """Serialise one turn's label entries as concatenated 'action:slot,' tokens.

    An entry whose first element is 'request' contributes
    'request:<second element>,'; any other entry contributes
    'inform:<first element>,'. An empty or missing label yields ''.
    Every token keeps its trailing comma because the regex filters in the
    visualization cells match on it (e.g. 'inform:food,').
    """
    tokens = []
    for belief_state in turn_label or ():
        if belief_state[0] == 'request':
            tokens.append('request:' + belief_state[1] + ',')
        else:
            tokens.append('inform:' + belief_state[0] + ',')
    return ''.join(tokens)


# One state string per dialogue turn, in file order.
state_list = [
    _encode_turn_label(dial_text['turn_label'])
    for data in dialogues
    for dial_text in data['dialogue']
]

# Take exactly as many states as there are projected sentences instead of
# repeating the row count as a literal.
# NOTE(review): assumes the i-th turn in the training file corresponds to the
# i-th row of tsne_w2v -- confirm against how the sentence file was built.
n_rows = len(tsne_w2v)
if len(state_list) < n_rows:
    raise ValueError(
        "only %d dialogue turns available for %d embedded sentences"
        % (len(state_list), n_rows)
    )
final_state_list = state_list[:n_rows]

tsne_w2v['state'] = final_state_list
tsne_w2v.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
are there any eritrean restaurants in town,11.222504,-1.402923,"inform:food,"
how about chinese food,-9.173656,-7.457738,"inform:food,"
i would like the east part of town,2.106961,-6.623255,"inform:area,"
could i get the address phone number and postcode of yu garden,-10.767793,6.837674,"request:postcode,request:phone,request:address,"
thank you that is all the information i needed bye bye,-0.555479,4.604403,


In [9]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
from bokeh.palettes import Pastel1

# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

## Visualization Level 1

Sentence를 intent별(request/inform/joint/etc)로 구분한다.

In [11]:
# Inform-only turns: the state mentions 'inform' and never 'request'.
state_col = tsne_w2v['state']
inform_only = state_col.str.contains('inform') & ~state_col.str.contains('request')
inform_tsne = tsne_w2v[inform_only]
inform_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
are there any eritrean restaurants in town,11.222504,-1.402923,"inform:food,"
how about chinese food,-9.173656,-7.457738,"inform:food,"
i would like the east part of town,2.106961,-6.623255,"inform:area,"
hi im looking for a nice german restaurant,8.02013,-12.647038,"inform:food,"
how about european yeah,-8.047486,-7.822728,"inform:food,"


In [12]:
# Request-only turns: the state mentions 'request' and never 'inform'.
state_col = tsne_w2v['state']
request_only = state_col.str.contains('request') & ~state_col.str.contains('inform')
request_tsne = tsne_w2v[request_only]
request_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
could i get the address phone number and postcode of yu garden,-10.767793,6.837674,"request:postcode,request:phone,request:address,"
great may i have the phone number please,-3.830417,-9.9359,"request:phone,"
can you please give me the address,0.106516,7.20214,"request:address,"
what is the area,5.600796,7.869204,"request:area,"
could you provide me with price range of cotto i also need their address and phone thank you,-3.832911,0.328324,"request:phone,request:address,request:price ra..."


In [13]:
# Joint turns: the state mentions both 'inform' and 'request'.
state_col = tsne_w2v['state']
inform_and_request = state_col.str.contains('inform') & state_col.str.contains('request')
joint_tsne = tsne_w2v[inform_and_request]
joint_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
can i have the address phone number and area for the most expensive one,-10.802096,6.624963,"inform:price range,request:phone,request:area,..."
id like to find some mediterranean food and need to know their address and phone number,1.890252,-10.143529,"inform:food,request:phone,request:address,"
yes please give me the one turkish food restaurants phone number and address please,-4.409922,-3.256064,"request:phone,request:address,inform:food,"
i dont care about the price range just recommend me a restaurant that serves european food i would also like to get the phone number and directions,5.410314,-6.135487,"inform:food,inform:price range,request:address..."
hi could you tell me the phone number and address of the nearest mexican restaurant,1.184799,6.270705,"inform:food,request:phone,request:address,"


In [14]:
# Remaining turns: the state mentions neither 'request' nor 'inform'
# (the sample rows shown for this cell have an empty state).
state_col = tsne_w2v['state']
request_or_inform = state_col.str.contains('request') | state_col.str.contains('inform')
etc_tsne = tsne_w2v[~request_or_inform]
etc_tsne.head()

Unnamed: 0_level_0,x_coord,y_coord,state
sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
thank you that is all the information i needed bye bye,-0.555479,4.604403,
thank you goodbye,-3.756387,3.128062,
thank you,-2.809107,1.09928,
thank you,-2.441259,0.685273,
thanks good night,-0.791337,-0.020191,


In [15]:
from bokeh.palettes import Pastel1

# Level 1 figure: one scatter layer per intent group.
tsne_plot = figure(
    title='t-SNE W2V Sentence Embeddings - Level1: intent',
    plot_width=900,
    plot_height=600,
)
# The hover tooltip shows the `sentence` field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Larger title; hide axes, grid lines and the frame outline.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Each group is paired with one palette colour, in this order.
colors = Pastel1[4]
intent_groups = [
    (inform_tsne, 'inform'),
    (request_tsne, 'request'),
    (joint_tsne, 'joint'),
    (etc_tsne, 'etc'),
]

for color, (group_frame, name) in zip(colors, intent_groups):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(group_frame),
        size=10,
        hover_line_color='black',
        legend=name,
        color=color,
    )

# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.location = "top_left"
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);

## Visualization Level 2

Sentence를 goal별(food/price/area/store/joint/etc) 로 구분한다.  
store의 세부 goal 은 phone, address, postcode, name 이다.

In [21]:
# Level 2 buckets, all derived from the `state` column of tsne_w2v.
# NOTE(review): the buckets are not exhaustive -- e.g. a state such as
# 'inform:food,inform:area,' matches none of the six filters below and is
# therefore absent from the Level 2 plot. Confirm that this is intended.
states = tsne_w2v['state']
mentions_inform = states.str.contains('inform')
mentions_request = states.str.contains('request')
mentions_store = states.str.contains(r'phone|address|name|postcode')

# Exactly one food / price range / area entry, whether inform or request.
food_tsne = tsne_w2v[states.str.match(r'^((inform|request):food,){1}$')]
price_tsne = tsne_w2v[states.str.match(r'^((inform|request):price range,){1}$')]
area_tsne = tsne_w2v[states.str.match(r'^((inform|request):area,){1}$')]

# A store slot (phone/address/name/postcode) without any inform ...
store_tsne = tsne_w2v[mentions_store & ~mentions_inform]
# ... versus a store slot together with at least one inform.
joint_tsne = tsne_w2v[mentions_store & mentions_inform]

# Neither request nor inform appears in the state.
etc_tsne = tsne_w2v[~(mentions_request | mentions_inform)]

In [22]:
from bokeh.palettes import Pastel1

# Level 2 figure: one scatter layer per goal group.
tsne_plot = figure(
    title='t-SNE W2V Sentence Embeddings - Level2: goal',
    plot_width=900,
    plot_height=600,
)
# The hover tooltip shows the `sentence` field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Larger title; hide axes, grid lines and the frame outline.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Each group is paired with one palette colour, in this order.
colors = Pastel1[6]
goal_groups = [
    (food_tsne, 'food'),
    (price_tsne, 'price'),
    (area_tsne, 'area'),
    (store_tsne, 'store'),
    (joint_tsne, 'joint'),
    (etc_tsne, 'etc'),
]

for color, (group_frame, name) in zip(colors, goal_groups):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(group_frame),
        size=10,
        hover_line_color='black',
        legend=name,
        color=color,
    )

# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.location = "top_left"
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);

## Visualization Level 3

Sentence를 intent + goal별로 구분한다.  
request + food / area / price / joint  
inform + food / area / price / joint  
joint - request와 inform이 동시에 등장할 경우  
etc - request와 inform 둘 다 등장하지 않을 경우

In [18]:
def _rows_matching(state_pattern):
    """Return the rows of tsne_w2v whose `state` satisfies str.match(state_pattern)."""
    return tsne_w2v[tsne_w2v['state'].str.match(state_pattern)]


# NOTE(review): a single store-slot request such as 'request:phone,' matches
# none of the filters in this cell (request_joint needs two or more entries),
# so those turns are absent from the Level 3 plot. Confirm that this is intended.

# Request-only states: a single food / area / price range slot, or 2+ slots.
request_food = _rows_matching(r'^(request:food,){1}$')
request_area = _rows_matching(r'^(request:area,){1}$')
request_price = _rows_matching(r'^(request:price range,){1}$')
request_joint = _rows_matching(
    r'^(request:(food|area|price range|phone|address|name|postcode),){2,}$'
)

# Inform-only states, split the same way.
inform_food = _rows_matching(r'^(inform:food,){1}$')
inform_area = _rows_matching(r'^(inform:area,){1}$')
inform_price = _rows_matching(r'^(inform:price range,){1}$')
inform_joint = _rows_matching(
    r'^(inform:(food|area|price range|phone|address|name|postcode),){2,}$'
)

# States mixing both intents, and states containing neither.
has_inform = tsne_w2v['state'].str.contains('inform')
has_request = tsne_w2v['state'].str.contains('request')
joint_tsne = tsne_w2v[has_inform & has_request]
etc_tsne = tsne_w2v[~(has_request | has_inform)]

In [20]:
from bokeh.palettes import Paired

# Level 3 figure: one scatter layer per intent + goal group.
tsne_plot = figure(
    title='t-SNE W2V Sentence Embeddings - Level3: intent + goal',
    plot_width=900,
    plot_height=600,
)
# The hover tooltip shows the `sentence` field of the hovered point.
tsne_plot.add_tools(HoverTool(tooltips='@sentence'))

# Larger title; hide axes, grid lines and the frame outline.
tsne_plot.title.text_font_size = value('16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Each group is paired with one palette colour, in this order.
colors = Paired[10]
intent_goal_groups = [
    (request_food, 'request_food'),
    (request_area, 'request_area'),
    (request_price, 'request_price'),
    (request_joint, 'request_joint'),
    (inform_food, 'inform_food'),
    (inform_area, 'inform_area'),
    (inform_price, 'inform_price'),
    (inform_joint, 'inform_joint'),
    (joint_tsne, 'joint'),
    (etc_tsne, 'etc'),
]

for color, (group_frame, name) in zip(colors, intent_goal_groups):
    tsne_plot.circle(
        'x_coord', 'y_coord',
        source=ColumnDataSource(group_frame),
        size=10,
        hover_line_color='black',
        legend=name,
        color=color,
    )

# Clicking a legend entry toggles that group's visibility.
tsne_plot.legend.location = "top_left"
tsne_plot.legend.click_policy = "hide"

show(tsne_plot);