In [2]:
import json
import os
from utils import nlp
import pandas as pd
import random
from random import shuffle, sample
random.seed(30)
from collections import Counter
import numpy as np


In [2]:
# Auxiliary functions
def slice_keys(key_list, proportion=0.3):
    """Return the leading fraction of ``key_list`` as a new list.

    The number of items kept is ``int(len(key_list) * proportion)``, i.e. the
    product truncated toward zero.
    """
    materialized = list(key_list)
    keep = int(len(key_list) * proportion)
    return materialized[:keep]

def scramble(sentence):
    """Return ``sentence`` with its whitespace-separated words in random order.

    The words are permuted with ``random.shuffle`` (so the result depends on
    the state of the global ``random`` generator) and re-joined with single
    spaces.
    """
    tokens = sentence.split()
    shuffle(tokens)  # permutes `tokens` in place
    return ' '.join(tokens)

# Generate Dataset for Language Modeling GPT
Use the MultiWOZ dataset to fine-tune a GPT model to generate synthetic utterances.

In [1]:
# Directory holding the MultiWOZ 2.1 files read below
# (data.json, valListFile.txt, testListFile.txt).
mw_base_dir = '../MultiWOZ_2.1/'

In [3]:
# Validation split: one entry per line of valListFile.txt, whitespace-trimmed.
# The entries are later used as keys into mw_data.
with open(mw_base_dir + 'valListFile.txt', 'r') as fp:
    val_list = [line.strip() for line in fp]

In [4]:
# Test split: one entry per line of testListFile.txt, whitespace-trimmed.
# The entries are later used as keys into mw_data.
with open(mw_base_dir + 'testListFile.txt', 'r') as fp:
    test_list = list(map(str.strip, fp))

In [5]:
# Parse the full dialogue dump (a mapping keyed by dialogue id, see its use below).
# A context manager is used so the file handle is closed as soon as parsing ends.
with open(mw_base_dir + 'data.json', 'r') as fp:
    mw_data = json.load(fp)

In [6]:
# Training split = every dialogue id that is in neither the test nor the
# validation list.
# The result is sorted into a list because the iteration order of a set of
# strings can change from one interpreter run to the next (string hash
# randomization); iterating a raw set would make every file derived from it,
# including the seeded shuffles, non-reproducible.
train_list = sorted(set(mw_data.keys()) - set(test_list) - set(val_list))

In [84]:
def generate_dataset_lm_rnn(keys, filename):
    """Dump normalized utterances of the given dialogues to ``<filename>.txt``.

    For every key in ``keys`` the log ``mw_data[key]['log']`` is walked two
    entries at a time and the second entry of each pair (odd index) is kept
    (presumably the system side of a user/system exchange — confirm against
    the MultiWOZ log layout). Each kept text goes through ``nlp.normalize``
    and is written on its own line, followed by a space and a newline.

    Args:
        keys: iterable of keys into ``mw_data``.
        filename: output path without extension; ``'.txt'`` is appended.
    """
    utterances = []
    for dialogue_id in keys:
        turns = mw_data[dialogue_id]['log']
        for pair_start in range(0, len(turns), 2):
            utterances.append(nlp.normalize(turns[pair_start + 1]['text']))
    # The file is only created once every utterance has been collected.
    with open(filename + '.txt', 'w') as out:
        out.writelines("%s \n" % utterance for utterance in utterances)
    
def generate_dataset_lm(keys, filename, add_label=False):
    """Write normalized utterances to ``<filename>_lm.raw`` for LM fine-tuning.

    For every key in ``keys`` the log ``mw_data[key]['log']`` is walked two
    entries at a time and the second entry of each pair (odd index) is kept,
    normalized with ``nlp.normalize`` and written as one line terminated by
    ``" <|endoftext|> \\n"``.

    Args:
        keys: iterable of keys into ``mw_data``.
        filename: output path prefix; ``'_lm.raw'`` is appended.
        add_label: when True, each line carries the label ``'1'`` after the
            text, separated by a tab.
            NOTE(review): the labelled line layout is a choice made here; no
            call in this notebook passes ``add_label=True`` — confirm the
            format the consumer expects.
    """
    examples = []
    for key in keys:
        log = mw_data[key]['log']
        for i in range(0, len(log), 2):
            text = nlp.normalize(log[i + 1]['text'])
            examples.append((text, '1') if add_label else text)
    with open(filename + '_lm.raw', 'w') as f:
        for item in examples:
            if add_label:
                # A (text, label) tuple cannot be fed to a single "%s":
                # doing so raises TypeError. Give each element its own slot.
                f.write("%s\t%s <|endoftext|> \n" % item)
            else:
                f.write("%s <|endoftext|> \n" % item)

In [85]:
# Export the training split only (written to train_lm.raw); the validation and
# test exports below are currently disabled.
generate_dataset_lm(train_list, 'train')
#generate_dataset_lm(val_list, 'valid')
#generate_dataset_lm(test_list, 'test')

In [91]:
# Number of entries read from testListFile.txt.
len(test_list)

1000

# Preprocess the utterances generated by SC-GPT
Extract the results from sc-gpt to calculate the coherence of the model

In [98]:
# SC-GPT predictions; the file is parsed as JSON despite its .txt extension.
# A context manager is used so the file handle is closed after parsing.
with open('multiwoz.pred.txt') as fp:
    pred = json.load(fp)

In [105]:
# Reference utterances of the test split, labelled '1': the second entry of
# every pair of log turns, normalized and with double quotes removed.
test = []
for dialogue_id in test_list:
    turns = mw_data[dialogue_id]['log']
    test.extend(
        (nlp.normalize(turns[pair_start + 1]['text']).replace('"', ''), '1')
        for pair_start in range(0, len(turns), 2)
    )

In [106]:
# Generated utterances, labelled '0': the first candidate of every prediction,
# cut at the end-of-text marker, trimmed, lower-cased and normalized.
gen_utt = []
for examples in pred:
    utt = examples[0]
    # str.partition returns the whole string when the marker is absent.
    # Slicing with the index from str.find would instead use -1 in that case
    # and silently drop the last character of the utterance.
    utt = utt.partition('<|endoftext|>')[0].strip().lower()
    gen_utt.append((nlp.normalize(utt), '0'))


In [108]:
# Number of generated (label '0') utterances.
len(gen_utt)

7372

In [112]:
# Mix generated (label '0') and reference (label '1') utterances, then shuffle
# the rows.
# random_state pins the permutation: random.seed() in the first cell does not
# reach pandas, which draws from NumPy's generator, so an unseeded sample()
# produces a different row order on every run.
df = pd.DataFrame(gen_utt + test, columns=['text', 'label'])
df = df.sample(frac=1, random_state=30)

In [114]:
# Persist the shuffled mix; index=False keeps the row index out of the file.
df.to_csv('mw_sc_gpt_mixed.csv', index=False)

# Generate examples for Binary classification GPT

In [7]:
# GPT-generated samples for each split. Each file is opened in a context
# manager so its handle is closed once parsed.
with open('lm_gpt/gpt_generated_train.json') as fp:
    gpt_gen_train = json.load(fp)
with open('lm_gpt/gpt_generated_test.json') as fp:
    gpt_gen_test = json.load(fp)
with open('lm_gpt/gpt_generated_valid.json') as fp:
    gpt_gen_valid = json.load(fp)

In [12]:
# Preview the first few generated samples rather than echoing the whole list
# into the notebook output.
gpt_gen_train[:10]

['prompts for you. you can contact the cinema at 01223335088. goodbye. <|endoftext|>',
 'sorry bout hotel nights - 6 people starting at 11:00 and for 4 people each time. how many would you like? <|endoftext|>',
 'the tr8510 has an hour to travel to latinity. would you like to book that ticket? <|endoftext|>',
 'the tr5831 meets that requirement. would you like to book a seat? <|endoftext|>',
 'tha adress is in the north side of town and their address is 10612 cherry hinton road cherry hinton. would you like to book a room? <|endoftext|>',
 'where did you leave from and what time were you traveling? <|endoftext|>',
 'grafton hotel located at trinity road, cambridge, cb23qq. do you have a preference for the type of attraction you are interested in? <|endoftext|>',
 'their is not worth the price range to me, they will not be available on friday night if i can try and find a cheaper one, but there are several cheap tuesday -',
 'cityroomz aligns perfectly for you. the phone number is 01223

In [48]:
# Build the dataset for coherence classification: the second entry of every
# pair of log turns in the training split, normalized, with double quotes
# removed, labelled '1'.
train = [
    (nlp.normalize(turns[pair_start + 1]['text']).replace('"', ''), '1')
    for turns in (mw_data[dialogue_id]['log'] for dialogue_id in train_list)
    for pair_start in range(0, len(turns), 2)
]
# Disabled alternative: scrambled copies of the same utterances as negatives,
# i.e. (scramble(nlp.normalize(text)), '0').

In [36]:
# GPT-generated training samples, labelled '0': keep the text before the first
# end-of-text marker (the whole sample when there is none), normalize it and
# remove double quotes.
lite = [
    (nlp.normalize(generated.partition('<|endoftext|>')[0]).replace('"', ''), '0')
    for generated in gpt_gen_train
]

In [52]:
# In-place shuffle driven by the `random` module seeded in the first cell.
shuffle(train)

In [54]:
# pandas is already imported as `pd` in the first cell; this repeat import is
# harmless but redundant.
import pandas as pd
df = pd.DataFrame(train, columns=['text','label'])
# NOTE(review): in notebook order `train` holds only label-'1' rows built from
# train_list, and `lite` (the label-'0' GPT samples) is never added, yet the
# file is named valid_sc_gpt_mixed.csv — confirm this is the intended content.
df.to_csv('valid_sc_gpt_mixed.csv', index=False)

# Generate examples for Binary classification LSTM

In [39]:
base_lstm = 'lm_lstm/'
# Read the LSTM samples in one go; the context manager closes the handle,
# which a bare open() would leave open for the rest of the session.
with open(base_lstm + 'generated_train.txt', 'r') as fin:
    data = fin.read()
# Samples are delimited by the '<eos>' token; line breaks inside a sample are
# folded into spaces and surrounding whitespace is trimmed.
lstm_data = data.split('<eos>')
lstm_data = [elem.replace('\n', ' ').strip() for elem in lstm_data]

In [11]:
# Number of '<eos>'-delimited chunks read from the LSTM sample file.
len(lstm_data)

54462

In [79]:
# Label-'1' utterances: the second entry of every pair of log turns,
# normalized and with double quotes removed.
# Note that, despite its name, `train` is filled from the validation split
# (val_list) in this cell.
train = []
for dialogue_id in val_list:
    turns = mw_data[dialogue_id]['log']
    for pair_start in range(0, len(turns), 2):
        utterance = nlp.normalize(turns[pair_start + 1]['text'])
        train.append((utterance.replace('"', ''), '1'))
# Disabled alternative: scrambled copies of the same utterances as negatives,
# i.e. (scramble(nlp.normalize(text)), '0').

In [91]:
# Label every LSTM-generated sample '0' and append it to `val`.
# NOTE(review): no visible cell creates `val`, so this raises NameError after
# Restart & Run All. It presumably came from a since-deleted cell holding the
# label-'1' utterances — confirm and initialise it explicitly before this loop.
for elem in lstm_data:
    val.append((nlp.normalize(elem), '0'))

In [92]:
# Turn the (text, label) pairs into a frame, shuffle the rows and save them.
# random_state pins the permutation: random.seed() in the first cell does not
# reach pandas, so an unseeded sample() gives a different order on every run.
df = pd.DataFrame(val, columns=['text','label'])
df = df.sample(frac=1, random_state=30)
df.to_csv('train_bert_lstm.csv', index=False)

In [93]:
# (rows, columns) of the frame that was just written to disk.
df.shape

(113556, 2)

# Generate examples mixing GPT and LSTM

In [37]:
# GPT-generated samples for each split (the same three files are also loaded
# in the GPT classification section). Each file is opened in a context manager
# so its handle is closed once parsed.
with open('lm_gpt/gpt_generated_train.json') as fp:
    gpt_gen_train = json.load(fp)
with open('lm_gpt/gpt_generated_test.json') as fp:
    gpt_gen_test = json.load(fp)
with open('lm_gpt/gpt_generated_valid.json') as fp:
    gpt_gen_valid = json.load(fp)

In [85]:
# GPT-generated validation samples, labelled '0': keep the text before the
# first end-of-text marker (the whole sample when there is none), normalize it
# and remove double quotes.
lite = [
    (nlp.normalize(generated.partition('<|endoftext|>')[0]).replace('"', ''), '0')
    for generated in gpt_gen_valid
]

In [86]:
# Number of GPT-generated (label '0') samples.
len(lite)

7153

In [82]:
base_lstm = 'lm_lstm/'
# Read the LSTM samples in one go; the context manager closes the handle,
# which a bare open() would leave open for the rest of the session.
with open(base_lstm + 'generated_valid.txt', 'r') as fin:
    data = fin.read()
# Samples are delimited by the '<eos>' token; line breaks inside a sample are
# folded into spaces and surrounding whitespace is trimmed.
lstm_data = data.split('<eos>')
lstm_data = [elem.replace('\n', ' ').strip() for elem in lstm_data]

In [83]:
# LSTM-generated samples, normalized and labelled '0'.
train_lstm = [(nlp.normalize(sample_text), '0') for sample_text in lstm_data]

In [84]:
# Number of LSTM-generated (label '0') samples.
len(train_lstm)

7629

In [80]:
# Half the number of label-'1' utterances in `train`; this is the size of each
# generated-negative slice taken when the sources are mixed.
len(train)//2

3687

In [87]:
# Balance the classes: all label-'1' utterances plus an equal total of
# label-'0' samples, half from the LSTM and half from GPT.
# The slice size is derived from `train` rather than hard-coded, so it stays
# correct if the positive set changes.
half = len(train) // 2
full = train + train_lstm[:half] + lite[:half]

In [88]:
# In-place shuffle driven by the `random` module seeded in the first cell.
shuffle(full)

In [90]:
# Turn the mixed (text, label) pairs into a frame, shuffle the rows and save.
# random_state pins the permutation: random.seed() in the first cell does not
# reach pandas, so an unseeded sample() gives a different order on every run.
df = pd.DataFrame(full, columns=['text','label'])
df = df.sample(frac=1, random_state=30)
df.to_csv('valid_bert_lstm_gpt.csv', index=False)

# Most frequent system prompts

In [48]:
# Label-'1' utterances of the training split: the second entry of every pair
# of log turns, normalized. Unlike the classification cells, double quotes are
# not removed here.
train = []
for dialogue_id in train_list:
    turns = mw_data[dialogue_id]['log']
    train.extend(
        (nlp.normalize(turns[pair_start + 1]['text']), '1')
        for pair_start in range(0, len(turns), 2)
    )

In [49]:
# Number of utterances collected from the training split.
len(train)

56778

In [60]:
# Build the prompt file: a "prompt" is the first two whitespace-separated
# words of an utterance.
# Each distinct prompt is repeated this many times. With the recorded counts
# this brings the row total close to the number of training utterances
# (presumably the intent — confirm).
PROMPT_REPEATS = 14

df = pd.DataFrame(train).rename(columns={0: 'text', 1: 'label'})
df['prompts'] = df.text.map(lambda x: ' '.join(x.split()[:2]))
# Count utterances per prompt, order prompts from most to least frequent, then
# keep only the prompt column (the count columns are no longer needed).
df = (
    df.groupby('prompts')
    .count()
    .sort_values('label', ascending=False)
    .reset_index()
    .drop(columns=['text', 'label'])
)
newdf = pd.DataFrame(np.repeat(df.values, PROMPT_REPEATS, axis=0), columns=df.columns)
# random_state pins the permutation: random.seed() in the first cell does not
# reach pandas, so an unseeded sample() gives a different order on every run.
newdf = newdf.sample(frac=1, random_state=30)
newdf.to_csv('prompts_train.csv', index=False)

In [59]:
# (rows, columns) of the repeated-prompt frame written to prompts_train.csv.
newdf.shape

(57568, 1)

# HDSA Lexicalization to calculate the coherence

In [173]:
# List the files available in the hdsa/ directory.
os.listdir('hdsa')

['test_reference2.json',
 'results.txt.pred.BERT_dim128_w_domain.pred',
 'test.json']

In [175]:
# HDSA test set; the context manager closes the file handle after parsing.
with open('hdsa/test.json') as fp:
    hdsa_test = json.load(fp)

In [183]:
# Map each dialogue's 'file' entry to the list of 'act' values of its turns,
# in turn order.
dacts = {
    dialogue['file']: [turn['act'] for turn in dialogue['info']]
    for dialogue in hdsa_test
}

In [286]:
# HDSA model predictions (parsed as JSON); the context manager closes the file
# handle after parsing.
with open('hdsa/results.txt.pred.BERT_dim128_w_domain.pred') as fp:
    generated_hdsa = json.load(fp)

In [217]:
# Token replacements such as "hotels" => "hotel -s", one tab-separated pair
# per line of mapping.pair. Both sides are padded with spaces so that only
# whole tokens are matched.
# The context manager closes the file, which a bare open() would leave open.
replacements = []
with open('hdsa/mapping.pair') as fin:
    for line in fin:
        tok_from, tok_to = line.replace('\n', '').split('\t')
        replacements.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))

In [223]:
# For every dialogue, turn each dialogue act into a {placeholder: value} map:
#   * a slot whose last '-' part is 'choice' is stored under '[value_count]';
#   * values 'none' and '?' are skipped;
#   * any other slot gets the key '[<first part>_<last part>]' and its value
#     after the token replacements have been applied.
dialogue_acts = {}
value_count = '[value_count]'
for fname, turn_acts in dacts.items():
    formatted_turns = []
    for dact in turn_acts:
        formatted_da = {}
        for slot, value in dact.items():
            slot_parts = slot.split('-')
            if slot_parts[-1] == 'choice':
                formatted_da[value_count] = value
                continue
            if value == 'none' or value == '?':
                continue
            for fromx, tox in replacements:
                # Pad with spaces so whole-token patterns also match at the
                # ends of the value, then strip the padding again.
                value = (' ' + value + ' ').replace(fromx, tox)[1:-1]
            formatted_da['[' + slot_parts[0] + '_' + slot_parts[-1] + ']'] = value
        formatted_turns.append(formatted_da)
    dialogue_acts[fname] = formatted_turns


In [288]:
# Substitute the formatted dialogue-act slot values into the generated
# utterances. Utterances and act maps are paired position by position;
# placeholders with no entry in the turn's map are left as they are.
cleaned = []
for fname, utts in generated_hdsa.items():
    for template, slot_values in zip(utts, dialogue_acts[fname]):
        lexicalized = template
        for placeholder, surface in slot_values.items():
            lexicalized = lexicalized.replace(placeholder, surface)
        cleaned.append((lexicalized, '1'))

In [289]:
# Preview the first few lexicalized utterances rather than echoing the whole
# list into the notebook output.
cleaned[:10]

[('there are 23 hotel -s that meet your criteria . do you have a preference for area ?',
  '1'),
 ('there are 9 guesthouses in the various . the [hotel_name] is [hotel_pricerange] -ly priced .',
  '1'),
 ('i would recommend the [hotel_name] . would you like me to book it for you ?',
  '1'),
 ('i can help you with that . what is your stay ?', '1'),
 ('i can book it for you .', '1'),
 ('yes , it is . would you like me to book it for you ?', '1'),
 ('your booking was successful . the table will be reserved for [value_count] minutes . your reference number is [restaurant_reference] . is there anything else i can help you with ?',
  '1'),
 ('you are welcome . have a great day !', '1'),
 ('nandos is a [restaurant_pricerange] [restaurant_food] restaurant in the city centre of town at 33-34 saint andrews street . would you like me to book a table for you ?',
  '1'),
 ('your booking was successful . the table will be reserved for [value_count] minutes . your reference number is [restaurant_refe

In [293]:
# Wrap the (utterance, label) pairs in a two-column frame.
hdsa_df = pd.DataFrame(cleaned, columns=['text','label'])

# Save without the row index.
hdsa_df.to_csv('test_hdsa.csv', index=False)