The notebook generates utterance pairs from tagged utterances, where tags are {B-START, I-START, B-OTHER, I-OTHER, O}.

From the start of every conversation, i.e. utterances with B-START tag, an utterance-reponse pair is construct by assuming the next utterance in the conversation is a reponse. We stop the pairing if we reach the end of the conversation.

e.g.


# Import Libraries and Data

In [13]:
import pandas as pd
import os
import datetime
pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows", 1000)

In [15]:
df_to_generate = pd.read_csv('../data/labeled/iob-labeled-sent-final-060519-v2.csv', index_col=[0])

In [16]:
df_to_generate.head()

Unnamed: 0,old_para_index,para_index,sent,label,split_tag
0,0,0,By Jane Austen,O,train
1,63,1,Chapter 1,O,train
2,64,2,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",O,train
3,65,3,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",O,train
4,66,4,"""My dear Mr Bennet,""",B-START,train


In [18]:
df_to_generate.loc[91:95]

Unnamed: 0,old_para_index,para_index,sent,label,split_tag
91,134,72,"""Come, Darcy,""",B-START,train
92,134,72,"""I must have you dance. I hate to see you standing about by yourself in this stupid manner. You had much better dance.""",I-START,train
93,135,73,"""I certainly shall not. You know how I detest it, unless I am particularly acquainted with my partner. At such an assembly as this it would be insupportable. Your sisters are engaged, and there is not another woman in the room whom it would not be a punishment to me to stand up with.""",B-OTHER,train
94,136,74,"""I would not be so fastidious as you are,""",B-OTHER,train
95,136,74,"""for a kingdom! Upon my honour, I never met with so many pleasant girls in my life as I have this evening; and there are several of them you see uncommonly pretty.""",I-OTHER,train


In [19]:
def add_chapters(df):
    chapter_dict = dict()
    chapter_tag = ''
    for i in df.index:
        curr_chapter_tag = df.loc[i]['chapter_tag']
        if curr_chapter_tag == '':
            if chapter_tag != '':
                chapter_dict[i] = chapter_tag
            else:
                chapter_tag = curr_chapter_tag
                chapter_dict[i] = chapter_tag
        else:
            if chapter_tag == curr_chapter_tag:
                chapter_dict[i] = chapter_tag
            else:
                chapter_tag = curr_chapter_tag
                chapter_dict[i] = chapter_tag
    return chapter_dict

In [20]:
df_to_generate['chapter_tag'] = df_to_generate['sent'].apply(lambda x: x if 'chapter ' in x.lower() else '')
df_to_generate['chapter_tag'] = list(x[1] for x in add_chapters(df_to_generate).items())

# Generate Utterance Pairs

In [21]:
df_utter = df_to_generate[df_to_generate['label']!='O']
utter_list = list(df_utter.apply(lambda row: row['sent'], axis=1).values)

print('Number of utterances: {}'.format(len(utter_list)))

utter_pair_list = []
for idx, i in enumerate(utter_list):
    if idx < (len(utter_list)-1):
        utter_pair_list.append((i,)+(utter_list[idx+1],))
utter_pair_list.append(('',)+('',))

df_pairs = pd.DataFrame()
df_pairs['utter_1'] = [x[0] for x in utter_pair_list]
df_pairs['utter_2'] = [x[1] for x in utter_pair_list]
df_pairs['label'] = df_utter['label'].values
df_pairs['chapter_tag'] = df_utter['chapter_tag'].values

indices_to_rm = []
for i in df_pairs.index:
    if (df_pairs.loc[i]['label'] == 'B-START') & (i!=0):
        indices_to_rm.append(i-1)

df_pairs = df_pairs.drop(indices_to_rm, axis=0)
df_pairs = df_pairs.reset_index()
del df_pairs['index']

test_set_chapters = ['Chapter {}'.format(x) for x in range(19,27)]
validation_set_chapters = ['Chapter {}'.format(x) for x in range(27,34)]
def custom_train_test_split(field):
    if field in test_set_chapters:
        return 'test'
    elif field in validation_set_chapters:
        return 'validation'
    else:
        return 'train'
df_pairs['split_tag'] = df_pairs['chapter_tag'].apply(lambda x: custom_train_test_split(x))

print('Generated {} utterance pairs'.format(df_pairs.shape[0]))

print('Saving to csv..')

# Save as csv
NAME = 'true'
dirname = os.path.dirname('__file__')
output_path = os.path.join(dirname, '../data/utterance_pairs/')
current_year = str(datetime.datetime.now())[0:10]
csv_name = '{}-utter-pairs-{}.csv'.format(NAME, current_year)
df_pairs.to_csv(output_path + csv_name)

print('Done')

Number of utterances: 1740
Generated 1512 utterance pairs
Saving to csv..
Done


# Preview

In [22]:
# preview
df_pairs_true = pd.read_csv('../data/utterance_pairs/true-utter-pairs-2019-05-06.csv', index_col=[0])

In [23]:
df_pairs_true.head(50)

Unnamed: 0,utter_1,utter_2,label,chapter_tag,split_tag
0,"""My dear Mr Bennet,""","""have you heard that Netherfield Park is let at last?""",B-START,Chapter 1,train
1,"""have you heard that Netherfield Park is let at last?""","""But it is,""",I-START,Chapter 1,train
2,"""But it is,""","""for Mrs Long has just been here, and she told me all about it.""",B-OTHER,Chapter 1,train
3,"""for Mrs Long has just been here, and she told me all about it.""","""Do you not want to know who has taken it?""",I-OTHER,Chapter 1,train
4,"""Do you not want to know who has taken it?""","""You want to tell me, and I have no objection to hearing it.""",B-OTHER,Chapter 1,train
5,"""You want to tell me, and I have no objection to hearing it.""","""Why, my dear, you must know, Mrs Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it, that he agreed with Mr Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week.""",B-OTHER,Chapter 1,train
6,"""Why, my dear, you must know, Mrs Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it, that he agreed with Mr Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week.""","""What is his name?""",B-OTHER,Chapter 1,train
7,"""What is his name?""","""Bingley.""",B-OTHER,Chapter 1,train
8,"""Bingley.""","""Is he married or single?""",B-OTHER,Chapter 1,train
9,"""Is he married or single?""","""Oh! Single, my dear, to be sure! A single man of large fortune; four or five thousand a year. What a fine thing for our girls!""",B-OTHER,Chapter 1,train


In [24]:
df_pairs_true_test =  df_pairs_true[df_pairs_true['split_tag']=='test']

In [25]:
df_pairs_true_test.shape

(148, 5)