# Import Libraries and Data

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_rows", 1000)

import sys
sys.path.append('../scripts')

from html_fiction_parser import *

%load_ext autoreload
%autoreload 2

In [4]:
sentences = get_fiction_sentences("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")

In [5]:
df = pd.DataFrame(sentences, columns=['text'])

index_to_exclude = [i for i in range(1,63)]
df = df.drop(index_to_exclude, axis=0)
df = df.reset_index()
df.shape

(2119, 2)

In [6]:
def get_num_quotes(sent):
    return sent.count('"')
    
def add_tag_to_dict(sent_index, sent, somedict, tag):
    global total_quotes_count
    num_utterance = int(get_num_quotes(sent)/2)
    somedict[sent_index] = (tag, total_quotes_count, num_utterance)
    return somedict
    
def is_quote_start_of_sent(sent):
    # must have capitalized alphabet after quote
    return bool(re.match('^"[A-Z]', sent))

def is_quote_end_of_sent(sent):
    return bool(re.match('.*"$', sent))

def tag_dialogue(sent_index, sent):
    global total_quotes_count, sent_dia_dict, sent_list
    
    num_quotes = get_num_quotes(sent)
    
    if num_quotes == 0:
        add_tag_to_dict(sent_index, sent, sent_dia_dict, 'narrative')
    
    elif num_quotes == 1:
        total_quotes_count += 1
        add_tag_to_dict(sent_index, sent, sent_dia_dict, 'letter')#letter-S

                
    elif num_quotes == 2:
        total_quotes_count += 2
        add_tag_to_dict(sent_index, sent, sent_dia_dict, 'utterance')

        
    elif num_quotes > 2:
        if (num_quotes%2) == 0:
            total_quotes_count += num_quotes
            add_tag_to_dict(sent_index, sent, sent_dia_dict, 'utterance')
        else:
            total_quotes_count += num_quotes
            add_tag_to_dict(sent_index, sent, sent_dia_dict, 'letter')

    return sent_dia_dict

In [7]:
doc = df['text'].values

total_quotes_count = 0
sent_dia_dict = dict()
for sent_index, sent in enumerate(doc):
    sent_dia_dict = tag_dialogue(sent_index, sent)

In [8]:
len(df)

2119

In [9]:
df = pd.DataFrame(list(sent_dia_dict.items()), columns=['para_index', 'tag'])

In [10]:
df.shape

(2119, 2)

In [11]:
df['para'] = doc
df['num_quotes_up_to_para'] = df['tag'].apply(lambda x: x[1])
df['num_utterances'] = df['tag'].apply(lambda x: x[2])
df['tag'] = df['tag'].apply(lambda x: x[0])


In [12]:
df['tag'].value_counts()

utterance    1289
narrative    779 
letter       51  
Name: tag, dtype: int64

In [13]:
# df[(df['para'].apply(lambda x: is_quote_start_of_sent(x)==False)) & (df['tag']=='utterance')]

In [144]:
def find_utterances_raw(para):
    quotes = re.findall('"([^"]*)"', para)
    return ['"' + i + '"' for i in quotes]

def remove_period_from_honorifics(para):
    regex = '(Miss.|Mrs.|Mr.|Ms.)'
    for i in re.findall(regex, para, flags=re.IGNORECASE):
        para = para.replace(i, i[:-1])
    return para

def replace_narrative_w_mark(sent_list):
    new_sent_list = []
    for sent in sent_list:
        if ((get_num_quotes(sent))%2==0) & ((get_num_quotes(sent)!=0)):
            new_sent_list.append(sent)
        else:
            new_sent_list.append('*')
    return ' '.join(new_sent_list)

def tokenize_para(para):
    para = remove_period_from_honorifics(para)
#     regex = '((?![.\s])[^."]*(?:"[^"]*[^".]"[^."]*)*(?:"[^"]+\."|\.))'
#     regex = '((?![.\s])[^."]*(?:"[^"]*[^".]"[^."]*)*(?:"[^"]+\."|\.|"[^"]+\?"|"[^"]+\!"))'
    regex = r'(("[^"]*")|([^"]*))'
    results = [i[0] for i in re.findall(regex, para)]
    tag_texts = [i for i in results if i]
    output_string = replace_narrative_w_mark(tag_texts)
    return output_string


def tokenize_para_b(para):
    para = remove_period_from_honorifics(para)
#     regex = '((?![.\s])[^."]*(?:"[^"]*[^".]"[^."]*)*(?:"[^"]+\."|\.))'
#     regex = '((?![.\s])[^."]*(?:"[^"]*[^".]"[^."]*)*(?:"[^"]+\."|\.|"[^"]+\?"|"[^"]+\!"))'
    regex = r'(("[^"]*")|([^"]*))'
    results = [i[0] for i in re.findall(regex, para)]
    tag_texts = [i for i in results if i]
    output_string = ' '.join(tag_texts)
    return output_string


            

In [16]:
teststring0 = '"Come here, child," cried her father as she appeared. "I have sent for you on an affair of importance. I understand that Mr. Collins has made you an offer of marriage. Is it true?" Elizabeth replied that it was. "Very well—and this offer of marriage you have refused?"'
print(teststring0)

"Come here, child," cried her father as she appeared. "I have sent for you on an affair of importance. I understand that Mr. Collins has made you an offer of marriage. Is it true?" Elizabeth replied that it was. "Very well—and this offer of marriage you have refused?"


In [17]:
tokenize_para(teststring0)

'"Come here, child," [N] "I have sent for you on an affair of importance. I understand that Mr Collins has made you an offer of marriage. Is it true?" [N] "Very well—and this offer of marriage you have refused?"'

In [18]:
teststring1 = '"My dear Mr. Bennet," replied his Ms. wife, "how can you be so tiresome! You must know that I am thinking of his marrying one of them."'
print(teststring1)

"My dear Mr. Bennet," replied his Ms. wife, "how can you be so tiresome! You must know that I am thinking of his marrying one of them."


In [19]:
tokenize_para(teststring1)

'"My dear Mr Bennet," [N] "how can you be so tiresome! You must know that I am thinking of his marrying one of them."'

In [20]:
find_utterances_raw(teststring1)

['"My dear Mr. Bennet,"',
 '"how can you be so tiresome! You must know that I am thinking of his marrying one of them."']

In [21]:
teststring2 = '"My dear Mr. Bennet," said his lady to him one day, "have you heard that Netherfield Park is let at last?"'
print(teststring2)

"My dear Mr. Bennet," said his lady to him one day, "have you heard that Netherfield Park is let at last?"


In [22]:
tokenize_para(teststring2)

'"My dear Mr Bennet," [N] "have you heard that Netherfield Park is let at last?"'

In [23]:
df['raw_utter_list'] = df['para'].apply(lambda x: find_utterances_raw(x))

In [24]:
df['tokenized_sent'] = df['para'].apply(lambda x: tokenize_para(x))

In [25]:
df.shape

(2119, 7)

# Cleaning Manual Labeled CSV

In [26]:
df_labeled = pd.read_csv('../../data/parsed-n-labeled-data/pnp-gutenberg-label-task-pride-and-prejudice-by-jane-austen.csv', index_col=[0])

In [27]:
df_labeled = df_labeled.reset_index()
df_labeled.columns = ['para_index', 'tag_old','para', 'position', 'speaker', 'type']

In [28]:
df_labeled['type'] = df_labeled['type'].replace(' c', 'c').replace('c ', 'c').fillna('')

In [29]:
df_labeled[df_labeled['position'].isnull()]

Unnamed: 0,para_index,tag_old,para,position,speaker,type
344,344,not,He was silent.,,,
1722,1722,not,Chapter 53,,,


In [30]:
df_labeled.loc[344, 'position'] = 'n'
df_labeled.loc[1722, 'position'] = 'n'

In [31]:
df_labeled['position'] = df_labeled['position'].apply(lambda x: x.lower())

In [32]:
df_labeled.loc[343]

para_index    343                                     
tag_old       dialogue-F                              
para          “How can you contrive to write so even?”
position      1                                       
speaker       NaN                                     
type                                                  
Name: 343, dtype: object

In [33]:
df_labeled['position'].value_counts()

n    785
1    689
2    507
3    99 
4    29 
5    8  
6    2  
Name: position, dtype: int64

In [34]:
df_labeled['type'].value_counts()

            1819
c           225 
letter      71  
emphasis    2   
quote       2   
Name: type, dtype: int64

In [35]:
def recount_convo(type_tag, position_tag):
    global convo_count
    
    if (type_tag == 'c') & (position_tag.lower()!='n'):
        convo_count += 1
        return 'c_{}'.format(convo_count)
    elif (type_tag == '') & (position_tag.lower()!='n'):
        return 'c_{}'.format(convo_count)
    elif (type_tag == 'letter'):
        return 'letter'
    elif (type_tag == 'quote'):
        return 'quote'
    else:
        return 'narrative'



In [36]:
convo_count = 0
df_labeled['new_type_tag'] = df_labeled.apply(lambda row: recount_convo(str(row['type']), str(row['position'])), axis=1)

In [37]:
df_labeled.head(10)

Unnamed: 0,para_index,tag_old,para,position,speaker,type,new_type_tag
0,0,not,By Jane Austen,n,,,narrative
1,1,not,Chapter 1,n,,,narrative
2,2,not,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",n,,,narrative
3,3,not,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",n,,,narrative
4,4,dialogue-F,"“My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?”",1,mother,c,c_1
5,5,not,Mr. Bennet replied that he had not.,n,,,narrative
6,6,dialogue-F,"“But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.”",1,mother,,c_1
7,7,not,Mr. Bennet made no answer.,n,,,narrative
8,8,dialogue-F,“Do you not want to know who has taken it?” cried his wife impatiently.,1,mother,,c_1
9,9,dialogue-F,"“You want to tell me, and I have no objection to hearing it.”",2,father,,c_1


# Merging

In [263]:
df_merged = pd.merge(df, df_labeled, on='para_index', how='left')

In [264]:
df_merged.shape

(2119, 13)

In [265]:
df_merged.head()

Unnamed: 0,para_index,tag,para_x,num_quotes_up_to_para,num_utterances,raw_utter_list,tokenized_sent,tag_old,para_y,position,speaker,type,new_type_tag
0,0,narrative,By Jane Austen,0,0,[],[N],not,By Jane Austen,n,,,narrative
1,1,narrative,Chapter 1,0,0,[],[N],not,Chapter 1,n,,,narrative
2,2,narrative,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",0,0,[],[N],not,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",n,,,narrative
3,3,narrative,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",0,0,[],[N],not,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",n,,,narrative
4,4,utterance,"""My dear Mr. Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?""",4,2,"[""My dear Mr. Bennet,"", ""have you heard that Netherfield Park is let at last?""]","""My dear Mr Bennet,"" [N] ""have you heard that Netherfield Park is let at last?""",dialogue-F,"“My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?”",1,mother,c,c_1


In [266]:
columns_i_want = ['para_index', 'para_x', 'num_utterances', 'raw_utter_list', 'tokenized_sent', 'position', 'speaker', 'new_type_tag']
df_merged = df_merged[columns_i_want]

In [267]:
df_merged = df_merged.fillna('')

In [268]:
df_merged.columns

Index(['para_index', 'para_x', 'num_utterances', 'raw_utter_list',
       'tokenized_sent', 'position', 'speaker', 'new_type_tag'],
      dtype='object')

In [269]:
df_merged['chapter_tag'] = df_merged['para_x'].apply(lambda x: x if 'chapter ' in x.lower() else '')

In [270]:
chapter_dict = dict()
chapter_tag = ''
for i in df_merged.index:
    curr_chapter_tag = df_merged.loc[i]['chapter_tag']
    if curr_chapter_tag == '':
        if chapter_tag != '':
            chapter_dict[i] = chapter_tag
        else:
            chapter_tag = curr_chapter_tag
            chapter_dict[i] = chapter_tag
    else: 
        if chapter_tag == curr_chapter_tag:
            chapter_dict[i] = chapter_tag
        else:
            chapter_tag = curr_chapter_tag
            chapter_dict[i] = chapter_tag



In [271]:
df_merged['new_chapter_tag'] = list(x[1] for x in chapter_dict.items())

In [272]:
def is_utterance(sent):
    return get_num_quotes(sent)%2==0


def tag_tokens(sent_list):
    new_list = []
    for sent in sent_list:
        if is_utterance(sent):
            new_list.append((sent, ))

In [273]:
new_label_dict = dict()
count = 0 
for j in df_merged.index:
    tuple_i_want = (j,) \
                + tuple(list(df_merged.loc[j][['position', 'speaker', 'new_type_tag', 'new_chapter_tag']].values)) \
                + (tokenize_para_b(df_merged.loc[j]['para_x']),)
    new_label_dict[count] = tuple_i_want
    count += 1     

In [274]:
df_final = pd.DataFrame(list(new_label_dict.items()), columns=['sent_index', 'tag'])

In [275]:
df_final['para_index'] = df_final['tag'].apply(lambda x: x[0])
df_final['position_tag'] = df_final['tag'].apply(lambda x: x[1])
df_final['speaker_tag'] = df_final['tag'].apply(lambda x: x[2])
df_final['type_tag'] = df_final['tag'].apply(lambda x: x[3])
df_final['chapter_tag'] = df_final['tag'].apply(lambda x: x[4])
df_final['tokenized_sent'] = df_final['tag'].apply(lambda x: x[5])


In [276]:
df_final.head()

Unnamed: 0,sent_index,tag,para_index,position_tag,speaker_tag,type_tag,chapter_tag,tokenized_sent
0,0,"(0, n, , narrative, , By Jane Austen)",0,n,,narrative,,By Jane Austen
1,1,"(1, n, , narrative, Chapter 1, Chapter 1)",1,n,,narrative,Chapter 1,Chapter 1
2,2,"(2, n, , narrative, Chapter 1, It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.)",2,n,,narrative,Chapter 1,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."
3,3,"(3, n, , narrative, Chapter 1, However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.)",3,n,,narrative,Chapter 1,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
4,4,"(4, 1, mother, c_1, Chapter 1, ""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?"")",4,1,mother,c_1,Chapter 1,"""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?"""


In [277]:
new_label_dict = dict()
count = 0 
for j in df_merged.index:
    tuple_i_want = (j,) \
                + tuple(list(df_merged.loc[j][['position', 'speaker', 'new_type_tag', 'new_chapter_tag']].values)) \
                + (tokenize_para(df_merged.loc[j]['para_x']),)
    new_label_dict[count] = tuple_i_want
    count += 1     

In [278]:
df_final_1 = pd.DataFrame(list(new_label_dict.items()), columns=['sent_index', 'tag'])
df_final_1['para_index'] = df_final_1['tag'].apply(lambda x: x[0])
df_final_1['position_tag'] = df_final_1['tag'].apply(lambda x: x[1])
df_final_1['speaker_tag'] = df_final_1['tag'].apply(lambda x: x[2])
df_final_1['type_tag'] = df_final_1['tag'].apply(lambda x: x[3])
df_final_1['chapter_tag'] = df_final_1['tag'].apply(lambda x: x[4])
df_final_1['tokenized_sent'] = df_final_1['tag'].apply(lambda x: x[5])

In [279]:
index_to_exclude = df_final_1[df_final_1['tokenized_sent'].apply(lambda x: x=='*')].index

In [280]:
df_final_1 = df_final_1.drop(index_to_exclude, axis=0)

In [281]:
# df_final = df_final.reset_index()
# del df_final['index']

In [282]:
df_final_1.shape

(1290, 8)

In [283]:
df_final.shape

(2119, 8)

In [284]:
label_dict = dict()

conver_tag = ''
position_tag = 'n'

for i in df_final.index:
    curr_position_tag = df_final.loc[i]['position_tag']
    curr_conver_tag = df_final.loc[i]['type_tag']
    if (curr_conver_tag == conver_tag) & (curr_position_tag!='n'):
        if curr_position_tag == position_tag:
            label_dict[i] = 'P'
            position_tag = curr_position_tag
            conver_tag = curr_conver_tag
        else:
            label_dict[i] = 'R'
            position_tag = curr_position_tag
            conver_tag = curr_conver_tag
    else: 
        label_dict[i] = 'O'
        position_tag = curr_position_tag
        conver_tag = curr_conver_tag    

In [285]:
df_final['label'] = list(x[1] for x in label_dict.items())

In [286]:
label_dict = dict()

conver_tag = ''
position_tag = 'n'

for i in df_final_1.index:
    curr_position_tag = df_final_1.loc[i]['position_tag']
    curr_conver_tag = df_final_1.loc[i]['type_tag']
    if (curr_conver_tag == conver_tag) & (curr_position_tag!='n'):
        if curr_position_tag == position_tag:
            label_dict[i] = 'P'
            position_tag = curr_position_tag
            conver_tag = curr_conver_tag
        else:
            label_dict[i] = 'R'
            position_tag = curr_position_tag
            conver_tag = curr_conver_tag
    else: 
        label_dict[i] = 'O'
        position_tag = curr_position_tag
        conver_tag = curr_conver_tag    

In [287]:
df_final_1['label'] = list(x[1] for x in label_dict.items())

In [288]:
df_final_1.head(100)

Unnamed: 0,sent_index,tag,para_index,position_tag,speaker_tag,type_tag,chapter_tag,tokenized_sent,label
4,4,"(4, 1, mother, c_1, Chapter 1, ""My dear Mr Bennet,"" * ""have you heard that Netherfield Park is let at last?"")",4,1,mother,c_1,Chapter 1,"""My dear Mr Bennet,"" * ""have you heard that Netherfield Park is let at last?""",O
6,6,"(6, 1, mother, c_1, Chapter 1, ""But it is,"" * ""for Mrs Long has just been here, and she told me all about it."")",6,1,mother,c_1,Chapter 1,"""But it is,"" * ""for Mrs Long has just been here, and she told me all about it.""",P
8,8,"(8, 1, mother, c_1, Chapter 1, ""Do you not want to know who has taken it?"" *)",8,1,mother,c_1,Chapter 1,"""Do you not want to know who has taken it?"" *",P
9,9,"(9, 2, father, c_1, Chapter 1, ""You want to tell me, and I have no objection to hearing it."")",9,2,father,c_1,Chapter 1,"""You want to tell me, and I have no objection to hearing it.""",R
11,11,"(11, 1, , c_1, Chapter 1, ""Why, my dear, you must know, Mrs Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it, that he agreed with Mr Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week."")",11,1,,c_1,Chapter 1,"""Why, my dear, you must know, Mrs Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so much delighted with it, that he agreed with Mr Morris immediately; that he is to take possession before Michaelmas, and some of his servants are to be in the house by the end of next week.""",R
12,12,"(12, 2, , c_1, Chapter 1, ""What is his name?"")",12,2,,c_1,Chapter 1,"""What is his name?""",R
13,13,"(13, 1, , c_1, Chapter 1, ""Bingley."")",13,1,,c_1,Chapter 1,"""Bingley.""",R
14,14,"(14, 2, , c_1, Chapter 1, ""Is he married or single?"")",14,2,,c_1,Chapter 1,"""Is he married or single?""",R
15,15,"(15, 1, , c_1, Chapter 1, ""Oh! Single, my dear, to be sure! A single man of large fortune; four or five thousand a year. What a fine thing for our girls!"")",15,1,,c_1,Chapter 1,"""Oh! Single, my dear, to be sure! A single man of large fortune; four or five thousand a year. What a fine thing for our girls!""",R
16,16,"(16, 2, , c_1, Chapter 1, ""How so? How can it affect them?"")",16,2,,c_1,Chapter 1,"""How so? How can it affect them?""",R


In [289]:
df_final_1 = df_final_1[['sent_index','tokenized_sent','label']]

In [290]:
df_final = pd.merge(df_final, df_final_1, on='sent_index', how='left')

In [291]:
df_final.shape

(2119, 11)

In [292]:
df_final.head(100)

Unnamed: 0,sent_index,tag,para_index,position_tag,speaker_tag,type_tag,chapter_tag,tokenized_sent_x,label_x,tokenized_sent_y,label_y
0,0,"(0, n, , narrative, , By Jane Austen)",0,n,,narrative,,By Jane Austen,O,,
1,1,"(1, n, , narrative, Chapter 1, Chapter 1)",1,n,,narrative,Chapter 1,Chapter 1,O,,
2,2,"(2, n, , narrative, Chapter 1, It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.)",2,n,,narrative,Chapter 1,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",O,,
3,3,"(3, n, , narrative, Chapter 1, However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.)",3,n,,narrative,Chapter 1,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",O,,
4,4,"(4, 1, mother, c_1, Chapter 1, ""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?"")",4,1,mother,c_1,Chapter 1,"""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?""",O,"""My dear Mr Bennet,"" * ""have you heard that Netherfield Park is let at last?""",O
5,5,"(5, n, , narrative, Chapter 1, Mr Bennet replied that he had not.)",5,n,,narrative,Chapter 1,Mr Bennet replied that he had not.,O,,
6,6,"(6, 1, mother, c_1, Chapter 1, ""But it is,"" returned she; ""for Mrs Long has just been here, and she told me all about it."")",6,1,mother,c_1,Chapter 1,"""But it is,"" returned she; ""for Mrs Long has just been here, and she told me all about it.""",O,"""But it is,"" * ""for Mrs Long has just been here, and she told me all about it.""",P
7,7,"(7, n, , narrative, Chapter 1, Mr Bennet made no answer.)",7,n,,narrative,Chapter 1,Mr Bennet made no answer.,O,,
8,8,"(8, 1, mother, c_1, Chapter 1, ""Do you not want to know who has taken it?"" cried his wife impatiently.)",8,1,mother,c_1,Chapter 1,"""Do you not want to know who has taken it?"" cried his wife impatiently.",O,"""Do you not want to know who has taken it?"" *",P
9,9,"(9, 2, father, c_1, Chapter 1, ""You want to tell me, and I have no objection to hearing it."")",9,2,father,c_1,Chapter 1,"""You want to tell me, and I have no objection to hearing it.""",R,"""You want to tell me, and I have no objection to hearing it.""",R


In [293]:
df_final['label'] = df_final.apply(lambda row: row['label_y'] if row['label_y']=='P' else row['label_x'], axis=1)

In [294]:
df_final.head(100)

Unnamed: 0,sent_index,tag,para_index,position_tag,speaker_tag,type_tag,chapter_tag,tokenized_sent_x,label_x,tokenized_sent_y,label_y,label
0,0,"(0, n, , narrative, , By Jane Austen)",0,n,,narrative,,By Jane Austen,O,,,O
1,1,"(1, n, , narrative, Chapter 1, Chapter 1)",1,n,,narrative,Chapter 1,Chapter 1,O,,,O
2,2,"(2, n, , narrative, Chapter 1, It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.)",2,n,,narrative,Chapter 1,"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",O,,,O
3,3,"(3, n, , narrative, Chapter 1, However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.)",3,n,,narrative,Chapter 1,"However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters.",O,,,O
4,4,"(4, 1, mother, c_1, Chapter 1, ""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?"")",4,1,mother,c_1,Chapter 1,"""My dear Mr Bennet,"" said his lady to him one day, ""have you heard that Netherfield Park is let at last?""",O,"""My dear Mr Bennet,"" * ""have you heard that Netherfield Park is let at last?""",O,O
5,5,"(5, n, , narrative, Chapter 1, Mr Bennet replied that he had not.)",5,n,,narrative,Chapter 1,Mr Bennet replied that he had not.,O,,,O
6,6,"(6, 1, mother, c_1, Chapter 1, ""But it is,"" returned she; ""for Mrs Long has just been here, and she told me all about it."")",6,1,mother,c_1,Chapter 1,"""But it is,"" returned she; ""for Mrs Long has just been here, and she told me all about it.""",O,"""But it is,"" * ""for Mrs Long has just been here, and she told me all about it.""",P,P
7,7,"(7, n, , narrative, Chapter 1, Mr Bennet made no answer.)",7,n,,narrative,Chapter 1,Mr Bennet made no answer.,O,,,O
8,8,"(8, 1, mother, c_1, Chapter 1, ""Do you not want to know who has taken it?"" cried his wife impatiently.)",8,1,mother,c_1,Chapter 1,"""Do you not want to know who has taken it?"" cried his wife impatiently.",O,"""Do you not want to know who has taken it?"" *",P,P
9,9,"(9, 2, father, c_1, Chapter 1, ""You want to tell me, and I have no objection to hearing it."")",9,2,father,c_1,Chapter 1,"""You want to tell me, and I have no objection to hearing it.""",R,"""You want to tell me, and I have no objection to hearing it.""",R,R


In [304]:
df_final_utter = df_final[pd.notnull(df_final["tokenized_sent_y"]) ]

In [305]:
def tokenize_and_rm_wp(text_string):
    from nltk.tokenize import word_tokenize
    
    text_string = re.sub("\s+", ' ',text_string, flags=re.UNICODE)
    text_string = text_string.replace('``', '"')
    text_string = text_string.replace("''", '"')
#     return text_string.split(' ')
    tokens = word_tokenize(text_string)
    clean_tokens = [x.replace('``', '"').replace("''", '"') for x in tokens]
    return clean_tokens

In [306]:
df_final.loc[6]['tokenized_sent_y']

'"But it is," * "for Mrs Long has just been here, and she told       me all about it."'

In [307]:
tokenize_and_rm_wp(df_final.loc[6]['tokenized_sent_y'])

['"',
 'But',
 'it',
 'is',
 ',',
 '"',
 '*',
 '"',
 'for',
 'Mrs',
 'Long',
 'has',
 'just',
 'been',
 'here',
 ',',
 'and',
 'she',
 'told',
 'me',
 'all',
 'about',
 'it',
 '.',
 '"']

In [311]:
list(range(1, df_final_utter.shape[0]+1))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [312]:
df_final_utter['new_sent_index'] = list(range(1, df_final_utter.shape[0]+1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [314]:

tokenized_list = []
for i in df_final_utter.index:
    token_list = tokenize_and_rm_wp(df_final_utter.loc[i]['tokenized_sent_y'])
    label_i_want = df_final_utter.loc[i]['label']
    chapter_tag = df_final_utter.loc[i]['chapter_tag']
    sent_index = df_final_utter.loc[i]['new_sent_index']
    for _, x in enumerate(token_list):
        if label_i_want != 'O':
            if _ == 0:
                tokenized_list.append((sent_index,x, "B-"+label_i_want, chapter_tag))
            else:
                tokenized_list.append((sent_index, x, "I-"+label_i_want, chapter_tag))
        else:
            tokenized_list.append((sent_index, x, label_i_want, chapter_tag))

In [315]:
len(tokenized_list)

62619

In [316]:
tokenized_list[0:100]

[(1, '"', 'O', 'Chapter 1'),
 (1, 'My', 'O', 'Chapter 1'),
 (1, 'dear', 'O', 'Chapter 1'),
 (1, 'Mr', 'O', 'Chapter 1'),
 (1, 'Bennet', 'O', 'Chapter 1'),
 (1, ',', 'O', 'Chapter 1'),
 (1, '"', 'O', 'Chapter 1'),
 (1, '*', 'O', 'Chapter 1'),
 (1, '"', 'O', 'Chapter 1'),
 (1, 'have', 'O', 'Chapter 1'),
 (1, 'you', 'O', 'Chapter 1'),
 (1, 'heard', 'O', 'Chapter 1'),
 (1, 'that', 'O', 'Chapter 1'),
 (1, 'Netherfield', 'O', 'Chapter 1'),
 (1, 'Park', 'O', 'Chapter 1'),
 (1, 'is', 'O', 'Chapter 1'),
 (1, 'let', 'O', 'Chapter 1'),
 (1, 'at', 'O', 'Chapter 1'),
 (1, 'last', 'O', 'Chapter 1'),
 (1, '?', 'O', 'Chapter 1'),
 (1, '"', 'O', 'Chapter 1'),
 (2, '"', 'B-P', 'Chapter 1'),
 (2, 'But', 'I-P', 'Chapter 1'),
 (2, 'it', 'I-P', 'Chapter 1'),
 (2, 'is', 'I-P', 'Chapter 1'),
 (2, ',', 'I-P', 'Chapter 1'),
 (2, '"', 'I-P', 'Chapter 1'),
 (2, '*', 'I-P', 'Chapter 1'),
 (2, '"', 'I-P', 'Chapter 1'),
 (2, 'for', 'I-P', 'Chapter 1'),
 (2, 'Mrs', 'I-P', 'Chapter 1'),
 (2, 'Long', 'I-P', 'Chapter 1'

In [320]:
df_tokenized_iob = pd.DataFrame()

In [321]:
df_tokenized_iob['sent_index'] = [x[0] for x in tokenized_list]
df_tokenized_iob['token'] = [x[1] for x in tokenized_list]

In [322]:
df_tokenized_iob['label'] = [x[2] for x in tokenized_list]
df_tokenized_iob['chapter_tag'] = [x[3] for x in tokenized_list]

In [323]:
df_tokenized_iob.head()

Unnamed: 0,sent_index,token,label,chapter_tag
0,1,"""",O,Chapter 1
1,1,My,O,Chapter 1
2,1,dear,O,Chapter 1
3,1,Mr,O,Chapter 1
4,1,Bennet,O,Chapter 1


chapters  19–26  asthe  test  set,  chapters  27–33  as  the  developmentset, and the remaining 46 chapters as the trainingset

In [324]:
test_set_chapters = ['Chapter {}'.format(x) for x in range(19,27)]
validation_set_chapters = ['Chapter {}'.format(x) for x in range(27,34)]


In [325]:
df_test = df_tokenized_iob[df_tokenized_iob['chapter_tag'].apply(lambda x: True if x in test_set_chapters else False)]

In [326]:
df_validation = df_tokenized_iob[df_tokenized_iob['chapter_tag'].apply(lambda x: True if x in validation_set_chapters else False)]

In [327]:
def custom_train_test_split(field):
    if field in test_set_chapters:
        return 'test'
    elif field in validation_set_chapters:
        return 'validation'
    else:
        return 'train'

In [328]:
df_tokenized_iob['split_tag'] = df_tokenized_iob['chapter_tag'].apply(lambda x: custom_train_test_split(x))

In [329]:
df_tokenized_iob['split_tag'].value_counts()

train         48615
test          8929 
validation    5075 
Name: split_tag, dtype: int64

In [1423]:
# df_final['tokenized_sent_clean'] = df_final['tokenized_sent'].apply(lambda x: re.sub('"', '', x))

In [330]:
df_tokenized_iob.head(50)

Unnamed: 0,sent_index,token,label,chapter_tag,split_tag
0,1,"""",O,Chapter 1,train
1,1,My,O,Chapter 1,train
2,1,dear,O,Chapter 1,train
3,1,Mr,O,Chapter 1,train
4,1,Bennet,O,Chapter 1,train
5,1,",",O,Chapter 1,train
6,1,"""",O,Chapter 1,train
7,1,*,O,Chapter 1,train
8,1,"""",O,Chapter 1,train
9,1,have,O,Chapter 1,train


In [331]:
df_tokenized_iob.to_csv('../../data/parsed-n-labeled-data/iob-labeled-tokens-final-010519.csv')