# Add quote context column

In [1]:
# Load data with lxml
from lxml import etree
pp_fpath = '/home/mamille2/storyq/quoteli/pp_full.xml'
# Parse the annotated XML file and keep a handle on its root element.
tree = etree.parse(pp_fpath)
root = tree.getroot()
# The root's second child is the container searched for <quote> elements in
# later cells; presumably it holds the annotated novel text -- confirm against
# the file, since the lookup is positional rather than by tag name.
text = root[1]

In [2]:
# Load table of quotes
import pandas as pd
quote_path = '/home/mamille2/storyq/quoteli/pride_prejudice_quotes.csv'
# One row per quote (columns shown in the output: quote_id, speaker, quote).
# NOTE: the "Save out" cell at the end of this section writes back to this same
# path, so re-running the notebook reads the already-augmented file.
quote_df = pd.read_csv(quote_path)
quote_df

Unnamed: 0,quote_id,speaker,quote
0,s0,Mrs_Bennet,"""My dear Mr. Bennet,"""
1,s1,Mrs_Bennet,"""have you heard that Netherfield Park is let a..."
2,s2,Mrs_Bennet,"""But it is,"""
3,s3,Mrs_Bennet,"""for Mrs. Long has just been here, and she tol..."
4,s4,Mrs_Bennet,"""Do not you want to know who has taken it?"""
...,...,...,...
1570,s2350,Mr_Darcy,"""I am more likely to want more time than coura..."
1571,s2351,Elizabeth_Bennet,"""And if I had not a letter to write myself, I ..."
1572,s2352,Elizabeth_Bennet,"""I would have thanked you before, my dear aunt..."
1573,s2353,Mr_Bennet,"""DEAR SIR,\n\nI must trouble you once more for..."


In [59]:
import pdb
import re

def get_text_before(element):
    """Return the last sentence of the text that precedes a quote element.

    The text is the span between the nearest preceding ``quote`` sibling (or
    the start of the parent element when ``element`` is the first quote) and
    ``element`` itself. It is assembled from the preceding quote's tail (or
    the parent's leading text) followed by the text and tail of every
    ``mention`` sibling lying in between.

    Args:
        element: an lxml element (must support ``.xpath()`` and
            ``.getparent()``) whose preceding context is wanted.

    Returns:
        str: the final sentence of that span as computed by ``last_sentence``.
    """
    # Relies on lxml returning XPath node-sets in document order -- the same
    # assumption the ``[-1]`` lookup of the nearest preceding quote makes.
    mentions_before = element.xpath("preceding-sibling::mention")
    preceding_quotes = element.xpath("preceding-sibling::quote")
    if preceding_quotes:
        preceding_quote = preceding_quotes[-1]
        # The set is used for membership tests only. Iterating a set (as the
        # intersection did before) yields the mentions in arbitrary order and
        # can scramble the reconstructed text.
        after_previous_quote = set(preceding_quote.xpath("following-sibling::mention"))
        mentions_between = [m for m in mentions_before if m in after_previous_quote]
        lead_text = preceding_quote.tail
    else:  # first quote
        mentions_between = mentions_before
        lead_text = element.getparent().text
    # .text / .tail are None when there is no text at that position.
    text_before = (lead_text or '') + ' '.join(
        (m.text or '') + (m.tail or '') for m in mentions_between)
    return last_sentence(text_before.strip())

def get_text_after(element):
    """Return the first sentence of the text that follows a quote element.

    The text is the span between ``element`` and the next ``quote`` sibling
    (or the end of the sibling list when ``element`` is the last quote). It is
    assembled from ``element``'s tail followed by the text and tail of every
    ``mention`` sibling lying in between.

    Args:
        element: an lxml element (must support ``.xpath()``) whose following
            context is wanted.

    Returns:
        str: the first sentence of that span as computed by ``first_sentence``.
    """
    # Relies on lxml returning XPath node-sets in document order -- the same
    # assumption the ``[0]`` lookup of the nearest following quote makes.
    mentions_after = element.xpath("following-sibling::mention")
    following_quotes = element.xpath("following-sibling::quote")
    if following_quotes:
        following_quote = following_quotes[0]
        # The set is used for membership tests only. Iterating a set (as the
        # intersection did before) yields the mentions in arbitrary order and
        # can scramble the reconstructed text.
        before_next_quote = set(following_quote.xpath("preceding-sibling::mention"))
        mentions_between = [m for m in mentions_after if m in before_next_quote]
    else:  # last quote
        mentions_between = mentions_after
    # .text / .tail are None when there is no text at that position.
    text_after = (element.tail or '') + ' '.join(
        (m.text or '') + (m.tail or '') for m in mentions_between)
    return first_sentence(text_after.strip())

# Sentence boundary: '.', '?' or '!' followed by whitespace or end of text,
# unless the punctuation directly follows the title "Mr", "Ms" or "Mrs".
_SENTENCE_END = re.compile(r'(?<!\bMr|\bMs)(?<!Mrs)[\.\?!](?:\s|$)')


def _split_sentences(text):
    """Split ``text`` into cleaned, non-empty sentences (terminators removed)."""
    cleaned = (
        # Turn line breaks into a single space so that words separated only
        # by a newline are not glued together.
        re.sub(r'\s*\n\s*', ' ', piece).strip()
        for piece in _SENTENCE_END.split(text)
    )
    # Drop empty and whitespace-only fragments (e.g. after a trailing period).
    return [piece for piece in cleaned if piece]


def first_sentence(text):
    """Return the first sentence of ``text`` without its terminator.

    Args:
        text (str): text to split.

    Returns:
        str: the first non-empty sentence, or ``''`` if there is none.
    """
    sentences = _split_sentences(text)
    return sentences[0] if sentences else ''


def last_sentence(text):
    """Return the last sentence of ``text`` without its terminator.

    Args:
        text (str): text to split.

    Returns:
        str: the last non-empty sentence, or ``''`` if there is none.
    """
    sentences = _split_sentences(text)
    return sentences[-1] if sentences else ''

In [60]:
# Sanity check: the period in "Mrs." should not be treated as a sentence boundary.
last_sentence("Elizabeth had never yet answered Mrs. Gardiner's long letter")

"Elizabeth had never yet answered Mrs. Gardiner's long letter"

In [62]:
# Find context of quotes
quote_id = 's2359'
# quote_id = 's0'
# Look up the <quote> child of `text` whose id attribute equals quote_id.
quote = text.find(f"quote[@id='{quote_id}']")
# print(quote.text)
# print(quote.tail)

# Print the sentence found just before the quote and the one just after it,
# separated by a line of asterisks.
print(get_text_before(quote))
print('\n***************************\n')
print(get_text_after(quote))

The letter was to this effect:

***************************

As it happened that Elizabeth had _much_ rather not, she endeavoured in her answer to put an end to every intreaty and expectation of the kind


In [50]:
# Remove pronouns and characters from the text
import re

# Words removed from the context text by remove_pronouns_names:
# third-person pronouns, ...
pronouns = ['he', 'him', 'his', 'she', 'her',  'hers']
# ... titles and relationship words, ...
stops = ['Miss', "Mr", 'lady', 'wife', 'husband']
# ... and every ';'-separated entry of each character's `aliases` attribute,
# flattened into a single list.
character_aliases = [alias for aliases in root.xpath('./characters/character/@aliases') for alias in aliases.split(';')]

def remove_pronouns_names(text, remove_list=None):
    """Delete pronouns, stop words and character aliases from ``text``.

    Each entry is matched literally and case-insensitively as a whole
    word/phrase and replaced by the empty string; surrounding whitespace and
    punctuation are left untouched. Entries are removed one after another in
    list order.

    Args:
        text (str): text to clean.
        remove_list (list[str] | None): entries to delete. Defaults to the
            module-level ``pronouns + stops + character_aliases``.

    Returns:
        str: ``text`` with every occurrence of every entry removed.
    """
    if remove_list is None:
        remove_list = pronouns + stops + character_aliases
    new_text = text
    for w in remove_list:
        if not w:
            # Empty entries (e.g. from a trailing ';' in an alias list) match nothing useful.
            continue
        # re.escape: aliases are literal strings, not patterns. Unescaped,
        # "Mr. Long" would also delete "Mrs Long" and "(x)" would be a group.
        # The lookarounds behave like \b for entries that start and end with a
        # word character, and still work for entries that start or end with
        # punctuation.
        pattern = r'(?<!\w){}(?!\w)'.format(re.escape(w))
        new_text = re.sub(pattern, '', new_text, flags=re.IGNORECASE)
    return new_text

In [44]:
# Add context columns
def context(quote_id):
    """Return the cleaned text immediately before and after a quote.

    Args:
        quote_id (str): value of the ``id`` attribute of a ``quote`` child of
            the module-level ``text`` element.

    Returns:
        tuple[str, str]: ``(left_context, right_context)`` -- the sentence
        before and the sentence after the quote, each passed through
        ``remove_pronouns_names``.

    Raises:
        ValueError: if ``text`` has no ``quote`` child with that id.
    """
    quote = text.find(f"quote[@id='{quote_id}']")
    # Compare with None explicitly: an lxml element without children is falsy.
    if quote is None:
        raise ValueError(f"No quote with id {quote_id!r} found in the annotated text")
    left_context = remove_pronouns_names(get_text_before(quote))
    right_context = remove_pronouns_names(get_text_after(quote))
    return left_context, right_context

In [61]:
from tqdm.notebook import tqdm
# Compute (left, right) context for every quote id, with a progress bar.
quote_ids = quote_df['quote_id'].tolist()
contexts = [context(q) for q in tqdm(quote_ids)]
# Transpose the list of pairs into one sequence per column.
left_contexts, right_contexts = zip(*contexts)
quote_df['left_context'] = left_contexts
quote_df['right_context'] = right_contexts
quote_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1575.0), HTML(value='')))




Unnamed: 0,quote_id,speaker,quote,left_context,right_context
0,s0,Mrs_Bennet,"""My dear Mr. Bennet,""",However little known the feelings or views of ...,"said to one day,"
1,s1,Mrs_Bennet,"""have you heard that Netherfield Park is let a...","said to one day,",. replied that had not
2,s2,Mrs_Bennet,"""But it is,""",. replied that had not,returned ;
3,s3,Mrs_Bennet,"""for Mrs. Long has just been here, and she tol...",returned ;,. made no answer
4,s4,Mrs_Bennet,"""Do not you want to know who has taken it?""",. made no answer,cried impatiently
...,...,...,...,...,...
1570,s2350,Mr_Darcy,"""I am more likely to want more time than coura...",,
1571,s2351,Elizabeth_Bennet,"""And if I had not a letter to write myself, I ...",,From an unwillingness to confess how much int...
1572,s2352,Elizabeth_Bennet,"""I would have thanked you before, my dear aunt...",From an unwillingness to confess how much int...,. 's letter to was in a different style; and...
1573,s2353,Mr_Bennet,"""DEAR SIR,\n\nI must trouble you once more for...",. 's letter to was in a different style; and...,"'s congratulations to brother, on approachi..."


In [64]:
# Reduce number of characters (naively)
# Start by listing the speaker labels, ordered by how many quotes each one has.
quote_df['speaker'].value_counts().index

Index(['Elizabeth_Bennet', 'Mrs_Bennet', 'Mr_Darcy', 'Mr_Bennet',
       'Jane_Bennet', 'Caroline_Bingley', 'Lady_Catherine', 'Mr_Bingley',
       'Mr_Wickham', 'Lydia_Bennet', 'Mr_Collins', 'Mrs_Gardiner',
       'Colonel_Fitzwilliam', 'Charlotte', 'Mr_Gardiner', 'Mrs_Reynolds',
       'Sir_William', 'Kitty_Bennet', 'Mary_Bennet', 'Louisa_Hurst',
       'Maria_Lucas', 'Kitty_and_Lydia', 'maria', 'Mrs_Hill', 'NOTANUTTERANCE',
       'a_young_male_Lucas', 'Mr_Hurst', 'Mrs_Phillips', 'one_of_the_girls',
       'The_Butler', 'Mr_Denny', 'Catherine_and_Lydia', 'Elizabeth_and_Jane',
       'UNSURE'],
      dtype='object')

In [66]:
# The five speakers with the most quotes
speaker_counts = quote_df['speaker'].value_counts()
top_chars = speaker_counts.index[:5]
top_chars

Index(['Elizabeth_Bennet', 'Mrs_Bennet', 'Mr_Darcy', 'Mr_Bennet',
       'Jane_Bennet'],
      dtype='object')

In [67]:
# Keep only the rows spoken by one of the top characters.
is_top_speaker = quote_df['speaker'].isin(top_chars)
reduced_quote_df = quote_df.loc[is_top_speaker]
reduced_quote_df

Unnamed: 0,quote_id,speaker,quote,left_context,right_context
0,s0,Mrs_Bennet,"""My dear Mr. Bennet,""",However little known the feelings or views of ...,"said to one day,"
1,s1,Mrs_Bennet,"""have you heard that Netherfield Park is let a...","said to one day,",. replied that had not
2,s2,Mrs_Bennet,"""But it is,""",. replied that had not,returned ;
3,s3,Mrs_Bennet,"""for Mrs. Long has just been here, and she tol...",returned ;,. made no answer
4,s4,Mrs_Bennet,"""Do not you want to know who has taken it?""",. made no answer,cried impatiently
...,...,...,...,...,...
1569,s2349,Elizabeth_Bennet,"""Shall you ever have courage to announce to La...",,
1570,s2350,Mr_Darcy,"""I am more likely to want more time than coura...",,
1571,s2351,Elizabeth_Bennet,"""And if I had not a letter to write myself, I ...",,From an unwillingness to confess how much int...
1572,s2352,Elizabeth_Bennet,"""I would have thanked you before, my dear aunt...",From an unwillingness to confess how much int...,. 's letter to was in a different style; and...


In [69]:
# Save out
import os
quoteli_dirpath = '/home/mamille2/storyq/quoteli/'
# NOTE: this resolves to the same file the quotes were loaded from at the top
# of this section, so the input CSV is overwritten with the version that has
# the two context columns added.
quote_path = os.path.join(quoteli_dirpath, 'pride_prejudice_quotes.csv')
quote_df.to_csv(quote_path, index=False)

# Separate file containing only the rows for the top-5 speakers.
reduced_quote_path = os.path.join(quoteli_dirpath, 'pride_prejudice_top5chars_quotes.csv')
reduced_quote_df.to_csv(reduced_quote_path, index=False)

# Verify number of quotes from Pride and Prejudice

In [2]:
# Load extracted table of quotes
import pandas as pd

path = '/home/mamille2/storyq/quoteli/pride_prejudice_quotes.csv'
# Read the CSV back in; its row count is compared by eye with the tag count below.
data = pd.read_csv(path)
data

Unnamed: 0,quote_id,speaker,quote
0,s0,Mrs_Bennet,"""My dear Mr. Bennet,"""
1,s1,Mrs_Bennet,"""have you heard that Netherfield Park is let a..."
2,s2,Mrs_Bennet,"""But it is,"""
3,s3,Mrs_Bennet,"""for Mrs. Long has just been here, and she tol..."
4,s4,Mrs_Bennet,"""Do not you want to know who has taken it?"""
...,...,...,...
1570,s2350,Mr_Darcy,"""I am more likely to want more time than coura..."
1571,s2351,Elizabeth_Bennet,"""And if I had not a letter to write myself, I ..."
1572,s2352,Elizabeth_Bennet,"""I would have thanked you before, my dear aunt..."
1573,s2353,Mr_Bennet,"""DEAR SIR,\n\nI must trouble you once more for..."


In [3]:
# Load quoteli annotations
pp_fpath = '/home/mamille2/storyq/quoteli/pp_full.xml'
# Read the XML as one raw string (no encoding is passed, so the platform
# default is used). Here `text` is a str, not a parsed element.
with open(pp_fpath, 'r') as f:
    text = f.read()
    
# Count closing </quote> tags as an independent check on the number of quotes.
text.count('</quote>')

1575

# Extract table of quotes from Pride and Prejudice

In [2]:
import xml.etree.ElementTree as ET

pp_fpath = '/home/mamille2/storyq/quoteli/pp_full.xml'
# Parse the annotated XML with the standard-library parser and display the root element.
tree = ET.parse(pp_fpath)
root = tree.getroot()
root

<Element 'doc' at 0x7f0fa5064bf0>

In [9]:
import pandas as pd

# The root's second child is the element whose direct children include the <quote> elements.
text = root[1]
quotes = text.findall('quote')
# One record per quote: its id, its annotated speaker and the quoted text.
outlines = [[q.attrib['id'], q.attrib['speaker'], q.text] for q in quotes]

df = pd.DataFrame(outlines, columns=['quote_id', 'speaker', 'quote'])
df

Unnamed: 0,quote_id,speaker,quote
0,s0,Mrs_Bennet,"""My dear Mr. Bennet,"""
1,s1,Mrs_Bennet,"""have you heard that Netherfield Park is let a..."
2,s2,Mrs_Bennet,"""But it is,"""
3,s3,Mrs_Bennet,"""for Mrs. Long has just been here, and she tol..."
4,s4,Mrs_Bennet,"""Do not you want to know who has taken it?"""
...,...,...,...
1570,s2350,Mr_Darcy,"""I am more likely to want more time than coura..."
1571,s2351,Elizabeth_Bennet,"""And if I had not a letter to write myself, I ..."
1572,s2352,Elizabeth_Bennet,"""I would have thanked you before, my dear aunt..."
1573,s2353,Mr_Bennet,"""DEAR SIR,\n\nI must trouble you once more for..."


In [10]:
# Save out
outpath = '/home/mamille2/storyq/quoteli/pride_prejudice_quotes.csv'
# index=False: write only the quote_id, speaker and quote columns, without the row index.
df.to_csv(outpath, index=False)

# Old