### Dataset preprocessing
Process the text to extract utterances and non-utterances and match the samples with the labelled dataset

In [1]:
import re
import pickle

In [2]:
from src.character import Character

characters = []
men = []
women = []

with open('corpus/PeopleList_Revised.txt', 'r+') as character_infos:
    for character_info in character_infos:
        character_info = re.sub('\n', '', character_info).split(';')
        aliases = []
        main_name = None
        gender = None
        for info in character_info:
            if (info in ['M', 'F']):
                gender = info
            else:
                if not main_name:
                    main_name = re.sub(r'[\W_]+' , '_', info)
                aliases.append(info)
        if gender == 'M':
            men.append(Character(main_name, gender, aliases))
        else:
            women.append(Character(main_name, gender, aliases))

# if a man and a woman have the same alias, keep only the man's alias 
# (Bennet is more likely a man than a woman)
men_aliases = sum((man.aliases for man in men), [])
for woman in women:
    for woman_alias in woman.aliases:
        if woman_alias in men_aliases:
            woman.aliases.remove(woman_alias)

# women first, because when we will replace the aliases in the text, 
# we do not want the "Bennet" in "Mrs Bennet" to be replaced by "Mr Bennet"           
characters = women + men

for character in characters:
    print(character)

['Mr. Bennet', 'Bennet', 'Mr. Bingley', 'Bingley', 'Captain Carter', 'Mr. Collins', 'William Collins', 'Mr. Chamberlayne', 'Dawson', 'Mr. Darcy', 'Mr. Fitzwilliam Darcy', 'Fitzwilliam Darcy', 'Darcy', 'Old Mr. Darcy', 'Colonel Fitzwilliam', 'Colonel F.', 'Colonel Forster', 'Mr. Gardiner', 'EDW. Gardiner', 'E. Gardiner', 'William Goulding', 'Haggerston', 'Mr. Jones', 'Mr. Hurst', 'Mr. Morris', 'Mr. Philips', 'Philips', 'Mr. Pratt', 'Pratt', 'Mr. Robinson', 'Mr. Stone', 'Old Mr. Wickham', 'Sir William', 'Sir William Lucas', 'Mr. Wickham', 'George Wickham', 'George', 'Wickham']
name: Mrs_Annesley
gender: F
aliases: ['Mrs. Annesley', 'Annesley']
name: Elizabeth_Bennet
gender: F
aliases: ['Elizabeth Bennet', 'Miss Elizabeth Bennet', 'Miss Elizabeth', 'Miss Lizzy', 'Miss Bennet', 'Miss Eliza', 'Eliza Bennet', 'Elizabeth', 'Lizzy', 'Liz', 'Eliza']
name: Jane_Bennet
gender: F
aliases: ['Jane Bennet', 'Jane']
name: Lydia_Bennet
gender: F
aliases: ['Lydia Bennet', 'Miss Lydia Bennet', 'Miss Lydi

#### Book utterances and non-utterances extraction

In [3]:
#from src.character import characters
from src.curation import curation

curation(characters)

In [4]:
with open('corpus/curated_text.txt', 'r+') as raw_text_file:
    # go through all lines in the book
    text = raw_text_file.read()

In [5]:
annotations = []
is_utterance = False
processed = ""
source = ""
sample_parts = []
text = re.sub(' +', ' ', " "+text) 
parts = list(p for p in re.split("(``)|('')", text) if p is not None)
i = 0
next_quote_doesnt_count = False
while i < len(parts):
    part = parts[i]
    if part == '``' or part == "''":
        is_utterance = part == '``'
        source += part
        i += 1
        continue
    if not is_utterance:
        if "\n\n" in part: # before or after an utterance
            lines = part.split("\n\n")
            if processed != "":
                if lines[0] != "":
                    sample_parts.append({"text": lines[0], "utterance": False})
                source += lines[0]
                if processed[-5:] == " [X] ":
                    processed = processed[:-5]
                if processed != "":
                    annotations.append({
                        "only_utterance_us": processed,
                        "source": source,
                        "parts": sample_parts
                    })
            processed = ""
            if lines[-1] != "":
                sample_parts = [({"text": lines[-1], "utterance": False})]
            else:
                sample_parts = []
            source = lines[-1]
        else: # in the middle of an utterance
            sample_parts.append({"text": part, "utterance": False})
            source += part
            if part != " -- ":
                if processed[-5:] != " [X] ":
                    processed += " [X] "
            else:
                processed += " "
    else:
        sample_parts.append({"text": part, "utterance": True})
        monoline = " ".join(part.split("\n\n"))
        processed += monoline
        source += monoline
    i += 1

#### Match the annotated dataset with the re-processed dataset

In [6]:
processed_to_index = {annotation["only_utterance_us"]: i for i, annotation in reversed(list(enumerate(annotations)))}

In [7]:
def strip_equal(a, b, l):
    return re.sub(r'(\[X\])|\s', '', a)[:l] == re.sub(r'(\[X\])|\s', '', b)[:l]

In [8]:
with open('corpus/curated_dialogs.txt') as annoted_text_file:
    annotated_text_lines = annoted_text_file.readlines()
    for annoted_line in annotated_text_lines:
        annotation_i, label, utterance = annoted_line.split('\t')
        utterance = re.sub('\s+', ' ', utterance.strip())
        if utterance in processed_to_index and "target" not in annotations[processed_to_index[utterance]]:
            annotation = annotations[processed_to_index[utterance]]
        else:
            annotation = next((a for a in annotations if strip_equal(a['only_utterance_us'], utterance, 100) and "target" not in a), None)
            if annotation['only_utterance_us'] != utterance:
                print(annotation['only_utterance_us'])
                print("--")
        assert "target" not in annotation
        annotation["only_utterance_article"] = utterance
        annotation["target"] = label

My dear Elizabeth_Bennet, I have the highest opinion in the world of your excellent judgment in all matters within the scope of your understanding, but permit me to say that there must be a wide difference between the established forms of ceremony amongst the laity, and those which regulate the clergy; for give me leave to observe that I consider the clerical office as equal in point of dignity with the highest rank in the kingdom -- provided that a proper humility of behaviour is at the same time maintained. You must therefore allow me to follow the dictates of my conscience on this occasion, which leads me to perform what I look on as a point of duty. Pardon me for neglecting to profit by your advice, which on every other subject shall be my constant guide, though in the case before us I consider myself more fitted by education and habitual study to decide on what is right than a young lady like yourself. [X] apology, [X] Hunsford, [X] Lady_Catherine de Anne_de_Bourgh.
--
delightful,

In [9]:
for a in annotations:
    if "target" not in a:
        print(a)

#### Stanford parser annotations

Load the stanford parser

In [12]:
import nltk
from nltk.parse import stanford

jar = 'stanford-parser-full-2017-06-09/stanford-parser.jar'
model = 'stanford-parser-full-2017-06-09/stanford-english-corenlp-2017-06-09-models.jar'

dep_parser = stanford.StanfordDependencyParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')

Add standford parser annotations like name or gender

In [13]:
#######################################################
# Stanford parser rules: triples -> name, gender, etc #
#######################################################
def extract_features(speaker_name, speaker_function, speaker_gender, triple):
    (word1, tag1), dep, (word2, tag2) = triple
    if (tag1.startswith('VBD') or word1 in ['said', 'added', 'cried', 'asked', 'replied', 'returned', 'continued', 'observed']) and (tag2.startswith('NN') or tag2.startswith('PRP')) and not dep.startswith('nmod'):
        if tag2.startswith('NN'):
            if tag2.startswith('NNP'):
                speaker_name = word2
            else:
                speaker_function = word2
        if word2 in ['he', 'man', 'boy', 'lad']:
            speaker_gender = 'M'
        if word2 in ['she', 'lady', 'girl']:
            speaker_gender = 'F'
    return (speaker_name, speaker_function, speaker_gender)


for sample in annotations:
    for part in sample["parts"]:
        if not part["utterance"]:
            speaker_name = None
            speaker_function = None
            speaker_gender = None
            tokens = nltk.word_tokenize(part["text"][:200])
            tagged = nltk.pos_tag(tokens)
            try:
                dependencies = sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents([tagged])],[])
                for (term1,dep,term2) in dependencies[0]:
                    #print(term1, dep, term2)
                    speaker_name, speaker_function, speaker_gender = extract_features(speaker_name, speaker_function, speaker_gender, (term1,dep,term2) )
                    # try reverse order
                    speaker_name, speaker_function, speaker_gender =  extract_features(speaker_name, speaker_function, speaker_gender, (term2,dep,term1) )
            except Exception as e:
                    print(e)
                    print(part)
            part["speaker_name"] = speaker_name
            part["speaker_function"] = speaker_function
            part["speaker_gender"] = speaker_gender
            #print(part)

'NoneType' object is not subscriptable
{'text': ' -- ', 'utterance': False}
'NoneType' object is not subscriptable
{'text': ' -- ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
'NoneType' object is not subscriptable
{'text': ' --', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}


In [14]:
print(len(annotations))

1294


In [15]:
annotations[:20]

[{'only_utterance_article': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
  'only_utterance_us': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
  'parts': [{'text': 'My dear Mr_Bennet,', 'utterance': True},
   {'speaker_function': 'lady',
    'speaker_gender': 'F',
    'speaker_name': None,
    'text': ' said his lady to him one day, ',
    'utterance': False},
   {'text': 'have you heard that Netherfield Park is let at last?',
    'utterance': True}],
  'source': "``My dear Mr_Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
  'target': 'Mrs_Bennet'},
 {'only_utterance_article': 'But it is, [X] for Mrs_Long has just been here, and she told me all about it.',
  'only_utterance_us': 'But it is, [X] for Mrs_Long has just been here, and she told me all about it.',
  'parts': [{'text': 'But it is,', 'utterance': True},
   {'speaker_function': None,
    'speaker_gender': 'F',
 

Dump the annotated dataset

In [16]:
pickle.dump(annotations, open("corpus/dataset.pkl", "wb"))

In [17]:
# To facilitate comparison with curated_dialogs.txt
with open('corpus/dataset.pkl', 'rb') as pick:
    with open('corpus/dataset.txt', 'w+') as text:
        annotations = pickle.load(pick)
        for a in annotations:
            parts = " ".join(re.sub(r'\n', ' ', part["text"]) for part in a["parts"])
            text.write(parts[:200] + "\n")

In [19]:
with open('corpus/dataset.pkl', 'rb') as pick:
    annotations2 = pickle.load(pick)
    print(annotations2[858])

{'only_utterance_us': 'for your housekeeper, [X] informed us that you would certainly not be here till to-morrow; and indeed, before we left Bakewell we understood that you were not immediately expected in the country. [X] They will join me early tomorrow, [X] and among them are some who will claim an acquaintance with you, -- Mr_Bingley and his sisters.', 'source': "After walking some time in this way, the two ladies in front, the two gentlemen behind, on resuming their places after descending to the brink of the river for the better inspection of some curious water-plant, there chanced to be a little alteration. It originated in Mrs_Gardiner, who, fatigued by the exercise of the morning, found Elizabeth_Bennet's arm inadequate to her support, and consequently preferred her husband's. Mr_Darcy took her place by her niece, and they walked on together. After a short silence, the lady first spoke. She wished him to know that she had been assured of his absence before she came to the plac