### Dataset preprocessing
Process the text to extract utterances and non-utterances and match the samples with the labelled dataset

In [1]:
import re
import pickle

#### Book utterances and non-utterances extraction

In [2]:
from src.character import characters
from src.curation import curation

curation(characters)

In [3]:
with open('corpus/curated_text.txt', 'r+') as raw_text_file:
    # go through all lines in the book
    text = raw_text_file.read()

In [4]:
annotations = []
is_utterance = False
processed = ""
source = ""
sample_parts = []
text = re.sub(' +', ' ', " "+text) 
parts = list(p for p in re.split("(``)|('')", text) if p is not None)
i = 0
next_quote_doesnt_count = False
while i < len(parts):
    part = parts[i]
    if part == '``' or part == "''":
        is_utterance = part == '``'
        source += part
        i += 1
        continue
    if not is_utterance:
        if "\n\n" in part: # before or after an utterance
            lines = part.split("\n\n")
            if processed != "":
                if lines[0] != "":
                    sample_parts.append({"text": lines[0], "utterance": False})
                source += lines[0]
                if processed[-5:] == " [X] ":
                    processed = processed[:-5]
                if processed != "":
                    annotations.append({
                        "only_utterance_us": processed,
                        "source": source,
                        "parts": sample_parts
                    })
            processed = ""
            if lines[-1] != "":
                sample_parts = [({"text": lines[-1], "utterance": False})]
            else:
                sample_parts = []
            source = lines[-1]
        else: # in the middle of an utterance
            sample_parts.append({"text": part, "utterance": False})
            source += part
            if part != " -- ":
                if processed[-5:] != " [X] ":
                    processed += " [X] "
            else:
                processed += " "
    else:
        sample_parts.append({"text": part, "utterance": True})
        monoline = " ".join(part.split("\n\n"))
        processed += monoline
        source += monoline
    i += 1

#### Match the annotated dataset with the re-processed dataset

In [5]:
processed_to_index = {annotation["only_utterance_us"]: i for i, annotation in reversed(list(enumerate(annotations)))}

In [6]:
def strip_equal(a, b, l):
    return re.sub(r'(\[X\])|\s', '', a)[:l] == re.sub(r'(\[X\])|\s', '', b)[:l]

In [7]:
with open('corpus/curated_dialogs.txt') as annoted_text_file:
    annotated_text_lines = annoted_text_file.readlines()
    for annoted_line in annotated_text_lines:
        annotation_i, label, utterance = annoted_line.split('\t')
        utterance = re.sub('\s+', ' ', utterance.strip())
        if utterance in processed_to_index and "target" not in annotations[processed_to_index[utterance]]:
            annotation = annotations[processed_to_index[utterance]]
        else:
            annotation = next((a for a in annotations if strip_equal(a['only_utterance_us'], utterance, 100) and "target" not in a), None)
            if annotation['only_utterance_us'] != utterance:
                print(annotation['only_utterance_us'])
                print("--")
        assert "target" not in annotation
        annotation["only_utterance_article"] = utterance
        annotation["target"] = label

My dear Elizabeth, I have the highest opinion in the world of your excellent judgment in all matters within the scope of your understanding, but permit me to say that there must be a wide difference between the established forms of ceremony amongst the laity, and those which regulate the clergy; for give me leave to observe that I consider the clerical office as equal in point of dignity with the highest rank in the kingdom -- provided that a proper humility of behaviour is at the same time maintained. You must therefore allow me to follow the dictates of my conscience on this occasion, which leads me to perform what I look on as a point of duty. Pardon me for neglecting to profit by your advice, which on every other subject shall be my constant guide, though in the case before us I consider myself more fitted by education and habitual study to decide on what is right than a young lady like yourself. [X] apology, [X] Hunsford, [X] Lady Catherine de Bourgh.
--
delightful, [X] charming,


In [8]:
for a in annotations:
    if "target" not in a:
        print(a)

#### Stanford parser annotations

Load the stanford parser

In [9]:
import nltk
from nltk.parse import stanford

jar = 'stanford-parser-full-2017-06-09/stanford-parser.jar'
model = 'stanford-parser-full-2017-06-09/stanford-english-corenlp-2017-06-09-models.jar'

dep_parser = stanford.StanfordDependencyParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')

Add standford parser annotations like name or gender

In [10]:
#######################################################
# Stanford parser rules: triples -> name, gender, etc #
#######################################################
def extract_features(speaker_name, speaker_gender, triple):
    (word1, tag1), dep, (word2, tag2) = triple
    if tag1.startswith('VB') and (tag2.startswith('NN') or tag2.startswith('PRP')):
        if tag2.startswith('NNP'):
            speaker_name = word2
        if word2 in ['he', 'husband', 'man', 'father', 'son']:
            speaker_gender = 'M'
        if word2 in ['she', 'wife', 'lady', 'mother', 'daughter']:
            speaker_gender = 'F'
    return (speaker_name, speaker_gender)


for sample in annotations:
    for part in sample["parts"]:
        if not part["utterance"]:
            speaker_name = None
            speaker_gender = None
            tokens = nltk.word_tokenize(part["text"][:200])
            tagged = nltk.pos_tag(tokens)
            try:
                dependencies = sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents([tagged])],[])
                #print(dependencies)
                for (term1,dep,term2) in dependencies[0]:
                    speaker_name, speaker_gender = extract_features(speaker_name, speaker_gender, (term1,dep,term2) )
                    # try reverse order
                    speaker_name, speaker_gender =  extract_features(speaker_name, speaker_gender, (term2,dep,term1) )
                    break
            except Exception as e:
                    print(e)
                    print(part)
            part["speaker_name"] = speaker_name
            part["speaker_gender"] = speaker_gender

'NoneType' object is not subscriptable
{'text': ' -- ', 'utterance': False}
'NoneType' object is not subscriptable
{'text': ' -- ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
'NoneType' object is not subscriptable
{'text': ' --', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}
list index out of range
{'text': ' ', 'utterance': False}


In [12]:
print(len(annotations))

1294


In [13]:
annotations[:20]

[{'only_utterance_article': 'My dear Bennet, [X] have you heard that Netherfield Park is let at last?',
  'only_utterance_us': 'My dear Bennet, [X] have you heard that Netherfield Park is let at last?',
  'parts': [{'text': 'My dear Bennet,', 'utterance': True},
   {'speaker_gender': 'F',
    'speaker_name': None,
    'text': ' said his lady to him one day, ',
    'utterance': False},
   {'text': 'have you heard that Netherfield Park is let at last?',
    'utterance': True}],
  'source': "``My dear Mr. Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
  'target': 'Mrs. Bennet'},
 {'only_utterance_article': 'But it is, [X] for Mrs. Long has just been here, and she told me all about it.',
  'only_utterance_us': 'But it is, [X] for Mrs. Long has just been here, and she told me all about it.',
  'parts': [{'text': 'But it is,', 'utterance': True},
   {'speaker_gender': 'F',
    'speaker_name': None,
    'text': ' returned she; ',
    'uttera

Dump the annotated dataset

In [14]:
pickle.dump(annotations, open("corpus/dataset.pkl", "wb"))

In [15]:
# To facilitate comparison with curated_dialogs.txt
with open('corpus/dataset.pkl', 'rb') as pick:
    with open('corpus/dataset.txt', 'w+') as text:
        annotations = pickle.load(pick)
        for a in annotations:
            parts = " ".join(re.sub(r'\n', ' ', part["text"]) for part in a["parts"])
            text.write(parts[:200] + "\n")