### Dataset preprocessing
Process the text to extract utterances and non-utterances, then match the resulting samples against the labelled dataset.

In [1]:
import re
import pickle

In [2]:
# Load the entire raw corpus into memory as a single string.
# Opened read-only: nothing is written back, and 'r+' would needlessly
# require write permission on the file.
with open('corpus/PRIDPREJ_NONEWLINE_Organize_v2.txt', 'r') as raw_text_file:
    text = raw_text_file.read()

In [3]:
# Split the corpus into samples. Text between a `` marker and the next ''
# marker is treated as an utterance; everything else is narration. A sample is
# closed when a narration segment containing a blank line ("\n\n") is met.
# `text` must already hold the corpus contents when this cell runs.
#
# State carried across loop iterations:
#   annotations  - finished samples (list of dicts)
#   is_utterance - True while the current segment follows a `` marker
#   processed    - utterance text of the current sample; narration inside the
#                  sample is replaced by the " [X] " placeholder
#   source       - text of the current sample including the quote markers
#   sample_parts - (segment, is_utterance) tuples of the current sample
annotations = []
is_utterance = False
processed = ""
source = ""
sample_parts = []
# Prefix one space, then collapse every run of spaces into a single space.
text = re.sub(' +', ' ', " "+text) 
# The capturing groups make re.split keep the `` / '' delimiters in the result;
# the group that did not take part in a match yields None, filtered out here.
parts = list(p for p in re.split("(``)|('')", text) if p is not None)
i = 0
# NOTE(review): this flag is never read anywhere in this cell.
next_quote_doesnt_count = False
while i < len(parts):
    part = parts[i]
    if part == '``' or part == "''":
        # A quote marker only toggles the state: `` opens an utterance,
        # '' closes it. The marker itself is kept in `source`.
        is_utterance = part == '``'
        source += part
        i += 1
        continue
    if not is_utterance:
        if "\n\n" in part: # before or after an utterance
            # Only the first and last blank-line-separated chunks are used:
            # lines[0] ends the current sample, lines[-1] starts the next one.
            # Any chunks in between are not stored anywhere.
            lines = part.split("\n\n")
            if processed != "":
                if lines[0] != "":
                    sample_parts.append((lines[0], False))
                source += lines[0]
                # Drop a trailing placeholder so the sample does not end in " [X] ".
                if processed[-5:] == " [X] ":
                    processed = processed[:-5]
                # Skip samples that held nothing but a placeholder.
                if processed != "":
                    annotations.append({
                        "only_utterance_us": processed,
                        "source": source,
                        "parts": sample_parts
                    })
            # Start a fresh sample. `sample_parts` is rebound to a new list, so
            # the list stored in the annotation above is not modified.
            processed = ""
            if lines[-1] != "":
                sample_parts = [(lines[-1], False)]
            else:
                sample_parts = []
            source = lines[-1]
        else: # in the middle of an utterance
            sample_parts.append((part, False))
            source += part
            if part != " -- ":
                # Narration between utterances becomes a single " [X] "
                # placeholder; consecutive placeholders are not duplicated.
                if processed[-5:] != " [X] ":
                    processed += " [X] "
            else:
                # A bare " -- " separator contributes only a space.
                processed += " "
    else:
        # Utterance segment: the raw text goes into `sample_parts`, while
        # `processed` and `source` get it with blank lines replaced by spaces.
        sample_parts.append((part, True))
        monoline = " ".join(part.split("\n\n"))
        processed += monoline
        source += monoline
    i += 1
# NOTE(review): samples are appended only inside the "\n\n" branch above, so
# utterances after the last blank-line narration segment are never emitted.
# Confirm this is intended for the corpus in use.

Match the annotated dataset with the re-processed dataset

In [4]:
# Map each processed utterance string to the index of its FIRST occurrence in
# `annotations`: indices are walked from last to first, so for duplicated
# strings the lowest index is written last and wins.
processed_to_index = dict(
    (annotations[position]["only_utterance_us"], position)
    for position in range(len(annotations) - 1, -1, -1)
)

In [5]:
def strip_equal(a, b, l):
    """Loosely compare two strings on a normalised prefix.

    Every whitespace character and every literal ``[X]`` marker is removed
    from both strings; the first ``l`` characters of what remains are then
    compared.

    Args:
        a: First string.
        b: Second string.
        l: Number of leading normalised characters to compare.

    Returns:
        True if the two normalised prefixes are identical, False otherwise.
    """
    def _normalise(value):
        # Remove "[X]" placeholders and all whitespace.
        return re.sub(r'(\[X\])|\s', '', value)

    prefix_a = _normalise(a)[:l]
    prefix_b = _normalise(b)[:l]
    return prefix_a == prefix_b

In [6]:
# Attach the label of every line in the annotated file to one re-processed
# sample. Each line is unpacked into exactly three tab-separated fields:
# an index, the label and the utterance text.
with open('corpus/REAL_ALL_CONTENTS_PP.txt') as annoted_text_file:
    annotated_text_lines = annoted_text_file.readlines()
    for annoted_line in annotated_text_lines:
        annotation_i, label, utterance = annoted_line.split('\t')
        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
        utterance = re.sub(r'\s+', ' ', utterance.strip())
        if utterance in processed_to_index and "target" not in annotations[processed_to_index[utterance]]:
            # Exact match on a sample that has not been labelled yet.
            annotation = annotations[processed_to_index[utterance]]
        else:
            # Fallback: first still-unlabelled sample whose first 100
            # characters agree once whitespace and [X] markers are ignored.
            annotation = next((a for a in annotations if strip_equal(a['only_utterance_us'], utterance, 100) and "target" not in a), None)
            if annotation is None:
                # Fail with a readable message instead of a TypeError from
                # subscripting None on the next line.
                raise ValueError(
                    "no unlabelled sample matches annotated line "
                    f"{annotation_i!r}: {utterance!r}"
                )
            if annotation['only_utterance_us'] != utterance:
                # Show fuzzy matches so they can be checked by eye.
                print(annotation['only_utterance_us'])
                print("--")
        # Every sample may receive at most one label.
        assert "target" not in annotation
        annotation["only_utterance_article"] = utterance
        annotation["target"] = label

My dear Miss Elizabeth, I have the highest opinion in the world of your excellent judgment in all matters within the scope of your understanding, but permit me to say that there must be a wide difference between the established forms of ceremony amongst the laity, and those which regulate the clergy; for give me leave to observe that I consider the clerical office as equal in point of dignity with the highest rank in the kingdom -- provided that a proper humility of behaviour is at the same time maintained. You must therefore allow me to follow the dictates of my conscience on this occasion, which leads me to perform what I look on as a point of duty. Pardon me for neglecting to profit by your advice, which on every other subject shall be my constant guide, though in the case before us I consider myself more fitted by education and habitual study to decide on what is right than a young lady like yourself. [X] apology, [X] Hunsford, [X] Lady Catherine de Bourgh.
--
delightful, [X] charm

In [7]:
# Sanity check: list every sample that did not receive a label above.
for a in annotations:
    if "target" in a:
        continue
    print(a)

In [8]:
import pickle

In [9]:
# Persist the labelled samples. The context manager ensures the file is
# flushed and closed as soon as the dump finishes, instead of leaving an
# anonymous open handle behind.
with open("corpus/dataset.pkl", "wb") as dataset_file:
    pickle.dump(annotations, dataset_file)