### Step 0: Initialize knowledge about the characters

In [1]:
from src.character import characters

In [2]:
import re
import nltk
from nltk.parse import stanford

# Point NLTK at the Stanford parser distribution by explicit path (instead of
# setting environment variables).
# Raw strings keep the Windows-style backslashes literal: in a normal string
# literal '\s' is an invalid escape sequence, which newer Python versions warn
# about. The resulting path values are unchanged.
# (Previously used distribution:
#  r'.\stanford-parser-full-2015-01-30\stanford-parser.jar')
jar = r'.\stanford-parser-full-2017-06-09\stanford-parser-3.8.0.jar'
# NOTE(review): `model` points at the same jar as `jar`; the English PCFG model
# usually ships in a separate "-models" jar -- confirm this is intentional.
model = r'.\stanford-parser-full-2017-06-09\stanford-parser-3.8.0.jar'

# Location of the serialized English PCFG grammar, shared by both parsers.
pcfg_model_path = 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'

# Constituency parser and dependency parser, both backed by the same grammar.
# Both jar arguments hold the same path, so naming them does not change what
# each parser receives.
parser = stanford.StanfordParser(
    path_to_jar=jar,
    path_to_models_jar=model,
    model_path=pcfg_model_path,
    encoding='utf8',
)
dep_parser = stanford.StanfordDependencyParser(
    path_to_jar=jar,
    path_to_models_jar=model,
    model_path=pcfg_model_path,
    encoding='utf8',
)


In [3]:
# Run the project's curation step on the character table loaded in Step 0.
# NOTE(review): presumably this prepares the curated corpus that is read later
# from ./corpus/curated_dialogs.txt -- confirm against src/curation.
from src.curation import curation
curation(characters)

### Step 1: Extract dialogs and explicit speakers from the raw text (deterministic method)

In [4]:
# Annotate every line with the speaker information that can be read off the
# text deterministically, using the constituency and dependency parsers built
# in Step 0.
from src.explicit import get_annotated_lines

# The result is indexable; each entry is a 3-tuple that is later unpacked as
# (explicit_speaker, gender, text), with None where nothing could be determined.
# NOTE(review): the recorded run printed a raw dialog passage below this cell,
# so the call appears to write to stdout -- confirm whether that is debug output.
annotated_lines = get_annotated_lines(parser=parser, dep_parser=dep_parser)


``Why will you think so?  It must be his own doing. -- He is his own master.  But you do not know _all_.  I _will_ read you the passage which particularly hurts me.  I will have no reserves from _you_. [X] Darcy is impatient to see his sister, and to confess the truth, we are scarcely less eager to meet her again.  I really do not think Georgiana Darcy has her equal for beauty, elegance, and accomplishments; and the affection she inspires in Louisa and myself is heightened into something still more interesting, from the hope we dare to entertain of her being hereafter our sister.  I do not know whether I ever before mentioned to you my feelings on this subject, but I will not leave the country without confiding them, and I trust you will not esteem them unreasonable.  My brother admires her greatly already, he will have frequent opportunity now of seeing her on the most intimate footing, her relations all wish the connection as much as his own, and a sister's partiality is not mislead

In [5]:
# Sanity check: number of annotated lines produced by Step 1.
print(len(annotated_lines))

1283


In [7]:
# Inspect one annotation: a 3-tuple whose last item is the raw line, with an
# "[X]" marker inside the quoted text; the first two items are None when no
# explicit speaker / gender was found.
annotated_lines[1]

(None,
 None,
 "``But it is, [X] for Mrs. Long has just been here, and she told me all about it.''\n")

In [None]:
from nltk.tree import Tree

# Example parse of the narration "said his lady to him one day ,", written in
# bracketed notation and rendered in NLTK's tree viewer.
t = Tree.fromstring(
    "(ROOT (SINV"
    " (VP (VBD said))"
    " (NP"
    " (NP (PRP$ his) (NN lady))"
    " (PP (TO to)"
    " (NP (NP (PRP him)) (NP (CD one) (NN day)) (, ,))))))"
)
t.draw()

In [None]:
# Second example parse ("returned she ;"), kept as a one-element list of trees.
t = [Tree('ROOT', [Tree('S', [Tree('VP', [Tree('VBG', ['returned']), Tree('NP', [Tree('PRP', ['she'])])]), Tree(':', [';'])])])]

# A list has no .draw() method (calling it raises AttributeError); draw the
# tree stored inside the list instead.
t[0].draw()


### Step 2: Extract features

In [None]:
from collections import defaultdict # defaultdict(int) return 0 if key not in dict

In [None]:
# Character names, i.e. the keys of the character table (iterating a mapping
# yields its keys).
names = list(characters)

In [None]:
# Feature containers, one entry per character name. They start empty and are
# filled by the extraction loop.

# Novel-level feature
# TODO: estimate on the training set only.
# The per-character speaking-frequency counter (a defaultdict(int)) is not
# created here; its initialisation is still disabled.

# Phrase-level features
character_explicit_mention = {name: [] for name in names}  # per dialog: one boolean per phrase
character_last_phrase = {name: [] for name in names}       # per dialog: index of the last phrase

# Dialog-level feature
character_dialog_mention = {name: [] for name in names}    # per dialog: one boolean

# TODO: character by character, store 1 if the genders agree, 0 if they do not,
# and 0.5 if no gender can be supposed.
character_supposed_gender = defaultdict(list)

In [None]:
# Fill the feature containers by walking the curated corpus in lockstep with
# the annotations from Step 1. Each corpus line has the form
# "<dialog id>\t<speaker>\t<text>".
# NOTE: this cell appends to the feature containers; re-run the cell that
# initialises them before running this one again.

# Read through a context manager so the file handle is always closed.
with open("./corpus/curated_dialogs.txt", "r") as corpus_file:
    dialogs = corpus_file.read().split('\n')
# Drop only the empty string produced by a trailing newline, so a file without
# a final newline does not lose its last line.
if dialogs and dialogs[-1] == '':
    dialogs.pop()

# Number of phrases spoken by each character (0 for unseen characters).
character_phrase_count = defaultdict(int)

phrases_count = 0
dialog_index = None  # id of the dialog being processed; None before the first line
# NOTE(review): zip() stops at the shorter input, so a length mismatch between
# the corpus and annotated_lines is silently ignored -- confirm they are aligned.
for phrase, (explicit_speaker, gender, _) in zip(dialogs, annotated_lines):
    phrases_count += 1
    # maxsplit=2 keeps any tab characters that occur inside the text itself.
    dialog, speaker, text = phrase.split("\t", 2)
    # Compare dialog ids numerically: as strings, "10" would sort before "9",
    # and on Python 3 a str cannot be compared with an int at all.
    dialog = int(dialog)

    # Entering a new dialog: open a fresh slot in every per-dialog feature.
    # The None check makes the first line always open a dialog, whatever the
    # first id is.
    if dialog_index is None or dialog > dialog_index:
        dialog_index = dialog
        for character in names:
            character_dialog_mention[character].append(False)
            character_explicit_mention[character].append([])
            character_last_phrase[character].append(-1)
            character_supposed_gender[character].append([])

    # One explicit-mention flag per phrase and per character, False by default.
    for character in names:
        character_explicit_mention[character][-1].append(False)

    # A character is mentioned in the dialog when its name appears as a
    # space-separated token of the text.
    # NOTE(review): tokens with attached punctuation (e.g. "Darcy,") do not
    # match -- confirm whether this is intended.
    for character in set(names).intersection(set(text.split(' '))):
        character_dialog_mention[character][-1] = True

    if explicit_speaker is not None:
        character_explicit_mention[explicit_speaker][-1][-1] = True

    ## Train ##
    # only for training as we use the speaker label, take prediction for valid and test

    if gender is None:
        # No gender could be supposed for this phrase: neutral score for everyone.
        for character in names:
            character_supposed_gender[character][-1].append(0.5)
    else:
        # NOTE(review): every non-speaker receives the complement of the
        # speaker's score regardless of that character's own gender -- confirm
        # this matches the intended "1 if same gender, 0 if not" feature.
        is_speaker = 1 if gender == characters[speaker].gender else 0
        character_supposed_gender[speaker][-1].append(is_speaker)
        for character in names:
            if character != speaker:
                character_supposed_gender[character][-1].append(1-is_speaker)

    # 1-based index of the last phrase spoken by this character in the dialog.
    character_last_phrase[speaker][-1] = phrases_count
    ## Train ##

    character_phrase_count[speaker] += 1