### Step 1: extracting dialogs and explicit speakers from raw text (deterministic method) => done in Preprocessing.py

In [1]:
import pickle
with open("corpus/dataset.pkl", 'rb') as file:
    annotated_lines = pickle.load(file)

In [2]:
annotated_lines[0]

{'only_utterance_us': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'source': "``My dear Mr_Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
 'parts': [{'text': 'My dear Mr_Bennet,', 'utterance': True},
  {'text': ' said his lady to him one day, ',
   'utterance': False,
   'speaker_name': None,
   'speaker_function': 'lady',
   'speaker_gender': 'F'},
  {'text': 'have you heard that Netherfield Park is let at last?',
   'utterance': True}],
 'only_utterance_article': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
 'target': 'Mrs_Bennet'}

### Step 2: extracting features

In [3]:
import pandas as pd

In [4]:
characters = ["Mrs. Annesley",
"Elizabeth Bennet",
"Jane Bennet",              
"Lydia Bennet",
"Kitty Bennet",
"Mary Bennet",
"Mrs. Bennet",
"Mr. Bennet",
"Mr. Bingley",
"Caroline Bingley",
"Charlotte",
"Captain Carter",
"Mr. Collins",
"Lady Catherine",
"Mr. Chamberlayne",
"Dawson",
"Mr. Denny",
"Mr. Darcy",
"Old Mr. Darcy",
"Lady Anne Darcy",
"Georgiana Darcy",
"Colonel Fitzwilliam",
"Colonel Forster",
"Miss Grantley",
"Mrs. Gardiner",
"Mr. Gardiner",
"William Goulding",
"Haggerston",
"Mrs. Hill",
"Mrs. Jenkinson",
"Mr. Jones",
"Miss Mary King",
"Mrs. Long",
"Lady Lucas",
"Maria Lucas",
"Mr. Hurst",
"Louisa Hurst",
"Lady Metcalfe",
"Mr. Morris",
"Mrs. Nicholls",
"Mr. Philips",
"Miss Pope",
"Mr. Pratt",
"Mrs. Reynolds",
"Mr. Robinson",
"Mr. Stone",
"Miss Watson",
"Old Mr. Wickham",
"Sir William",
"Anne de Bourgh",
"Mr. Wickham",
"Mrs. Philips",
"Young Lucas",
"The Butler"]

gender = {"Mrs. Annesley" : "F",
"Elizabeth Bennet" : "F",
"Jane Bennet" : "F",         
"Lydia Bennet" : "F",
"Kitty Bennet" : "F",
"Mary Bennet" : "F",
"Mrs. Bennet" : "F",
"Mr. Bennet" : "M",
"Mr. Bingley" : "M",
"Caroline Bingley" : "F",
"Charlotte" : "F",
"Captain Carter" : "M",
"Mr. Collins" : "M",
"Lady Catherine" : "F",
"Mr. Chamberlayne" : "M",
"Dawson" : "M",
"Mr. Denny" : "F",
"Mr. Darcy" : "M",
"Old Mr. Darcy" : "M",
"Lady Anne Darcy" : "F",
"Georgiana Darcy" : "F",
"Colonel Fitzwilliam" : "F",
"Colonel Forster" : "M",
"Miss Grantley" : "F",
"Mrs. Gardiner" : "F",
"Mr. Gardiner" : "M",
"William Goulding" : "M",
"Haggerston" : "M",
"Mrs. Hill" : "F",
"Mrs. Jenkinson" : "F",
"Mr. Jones" : "M",
"Miss Mary King" : "F",
"Mrs. Long" : "F",
"Lady Lucas" : "F",
"Maria Lucas" : "F",
"Mr. Hurst" : "M",
"Louisa Hurst" : "F",
"Lady Metcalfe" : "F",
"Mr. Morris" : "M",
"Mrs. Nicholls" : "F",
"Mr. Philips" : "M",
"Miss Pope" : "F",
"Mr. Pratt" : "M",
"Mrs. Reynolds" : "F",
"Mr. Robinson" : "M",
"Mr. Stone" : "M",
"Miss Watson" : "F",
"Old Mr. Wickham" : "M",
"Sir William" : "M",
"Anne de Bourgh" : "F",
"Mr. Wickham" : "M",
"Mrs. Philips" : "F",
"Young Lucas" : "M",
"The Butler" : "M"}

features = ["character_freq", 
            "character_previous_mention",
            "character_dialog_mention",
            "character_vocal_mention",
            "character_narrator_mention",
            "gender_as_supposed",
            "character_spoke_last",
            "character_spoke_before_last",
            "character_is_target",
            "character_last_target",
            "character_already_spoke"]

columns = ["dialog", "speaker"] + [feature + "_" + character for feature in features for character in characters]
columns

['dialog',
 'speaker',
 'character_freq_Mrs. Annesley',
 'character_freq_Elizabeth Bennet',
 'character_freq_Jane Bennet',
 'character_freq_Lydia Bennet',
 'character_freq_Kitty Bennet',
 'character_freq_Mary Bennet',
 'character_freq_Mrs. Bennet',
 'character_freq_Mr. Bennet',
 'character_freq_Mr. Bingley',
 'character_freq_Caroline Bingley',
 'character_freq_Charlotte',
 'character_freq_Captain Carter',
 'character_freq_Mr. Collins',
 'character_freq_Lady Catherine',
 'character_freq_Mr. Chamberlayne',
 'character_freq_Dawson',
 'character_freq_Mr. Denny',
 'character_freq_Mr. Darcy',
 'character_freq_Old Mr. Darcy',
 'character_freq_Lady Anne Darcy',
 'character_freq_Georgiana Darcy',
 'character_freq_Colonel Fitzwilliam',
 'character_freq_Colonel Forster',
 'character_freq_Miss Grantley',
 'character_freq_Mrs. Gardiner',
 'character_freq_Mr. Gardiner',
 'character_freq_William Goulding',
 'character_freq_Haggerston',
 'character_freq_Mrs. Hill',
 'character_freq_Mrs. Jenkinson',
 '

In [5]:
names = [character for character in characters]

In [6]:
import traceback, sys

In [7]:
import string
def split_words(text):
    "remove all punctuation and split by spaces"
    table = str.maketrans({key: None for key in string.punctuation})
    return text.translate(table) .split(" ")

In [8]:
dialogs=open("./corpus/curated_dialogs.txt", "w")
for count , line in enumerate(annotated_lines):
    dialog = str(count+1)
    speaker = line["target"]
    text = line["source"]
    phrase = dialog + "\t" + speaker + "\t" + text+"\n"
    dialogs.write(phrase)

In [9]:
dialogs = open("./corpus/curated_dialogs.txt", "r").read().split('\n')[:-1]
dataset = pd.DataFrame([], columns=columns)

dialog_index = 0
line_idx = 0
last_speaker = ""
before_last_speaker = ""
last_target = ""
previous_mentions = set()
vocal_mentions = set()
for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    try:
        dialog, speaker, text = phrase.split("\t")

        if int(dialog) > dialog_index:
            character_dialog_mentions = {name: False for name in names}
            spokers = set()
            last_speaker = ""
            before_last_speaker = ""
            last_target = ""
            dialog_index = int(dialog)
            previous_mentions = set()
        
        line = {"dialog": int(dialog), "speaker": speaker}
        previous_mentions = vocal_mentions
        vocal_mentions = set(names).intersection(set(split_words(text)))
        
        words = set([word for part in annotated_line['parts'] for word in split_words(part['text']) if not part['utterance']])
        narrator_mentions = set(names).intersection(words)
            
        for idx, character in enumerate(characters):
            line["character_already_spoke_" + character] = character in spokers
            line["character_freq_" + character] = 0.0
            
            line["character_previous_mention_" + character] = character in previous_mentions
            line["character_vocal_mention_" + character] = character in vocal_mentions
            line["character_narrator_mention_" + character] = character in narrator_mentions
            line["character_dialog_mention_" + character] = character_dialog_mentions[character]
                
            character_dialog_mentions[character] = character_dialog_mentions[character] or\
                                                        character in vocal_mentions          or\
                                                        character in narrator_mentions
            supposed_gender = None
            for part in annotated_line["parts"]:
                if("speaker_gender" in part and part["speaker_gender"] is not None):
                    supposed_gender = part["speaker_gender"]
            
            if(supposed_gender is None):
                gender_as_supposed = 0.5
            elif(supposed_gender == gender[character]):
                gender_as_supposed = 1
            else:
                gender_as_supposed = 0
            line["gender_as_supposed_" + character] = gender_as_supposed
                
            line["character_last_target_" + character] = True if last_target == character else False
            line["character_spoke_last_" + character] = True if last_speaker == character else False
            line["character_spoke_before_last_" + character] = True if before_last_speaker == character else False
            
            if(annotated_line["target"] is not None and annotated_line["target"] == character):
                character_is_target = True
            else:
                character_is_target = False
            line["character_is_target_" + character] = character_is_target
        
        spokers.add(speaker)
        before_last_speaker = last_speaker
        last_speaker = speaker
        dataset.loc[line_idx] = line
        line_idx += 1
    except Exception as e:
        print(traceback.format_exception(None, # <- type(e) by docs, but ignored 
                                     e, e.__traceback__),
          file=sys.stderr, flush=True)
        print("line {}, caused a problem: {}".format(count, e))
dataset["dialog"] = dataset.dialog.astype(int)

In [10]:
dataset.head()

Unnamed: 0,dialog,speaker,character_freq_Mrs. Annesley,character_freq_Elizabeth Bennet,character_freq_Jane Bennet,character_freq_Lydia Bennet,character_freq_Kitty Bennet,character_freq_Mary Bennet,character_freq_Mrs. Bennet,character_freq_Mr. Bennet,...,character_already_spoke_Mr. Robinson,character_already_spoke_Mr. Stone,character_already_spoke_Miss Watson,character_already_spoke_Old Mr. Wickham,character_already_spoke_Sir William,character_already_spoke_Anne de Bourgh,character_already_spoke_Mr. Wickham,character_already_spoke_Mrs. Philips,character_already_spoke_Young Lucas,character_already_spoke_The Butler
0,1,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,3,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,4,Mr_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,5,Mrs_Bennet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


### Step 3: Train Valid Test split

In [11]:
import numpy as np
np.random.seed(9295)
dialogs = np.arange(dataset.dialog.max())
np.random.shuffle(dialogs)

In [12]:
b1 = len(dialogs) * 8 // 10
b2 = len(dialogs) * 9 // 10
train_dialogs = dialogs[:b1]
valid_dialogs = dialogs[b1:b2]
test_dialogs  = dialogs[b2:]
print("Train set contains {} dialogs.".format(len(train_dialogs)))
print("Valid set contains {} dialogs.".format(len(valid_dialogs)))
print("Test set contains {} dialogs.".format(len(test_dialogs)))

Train set contains 1032 dialogs.
Valid set contains 129 dialogs.
Test set contains 130 dialogs.


In [13]:
train_dataset = dataset[dataset.dialog.isin(train_dialogs)]
valid_dataset = dataset[dataset.dialog.isin(valid_dialogs)]
test_dataset  = dataset[dataset.dialog.isin(test_dialogs)]

compute the frequencies based on the train set

In [14]:
pd.options.mode.chained_assignment = None  # default='warn'
for name in names:
    freq = len(train_dataset[train_dataset["speaker"] == name]) / len(train_dataset.index)
    train_dataset["character_freq_" + name] = freq
    valid_dataset["character_freq_" + name] = freq
    test_dataset["character_freq_" + name]  = freq

### Step 4: Training

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

In [16]:
GB = GradientBoostingClassifier(learning_rate=0.02, n_estimators=500, subsample=0.95, min_samples_split=2,
                                min_samples_leaf=2, max_depth=4, verbose=2)

In [17]:
y = train_dataset.speaker.values
X = train_dataset.drop(["dialog", "speaker"], axis=1).values.astype(np.float32)

In [18]:
GB.fit(X, y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        2356.5434           1.2552            1.03m
         2        2367.5603           0.8736           57.99s
         3        2341.5352           0.4894           56.15s
         4        2339.0699           0.5240           54.93s
         5        2309.8978           0.1030           55.23s
         6        2306.1318           0.3184           55.36s
         7        2295.6879           0.0375           54.76s
         8        2303.6658           0.3085           54.03s
         9        2288.4350           0.2805           54.19s
        10        2288.5529           0.2347           55.65s
        11        2281.8489           0.0321           55.03s
        12        2282.5997           0.0850           54.85s
        13        2280.3077           0.2440           54.86s
        14        2248.7679           0.0747           54.84s
        15        2259.0823           0.0611           54.43s
       

       134        2148.8985          -0.0310           39.98s
       135        2129.1464          -0.0449           39.87s
       136        2141.4801          -0.0438           39.76s
       137        2139.6811          -0.0536           39.64s
       138        2126.2798          -0.0476           39.51s
       139        2142.8268          -0.0891           39.40s
       140        2148.5783          -0.1382           39.29s
       141        2132.3134          -0.0809           39.17s
       142        2123.8140          -0.0891           39.05s
       143        2138.4512          -0.0901           38.94s
       144        2122.0895          -0.1111           38.84s
       145        2135.5031          -0.0359           38.71s
       146        2140.8116          -0.0590           38.59s
       147        2145.6427          -0.0481           38.49s
       148        2140.6906          -0.0798           38.38s
       149        2139.6264          -0.0822           38.26s
       1

       268        2140.9320          -0.0842           25.02s
       269        2128.0685          -0.0811           24.91s
       270        2123.0518          -0.1103           24.80s
       271        2132.9679          -0.1169           24.69s
       272        2127.4066          -0.0577           24.58s
       273        2132.0479          -0.0996           24.47s
       274        2128.8957          -0.0620           24.35s
       275        2123.9031          -0.0708           24.24s
       276        2143.7298          -0.1110           24.13s
       277        2127.5675          -0.0655           24.02s
       278        2119.9594          -0.0641           23.90s
       279        2140.4071          -0.1180           23.80s
       280        2144.5164          -0.0429           23.68s
       281        2125.3037          -0.1367           23.57s
       282        2110.8538          -0.0716           23.46s
       283        2139.3073          -0.0351           23.35s
       2

       402        2114.8658          -0.0700           10.39s
       403        2132.5477          -0.0441           10.28s
       404        2133.2901          -0.0696           10.17s
       405        2135.6252          -0.1136           10.07s
       406        2121.4239          -0.0775            9.96s
       407        2126.3953          -0.1288            9.85s
       408        2133.1244          -0.1063            9.74s
       409        2117.3630          -0.0919            9.64s
       410        2129.2333          -0.0758            9.53s
       411        2122.9390          -0.1258            9.42s
       412        2129.4847          -0.0794            9.32s
       413        2136.8476          -0.1897            9.21s
       414        2124.2015          -0.0445            9.10s
       415        2117.0524          -0.0986            9.00s
       416        2126.8788          -0.0958            8.89s
       417        2129.9971          -0.0631            8.78s
       4

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.02, loss='deviance', max_depth=4,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=2, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=0.95, tol=0.0001,
                           validation_fraction=0.1, verbose=2,
                           warm_start=False)

### Step 5: Results

In [19]:
pred = []
truth = []

dialogs_done = set()
last_speaker = ""
spokers = set()
for idx in valid_dataset.index:
    speaker_name = None
    for part in annotated_lines[idx]["parts"]:
        if 'speaker_name' in part and part['speaker_name'] is not None:
            speaker_name = part['speaker_name']
            break
    if speaker_name is not None:
        speaker = speaker_name
    else:
        line = valid_dataset.loc[idx]
        if line.dialog not in dialogs_done:
            spokers = set()
            last_speaker = ""
            dialogs_done.add(line.dialog)

        for character in characters:
            line["character_already_spoke_" + character] = character in spokers
            line["character_spoke_last_" + character] = True if last_speaker == character else False

        truth.append(speaker)

        speaker = GB.predict(line.values[2:].astype(np.float32).reshape(1,-1))[0]
    pred.append(speaker)
    last_speaker = speaker
    spokers.add(speaker)

In [20]:
print("Precision: {:.02f}%".format(100*sum([p == t for p, t in zip(pred, truth)]) / len(truth)))

Precision: 67.33%


### Explicit mention of speaker

In [21]:
dialogs = open("./corpus/curated_dialogs.txt", "r").read().split('\n')[:-1]
found = 0
correct = 0
for count, (phrase, annotated_line) in enumerate(zip(dialogs, annotated_lines)):
    dialog, speaker, text = phrase.split("\t")
    speaker_name = None
    for part in annotated_line["parts"]:
        if 'speaker_name' in part and part['speaker_name'] is not None:
            speaker_name = part['speaker_name']
            break
    if speaker_name is not None:
        found += 1
        if speaker_name == speaker:
            correct += 1
print("identified speaker: {:.02f}".format(100*found/len(dialogs)))
print("correct identification: {:.02f}".format(100*correct/found))
print("total precision: {:.02f}".format(100*correct/len(dialogs)))

identified speaker: 25.19
correct identification: 87.73
total precision: 22.10
