In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import pickle
import os
import re
import time

from keras import backend as K
import tensorflow as tf

import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.stem import SnowballStemmer
from tqdm import tqdm

In [3]:
np.set_printoptions(linewidth=100, precision=4)

In [61]:
params = {
    "UTTERANCE_LENGTH": 60,
    "DISCUSSION_LENGTH": 48, #60,
    "NARRATION_LENGTH": 30,
    "MAX_WORDS_COUNT": 3000,
    "WORD_EMBEDDING_DIM": 32,
    "PEOPLE_EMBEDDING_DIM": 32,
    "VALIDATION_SPLIT": 0.2,
    "RECURRENT_UNITS_COUNT": 32,
    "PATIENCE": 2,
    "EPOCHS": 8,
    "BATCH_SIZE": 16, # not large because every sample is 300 words long
    "LR": 0.001,
    "MSE_LOSS_WEIGHT": 500,
}


TIME_STR = time.strftime("%Y%m%d-%H%M%S")
OUT_MODEL_PATH = os.path.join('./output/', "model-{}.h5".format(TIME_STR))
CHECKPOINT_PATH = os.path.join('./output/', "model-{}-checkpoint.h5".format(TIME_STR))
GLOVE_PATH = '/Users/perceval/Developpement/Data/glove.6B.100d/glove.6B.100d.txt'
DATA_PATH = 'corpus/dataset_dl.pkl'
SPLIT_REGEX = r"[@_\w]+|['.,!?;]"

### Load people

In [5]:
people = pickle.load(open("corpus/people.pkl", "rb"))

In [6]:
dataset = pickle.load(open("corpus/dataset-dl.pkl", "rb"))

In [7]:
dataset[0][0]

('My dear Mr_Bennet,  have you heard that Netherfield Park is let at last?',
 ' said his lady to him one day, ',
 ['Charlotte', 'Mrs_Bennet'],
 ['Mr_Bennet'],
 'Mrs_Bennet')

### Load data and process it (again !)

#### Tokenize text

We are going to stem the data, to make the vocabulary denser

In [8]:
def stem_sentence(stemmer, sentence, people_names):
    tokens = re.findall(SPLIT_REGEX, sentence)
    return [stemmer.stem(word) if word not in people_names else word for word in tokens]

In [9]:
stem_sentence(SnowballStemmer('english'), "Mr_Bennet is with Mrs_Bennet's daughter", [p['main'] for p in people])

['Mr_Bennet', 'is', 'with', 'Mrs_Bennet', "'", 's', 'daughter']

In [10]:
def pad(list, length, default):
    """Pads the `list`, adding `default` as many times as necessary to reach the provided `length`"""
    if length is None:
        return list
    return  [list[i] if i < len(list) else default for i in range(length)]

In [50]:
def make_data(dataset, people, verbose=2, utterance_length=8, narration_length=None, discussion_length=None):
    """Tokenizes, stems and transform into to ids the provided `dataset`
    """
    people_main = [p['main'] for p in people]
    
    stemmer = SnowballStemmer('english')

    stemmed_samples = [
        [(stem_sentence(stemmer, utterance[0], people_main), # utterance part
          stem_sentence(stemmer, utterance[1], people_main), # narration part
          utterance[2], # potential subjects
          utterance[3], # potential destinator
          utterance[4]) # label
         for utterance in discussion]
        for discussion in (tqdm(dataset, desc="Text words stemming")
                           if verbose > 1 else dataset)]
    
    words = [word
             for discussion in stemmed_samples
             for utterance in discussion
             for text in (utterance[0], utterance[1])
             for word in text]
    # Fit the tokenizer on train texts
    word_index, word_counts = np.unique(words, return_counts=True)
    new_indices = sorted(range(len(word_index)), key=lambda i: "0"+word_index[i] if word_index[i] in people_main else "1"+word_index[i])
    word_index = word_index[new_indices]
    word_counts = word_counts[new_indices]
    
    inverse_words = {v: i+2 for i, v in enumerate(word_index)}
    inverse_people = {v: i for i, v in enumerate(people_main)}

    # Convert them to indices and truncate them if they are too large
    tokenized_samples = [
        # Pad the discussion so that its length matches `discussion_length`
        pad([(pad([inverse_words.get(w, 1) for w in utterance[0]], utterance_length, 0),
              pad([inverse_words.get(w, 1) for w in utterance[1]], narration_length, 0),
              [inverse_people.get(p, 0) for p in utterance[2]],
              [inverse_people.get(p, 0) for p in utterance[3]],
              inverse_people.get(utterance[4], -1)+1)
              for utterance in discussion if utterance[4] in inverse_people],
             length=discussion_length,
             default=([0]*(utterance_length or 0), # empty utterance
                      [0]*(narration_length or 0), # empty narration,
                      [], # no target hint
                      [], # no target hint
                      0, # default non-character id
             ))
        for discussion in (tqdm(stemmed_samples, desc="Text/targets to ids mapping")
                           if verbose > 1 else stemmed_samples)]

    return tokenized_samples, word_index

In [51]:
res = make_data(dataset, people,
                utterance_length=params["UTTERANCE_LENGTH"],
                narration_length=params["NARRATION_LENGTH"],
                discussion_length=params["DISCUSSION_LENGTH"])
processed_dataset, word_index = res
#res[0][0][0]

Text words stemming: 100%|██████████| 94/94 [00:02<00:00, 39.83it/s]
Text/targets to ids mapping: 100%|██████████| 94/94 [00:00<00:00, 683.70it/s]


### Shape the dataset as matrices

In [52]:
def make_matrices(data, discussion_length, voc_size, people_count, target_count):
    """Transform a list of samples into a tuple of matrices to feed into the model"""
    utterance_matrices = np.zeros((len(data), discussion_length, voc_size)) # Set of words -> Bag of words one-hot encoding
    narration_matrices = np.zeros((len(data), discussion_length, voc_size)) # Set of words -> Bag of words one-hot encoding
    speaker_matrices = np.zeros((len(data), discussion_length, people_count)) # Set of people -> Bag of words one-hot encoding
    dest_matrices = np.zeros((len(data), discussion_length, people_count)) # Set of people -> Bag of words one-hot encoding
    target_matrices = np.zeros((len(data), discussion_length, target_count)) # Categorical target -> One-hot encoding
    for discussion_i, discussion in enumerate(data):
        for utterance_i, utterance in enumerate(discussion):
            utterance_matrices[discussion_i, utterance_i, utterance[0]] = 1
            narration_matrices[discussion_i, utterance_i, utterance[1]] = 1
            speaker_matrices[discussion_i, utterance_i, list(utterance[2])] = 1
            dest_matrices[discussion_i, utterance_i, list(utterance[3])] = 1
            target_matrices[discussion_i, utterance_i, utterance[4]] = 1
    
    return utterance_matrices, narration_matrices, speaker_matrices, dest_matrices, target_matrices

In [53]:
utterance_matrices, narration_matrices, speaker_hint_matrices, destinator_hint_matrices, target_matrices = \
    make_matrices(processed_dataset,
              discussion_length=params['DISCUSSION_LENGTH'],
              voc_size=len(word_index)+2,
              people_count=len(people),
              target_count=len(people)+1)

### Define the model structure

In [54]:
utterances_input_layer = keras.layers.Input(shape=(None, len(word_index)+2)) # leave the dataset length that will be batched
narrations_input_layer = keras.layers.Input(shape=(None, len(word_index)+2)) # leave the dataset length that will be batched
speaker_hint_input_layer = keras.layers.Input(shape=(None, len(people))) # leave the dataset length that will be batched
dest_hint_input_layer = keras.layers.Input(shape=(None, len(people))) # leave the dataset length that will be batched

#embedding_layer = keras.layers.Embedding(len(word_index), params["WORD_EMBEDDING_DIM"], name="word_embedding")
word_bag_layer = keras.layers.Dense(params["WORD_EMBEDDING_DIM"])#, kernel_regularizer=keras.regularizers.l2(0.01))
people_hint_bag_layer = keras.layers.Dense(params["PEOPLE_EMBEDDING_DIM"])
#dest_hint_bag_layer = keras.layers.Dense(params["PEOPLE_EMBEDDING_DIM"])
concat_layer = keras.layers.Concatenate(name="lstm_input")

concat_input = concat_layer([
    word_bag_layer(utterances_input_layer),
    word_bag_layer(narrations_input_layer),
    people_hint_bag_layer(speaker_hint_input_layer),
    people_hint_bag_layer(dest_hint_input_layer),
])
lstm_input = concat_input#keras.layers.Activation('relu')(concat_input)

# not mandatory to set the LSTM dim output to the people hints dim input but it seems more coherent

lstm_layer = keras.layers.GRU(params['RECURRENT_UNITS_COUNT'], return_sequences=True)
lstm_output = lstm_layer(lstm_input)

output_layer = keras.layers.TimeDistributed(keras.layers.Dense(len(people)+1, activation='softmax'))
#lstm_layer = keras.layers.Dense(len(people)+1, activation='softmax')
output = output_layer(lstm_output)

In [55]:
model = keras.Model(inputs = [utterances_input_layer, narrations_input_layer, speaker_hint_input_layer, dest_hint_input_layer] , outputs = [output])
model.compile(loss='categorical_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'],
              sample_weight_mode='temporal')

### Sample weighting

Because we have imbalanced classes and padding utterances, we need to weights them to correct the loss function

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
(train_utterance_matrices,   test_utterance_matrices,
 train_narration_matrices,   test_narration_matrices,
 train_speaker_hint_matrices, test_speaker_hint_matrices,
 train_dest_hint_matrices, test_dest_hint_matrices,
 train_target_matrices, test_target_matrices) = \
    train_test_split(utterance_matrices, narration_matrices, speaker_hint_matrices, destinator_hint_matrices, target_matrices, test_size=0.2)

In [58]:
# Count the classes
targets_set, targets_inverse, targets_count = np.unique(np.argwhere(train_target_matrices)[:, -1], return_inverse=True, return_counts=True)

# And transform these counts into samples
sample_weight = np.zeros_like(targets_inverse, dtype=float)
sample_weight[targets_inverse == 0] = 0.00
total = sum(targets_count) - targets_count[0]
for target_id, target_count in zip(targets_set, targets_count):
    sample_weight[targets_inverse == target_id] = target_count/total
sample_weight = sample_weight.reshape(train_target_matrices.shape[:2])

In [59]:
# To avoid zero-sum sample weights
#sample_weight[np.argwhere(sample_weight.sum(axis=1) == 0).reshape(-1)] += 0.001

In [66]:
model.fit(x=[train_utterance_matrices, train_narration_matrices, train_speaker_hint_matrices, train_dest_hint_matrices],
          y=[train_target_matrices],
          epochs=50,
          sample_weight=sample_weight,
          validation_split=0.2,
          verbose=1, batch_size=8)

Train on 60 samples, validate on 15 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x141d52630>

In [62]:
target_argmax = test_target_matrices.argmax(axis=2).reshape(-1)
accuracy_score(
    model.predict(x=[test_utterance_matrices, test_narration_matrices, test_speaker_hint_matrices, test_dest_hint_matrices]).argmax(axis=2).reshape(-1),
    test_target_matrices.argmax(axis=2).reshape(-1),
    sample_weight=(target_argmax != 0)
)

0.061611374407582936

In [65]:
target_argmax = train_target_matrices.argmax(axis=2).reshape(-1)
accuracy_score(
    model.predict(x=[train_utterance_matrices, train_narration_matrices, train_speaker_hint_matrices, train_dest_hint_matrices]).argmax(axis=2).reshape(-1),
    train_target_matrices.argmax(axis=2).reshape(-1),
    sample_weight=(target_argmax != 0)
)

0.11853245531514581

### Design fake data to force the model to pay attention to the hints

We use this data to evaluate the performance of a structure according to properties we know it should have

In [62]:
fake_mono_discussion_count = len(people)#+1
fake_utterance_length = 40
fake_mono_utterance_matrices = np.zeros((fake_mono_discussion_count, 1, len(word_index)+2))
fake_mono_narration_matrices = np.zeros((fake_mono_discussion_count, 1, len(word_index)+2))

fake_mono_people_hint_matrices = np.zeros((fake_mono_discussion_count, 1, len(people)))
fake_mono_people_hint_matrices[np.arange(len(people)), 0, np.arange(len(people))] = 1
fake_mono_target_matrices = np.zeros((fake_mono_discussion_count, 1, len(people)+1))
fake_mono_target_matrices[np.arange(len(people)), 0, np.arange(len(people))] = 1

In [63]:
fake_mono_utterance_matrices = fake_mono_utterance_matrices.repeat(50, axis=0)
fake_mono_narration_matrices = fake_mono_narration_matrices.repeat(50, axis=0)
fake_mono_people_hint_matrices = fake_mono_people_hint_matrices.repeat(50, axis=0)
fake_mono_target_matrices = fake_mono_target_matrices.repeat(50, axis=0)

In [64]:
for i in range(fake_mono_utterance_matrices.shape[0]):
    for j in range(fake_mono_utterance_matrices.shape[1]):
        fake_mono_utterance_matrices[i][j][np.random.choice(fake_mono_utterance_matrices.shape[2], fake_utterance_length)] = 1

In [426]:
model.fit(
    x=[fake_mono_utterance_matrices, fake_mono_people_hint_matrices],
    y=[fake_mono_target_matrices],
    verbose=1,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15a3e6e80>

In [68]:
#display(fake_mono_target_matrices[[0, 50]])
model.evaluate([fake_mono_utterance_matrices, fake_mono_people_hint_matrices], [fake_mono_target_matrices])



[0.4065698538886176, 1.0]

### Design fake data to force the model to pay attention to alternative speakers

We use this data to evaluate the performance of a structure according to properties we know it should have

In [164]:
fake_people_pairs = np.array([np.random.choice(len(people), 2) for _ in range(500)])

fake_duo_discussion_count = len(fake_people_pairs)
fake_duo_discussion_length = 32

fake_duo_utterance_matrices = np.zeros((fake_duo_discussion_count, fake_duo_discussion_length, len(word_index)+2))
fake_duo_narration_matrices = np.zeros((fake_duo_discussion_count, fake_duo_discussion_length, len(word_index)+2))

fake_duo_people_hint_matrices = np.zeros((fake_duo_discussion_count, fake_duo_discussion_length, len(people)))
# get a hint in the first utterance about the first speaker
fake_duo_people_hint_matrices[np.arange(fake_duo_discussion_count), 0, fake_people_pairs[:, 0]] = 1
# get a hint in the second utterance about the second speaker
fake_duo_people_hint_matrices[np.arange(fake_duo_discussion_count), 1, fake_people_pairs[:, 1]] = 1

for i in range(fake_duo_utterance_matrices.shape[0]):
    for j in range(fake_duo_utterance_matrices.shape[1]):
        fake_duo_utterance_matrices[i][j][np.random.choice(fake_duo_utterance_matrices.shape[2], fake_utterance_length)] = 1
        
fake_duo_target_matrices = np.zeros((fake_duo_discussion_count, fake_duo_discussion_length, len(people)+1))
for i in range(fake_duo_discussion_length//2):
    fake_duo_target_matrices[np.arange(fake_duo_discussion_count), i*2, fake_people_pairs[:, 0]] = 1
    fake_duo_target_matrices[np.arange(fake_duo_discussion_count), i*2+1, fake_people_pairs[:, 1]] = 1

In [None]:
#display(fake_mono_target_matrices[[0, 50]])
model.evaluate([fake_duo_utterance_matrices, fake_duo_people_hint_matrices], [fake_duo_target_matrices])