In [61]:
import json
import random

import re
import string
import os.path
import timeit
from itertools import chain

import numpy as np
import pandas as pd
import _pickle as pickle

import matplotlib.pyplot as plt
import plot_graph

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model, Model
from keras.layers import Input, Dense, Embedding, Flatten, LSTM, Bidirectional, TimeDistributed, Dropout, Activation
from keras.layers import LeakyReLU, concatenate

from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical



In [4]:
# loading data

ip = open("dataV3/data_dump.txt", "rb")
dictionary = pickle.load(ip)
ip.close()

data = dictionary["data"]
labels = dictionary["labels"]


In [7]:
# all tokens
words_all = list(chain.from_iterable([r["tokens"] for r in data]))

# unique tokens
words = list(set(words_all))
# will be used for padding
words.append("ENDPAD")

VOCAB_SIZE = len(words)

# to match the entity list used for previous versions
"""
0 Rachel Green
1 Ross Geller
2 Chandler Bing
3 Monica Geller
4 Joey Tribbiani
5 Phoebe Buffay
6 Others
7 None
"""

# classes
classes = ["Rachel Green", "Ross Geller", "Chandler Bing", "Monica Geller", "Joey Tribbiani", 
                 "Phoebe Buffay", "Others", "None"]

NUM_CLASSES = len(classes)

print("Number of sentences:", len(data))
print("Total number of tokens:", len(words_all))
print("Number of unique tokens:", VOCAB_SIZE)
print("Number of classes:", NUM_CLASSES)


print("\nfrequency of each class")
pd.Series(list(chain.from_iterable(labels))).value_counts()


Number of sentences: 26402
Total number of tokens: 279845
Number of unique tokens: 10117
Number of classes: 8

frequency of each class


None              228134
Others             20937
Ross Geller         7652
Rachel Green        5454
Chandler Bing       4815
Joey Tribbiani      4685
Monica Geller       4099
Phoebe Buffay       4069
dtype: int64

In [8]:
# not using Speaker's embedding as a feature 

MAX_SENT_LEN = 25

word2idx = {w: i for i, w in enumerate(words)}
class2idx = {t: i for i, t in enumerate(classes)}

X = [[word2idx[w] for w in row["tokens"]] for row in data]
X_padded = pad_sequences(X, maxlen = MAX_SENT_LEN, padding = "post", value = word2idx['ENDPAD'], truncating = "post")

Y = [[class2idx[l] for l in row] for row in labels]
Y_padded = pad_sequences(Y, maxlen = MAX_SENT_LEN, padding="post", value = class2idx["None"], truncating = "post")


In [9]:
print(word2idx['ENDPAD'], VOCAB_SIZE)
print(class2idx['Ross Geller'], class2idx['Others'], class2idx['None'])

print("\n W/o padding")
print(X[0])
print(Y[0])

print("\n With padding")
print(X_padded[0])
print(Y_padded[0])


10116 10117
1 6 7

 W/o padding
[9946, 6951, 779, 1770, 2826, 3114, 5275, 2046, 1626]
[6, 7, 7, 7, 6, 3, 7, 7, 7]

 With padding
[ 9946  6951   779  1770  2826  3114  5275  2046  1626 10116 10116 10116
 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116
 10116]
[6 7 7 7 6 3 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]


In [11]:
x_train, x_test, y_train, y_test = train_test_split(X_padded, Y_padded, test_size = 0.2)

# one hot encoding for Y
y_train_ohe = np.array([to_categorical(i, num_classes= NUM_CLASSES) for i in y_train])
y_test_ohe = np.array([to_categorical(i, num_classes= NUM_CLASSES) for i in y_test])

print(x_train[0])
print(y_train[0])
print(y_train_ohe[0])

[ 6649  6335  6757  9663   779  3485  5487   681 10108 10116 10116 10116
 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116
 10116]
[2 7 7 2 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
[[0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [12]:
print("x_train:", type(x_train), x_train.shape)
print("y_train:", type(y_train), y_train.shape)
print("y_train_ohe:", type(y_train_ohe), y_train_ohe.shape)


print("\nx_test:", type(x_test), x_test.shape)
print("y_test:", type(y_test), y_test.shape)
print("y_test_ohe:", type(y_test_ohe), y_test_ohe.shape)


x_train: <class 'numpy.ndarray'> (21121, 25)
y_train: <class 'numpy.ndarray'> (21121, 25)
y_train_ohe: <class 'numpy.ndarray'> (21121, 25, 8)

x_test: <class 'numpy.ndarray'> (5281, 25)
y_test: <class 'numpy.ndarray'> (5281, 25)
y_test_ohe: <class 'numpy.ndarray'> (5281, 25, 8)


In [13]:
encoded_data_path = "modelsV3/"

EPOCHS = 10
BATCH_SIZE = 16

input = Input(shape=(MAX_SENT_LEN,))

model = Embedding(input_dim = VOCAB_SIZE, output_dim = 50, input_length = MAX_SENT_LEN)(input)
model = Dropout(0.1)(model)

model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(NUM_CLASSES, activation="softmax"))(model)  

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# creating checkpoint to save model every time validation accuracy improves
filepath = encoded_data_path + "val_acc-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

print('\n\nTraining Model...')

history1 = model.fit(x_train, y_train_ohe,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              callbacks = callbacks_list,
              validation_data = (x_test, y_test_ohe))





Training Model...
Train on 21121 samples, validate on 5281 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.95038, saving model to modelsV3/val_acc-improvement-01-0.9504.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.95038 to 0.95294, saving model to modelsV3/val_acc-improvement-02-0.9529.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.95294 to 0.95393, saving model to modelsV3/val_acc-improvement-03-0.9539.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.95393
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.95393
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.95393
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.95393
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.95393
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.95393
Epoch 10/10

Epoch 00010: val_acc improved from 0.95393 to 0.95425, saving model to modelsV3/val_acc-improvement-10-0.9543.hdf5


In [16]:
#model= load_model(encoded_data_path +  "weights-02-0.9058.hdf5")

y_predicted = model.predict(x_test)
y_predicted = y_predicted.argmax(axis= 2)

print(y_test.shape, y_predicted.shape)
print(y_test.ravel().shape, y_predicted.ravel().shape)

print ('\nConfusion Matrix:')

confusion_mat = confusion_matrix(y_test.ravel(), y_predicted.ravel())

print ('\nClassification Report:')
print (classification_report(y_test.ravel(), y_predicted.ravel()))


pd.DataFrame(confusion_mat, columns = classes, index = classes)

(5281, 25) (5281, 25)
(132025,) (132025,)

Confusion Matrix:

Classification Report:
             precision    recall  f1-score   support

          0       0.29      0.25      0.27      1103
          1       0.34      0.41      0.38      1505
          2       0.30      0.26      0.27       971
          3       0.54      0.17      0.26       774
          4       0.36      0.23      0.28       898
          5       0.35      0.26      0.30       781
          6       0.56      0.71      0.63      4230
          7       1.00      1.00      1.00    121763

avg / total       0.95      0.95      0.95    132025



Unnamed: 0,Rachel Green,Ross Geller,Chandler Bing,Monica Geller,Joey Tribbiani,Phoebe Buffay,Others,None
Rachel Green,275,237,92,11,47,70,348,23
Ross Geller,134,623,91,16,75,63,472,31
Chandler Bing,90,149,249,21,61,47,334,20
Monica Geller,88,112,99,130,32,62,244,7
Joey Tribbiani,63,186,90,19,203,36,292,9
Phoebe Buffay,98,123,63,16,25,202,246,8
Others,172,359,147,25,111,91,2998,327
,20,26,12,2,9,7,382,121305


In [58]:
x1 = np.arange(0,15,1)
y1 = history1.history["loss"]
name1 = "Train Loss"

x2 = np.arange(0,15,1)
y2 = history1.history["val_loss"]
name2 = "Validation Loss"

plot_graph.trace(x1, y1, name1, x2, y2, name2, "Train Vs Validation Loss")


In [49]:
# using Speaker's embedding as a feature 

MAX_SENT_LEN2 = 25

# to handle unknown words
words.append("UNK")

word2idx2 = {w: i for i, w in enumerate(words + classes)}
class2idx = {t: i for i, t in enumerate(classes)}

# Adding Speaker at front
X2 = [[word2idx2[row["speaker"]]] + [word2idx[w] for w in row["tokens"]] for row in data]
X_padded2 = pad_sequences(X2, maxlen = MAX_SENT_LEN, padding = "post", value = word2idx['ENDPAD'], truncating = "post")

# Adding None to compensate for adding Speaker at front
Y2 = [[class2idx['None']] + [class2idx[l] for l in row] for row in labels]
Y_padded2 = pad_sequences(Y2, maxlen = MAX_SENT_LEN, padding="post", value = class2idx["None"], truncating = "post")


In [50]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(X_padded2, Y_padded2, test_size = 0.2)

# one hot encoding for Y
y_train_ohe2 = np.array([to_categorical(i, num_classes= NUM_CLASSES) for i in y_train2])
y_test_ohe2 = np.array([to_categorical(i, num_classes= NUM_CLASSES) for i in y_test2])

print("x_train2:", type(x_train2), x_train2.shape)
print("y_train2:", type(y_train2), y_train2.shape)
print("y_train_ohe2:", type(y_train_ohe2), y_train_ohe2.shape)


print("\nx_test2:", type(x_test2), x_test2.shape)
print("y_test2:", type(y_test2), y_test2.shape)
print("y_test_ohe2:", type(y_test_ohe2), y_test_ohe2.shape)


x_train2: <class 'numpy.ndarray'> (21121, 25)
y_train2: <class 'numpy.ndarray'> (21121, 25)
y_train_ohe2: <class 'numpy.ndarray'> (21121, 25, 8)

x_test2: <class 'numpy.ndarray'> (5281, 25)
y_test2: <class 'numpy.ndarray'> (5281, 25)
y_test_ohe2: <class 'numpy.ndarray'> (5281, 25, 8)


In [22]:
print(x_train2[1])
print(y_train2[1])
print(y_train_ohe2[1])

[10118  4175  7689  6335  9663  8055  4795  6335  9663  8055  3954  6713
  8087 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116 10116
 10116]
[7 7 7 7 0 7 7 7 0 7 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
[[0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In [42]:
encoded_data_path = "modelsV3/"

VOCAB_SIZE2 = len(words + classes)

EPOCHS = 10
BATCH_SIZE = 16

input = Input(shape=(MAX_SENT_LEN2,))

model = Embedding(input_dim = VOCAB_SIZE2, output_dim = 50, input_length = MAX_SENT_LEN2)(input)
model = Dropout(0.2)(model)

# each LSTM unit returns a sequence of output (of length same as i/p), one for each time step in the input data
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.2))(model)

# It highlights that we intend to output NUM_CLASSES time step from the sequence for each time step in the input.
# 
out = TimeDistributed(Dense(NUM_CLASSES, activation="softmax"))(model)  

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# creating checkpoint to save model every time validation accuracy improves
filepath = encoded_data_path + "val_acc2-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

print('\n\nTraining Model...')

history2 = model.fit(x_train2, y_train_ohe2,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              callbacks = callbacks_list,
              validation_data = (x_test2, y_test_ohe2))





Training Model...
Train on 21121 samples, validate on 5281 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.96058, saving model to modelsV3/val_acc2-improvement-01-0.9606.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.96058 to 0.97313, saving model to modelsV3/val_acc2-improvement-02-0.9731.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.97313 to 0.97379, saving model to modelsV3/val_acc2-improvement-03-0.9738.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.97379 to 0.97474, saving model to modelsV3/val_acc2-improvement-04-0.9747.hdf5
Epoch 5/10

Epoch 00006: val_acc did not improve from 0.97474
Epoch 7/10

Epoch 00007: val_acc improved from 0.97474 to 0.97488, saving model to modelsV3/val_acc2-improvement-07-0.9749.hdf5
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.97488
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.97488
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.97488


In [43]:
model= load_model("modelsV3/val_acc2-improvement-07-0.9749.hdf5")

y_predicted = model.predict(x_test2)
y_predicted = y_predicted.argmax(axis= 2)

print(y_test2.shape, y_predicted.shape)
print(y_test2.ravel().shape, y_predicted.ravel().shape)

print ('\nConfusion Matrix:')

confusion_mat = confusion_matrix(y_test2.ravel(), y_predicted.ravel())

print ('\nClassification Report:')
print (classification_report(y_test2.ravel(), y_predicted.ravel()))

pd.DataFrame(confusion_mat, columns = classes, index = classes)

(5281, 25) (5281, 25)
(132025,) (132025,)

Confusion Matrix:

Classification Report:
             precision    recall  f1-score   support

          0       0.75      0.67      0.71      1018
          1       0.69      0.62      0.65      1541
          2       0.85      0.66      0.74       954
          3       0.93      0.66      0.77       773
          4       0.69      0.63      0.66       964
          5       0.79      0.70      0.74       803
          6       0.66      0.83      0.74      4099
          7       1.00      1.00      1.00    121873

avg / total       0.98      0.97      0.97    132025



Unnamed: 0,Rachel Green,Ross Geller,Chandler Bing,Monica Geller,Joey Tribbiani,Phoebe Buffay,Others,None
Rachel Green,683,40,12,0,29,15,229,10
Ross Geller,26,951,20,6,91,32,387,28
Chandler Bing,32,50,625,4,16,16,193,18
Monica Geller,29,47,8,508,25,16,137,3
Joey Tribbiani,18,73,5,6,606,25,219,12
Phoebe Buffay,18,50,7,0,15,566,137,10
Others,80,151,42,13,85,38,3420,270
,19,17,14,11,10,10,442,121350


In [53]:
x1 = np.arange(0,11,1)
y1 = history2.history["loss"]
name1 = "Train Loss"

x2 = np.arange(0,11,1)
y2 = history2.history["val_loss"]
name2 = "Validation Loss"

plot_graph.trace(x1, y1, name1, x2, y2, name2, "Train Vs Validation Loss")

In [51]:
encoded_data_path = "modelsV3/"

VOCAB_SIZE2 = len(words + classes)

EPOCHS = 10
BATCH_SIZE = 16

input = Input(shape=(MAX_SENT_LEN2,))

model = Embedding(input_dim = VOCAB_SIZE2, output_dim = 50, input_length = MAX_SENT_LEN2)(input)
model = Dropout(0.2)(model)

# each LSTM unit returns a sequence of output (of length same as i/p), one for each time step in the input data
model = LSTM(units=100, return_sequences=True, recurrent_dropout=0.2, activation='relu')(model)

model = TimeDistributed(Dense(32, activation="relu"))(model) 

out = TimeDistributed(Dense(NUM_CLASSES, activation="softmax"))(model)  

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# creating checkpoint to save model every time validation accuracy improves
filepath = encoded_data_path + "val_acc3-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

print('\n\nTraining Model...')

history3 = model.fit(x_train2, y_train_ohe2,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              callbacks = callbacks_list,
              validation_data = (x_test2, y_test_ohe2))





Training Model...
Train on 21121 samples, validate on 5281 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.95977, saving model to modelsV3/val_acc3-improvement-01-0.9598.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.95977 to 0.97253, saving model to modelsV3/val_acc3-improvement-02-0.9725.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.97253 to 0.97341, saving model to modelsV3/val_acc3-improvement-03-0.9734.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.97341
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.97341
Epoch 6/10

Epoch 00006: val_acc improved from 0.97341 to 0.97357, saving model to modelsV3/val_acc3-improvement-06-0.9736.hdf5
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.97357
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.97357
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.97357
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.97357


In [52]:
model= load_model("modelsV3/val_acc3-improvement-06-0.9736.hdf5")

y_predicted = model.predict(x_test2)
y_predicted = y_predicted.argmax(axis= 2)

print(y_test2.shape, y_predicted.shape)
print(y_test2.ravel().shape, y_predicted.ravel().shape)

print ('\nConfusion Matrix:')

confusion_mat = confusion_matrix(y_test2.ravel(), y_predicted.ravel())

print ('\nClassification Report:')
print (classification_report(y_test2.ravel(), y_predicted.ravel()))

pd.DataFrame(confusion_mat, columns = classes, index = classes)

(5281, 25) (5281, 25)
(132025,) (132025,)

Confusion Matrix:

Classification Report:
             precision    recall  f1-score   support

          0       0.76      0.67      0.71      1096
          1       0.73      0.65      0.69      1454
          2       0.91      0.60      0.72       959
          3       0.96      0.69      0.80       792
          4       0.70      0.63      0.67       947
          5       0.89      0.66      0.76       741
          6       0.61      0.87      0.72      4101
          7       1.00      0.99      1.00    121935

avg / total       0.98      0.97      0.97    132025



Unnamed: 0,Rachel Green,Ross Geller,Chandler Bing,Monica Geller,Joey Tribbiani,Phoebe Buffay,Others,None
Rachel Green,731,43,6,1,16,6,280,13
Ross Geller,15,945,14,1,71,9,368,31
Chandler Bing,33,32,578,2,14,12,278,10
Monica Geller,17,42,5,547,19,6,153,3
Joey Tribbiani,35,38,2,0,599,4,254,15
Phoebe Buffay,19,41,6,0,12,491,165,7
Others,92,130,23,10,102,10,3556,178
,21,15,4,8,19,16,764,121088
