In [1]:
import re
import logging
import pandas as pd
import numpy as np
# remove warnings
logging.basicConfig(level=logging.ERROR)

In [2]:
documents = pd.read_csv('./datos/simpsons_dataset.csv')
documents = documents.dropna()
documents = documents.reset_index(drop=True)
documents = documents.drop_duplicates()
print(documents.shape)
documents.head()

(126646, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
# We just need to run this code once, the function glove2word2vec saves the Glove embeddings in the word2vec format 
# that will be loaded in the next section
from gensim.scripts.glove2word2vec import glove2word2vec

glove_filename = './datos/glove.6B.100d.txt'

word2vec_output_file = glove_filename+'.word2vec'
glove2word2vec(glove_filename, word2vec_output_file)

  glove2word2vec(glove_filename, word2vec_output_file)


(400000, 100)

In [4]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
word2vec_output_file = glove_filename+'.word2vec'
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [5]:
#Show a word embedding
print('King: ',model.get_vector('king'))

King:  [-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -

In [6]:
documents.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [7]:
# convert df to dictionary of lists key first column value second column
def convert_to_dict(df):
    characters = {}
    for i in range(len(documents)):
        row = documents.iloc[i]
        key = row['raw_character_text'].lower()
        if key not in characters:
            characters[key] = []
        characters[key].append(row['spoken_words'])
    return characters

In [8]:
characters = convert_to_dict(documents)

In [9]:
def filter_characters_with_less_than_five_sentences(characters):
    filtered_characters = {}
    for key, value in characters.items():
        if len(value) > 5:
            filtered_characters[key] = value
    return filtered_characters

In [10]:
characters = filter_characters_with_less_than_five_sentences(characters)

In [11]:
# extract vector for a sentence
def get_vector_sentence(sentence: str, model: KeyedVectors):
    sentence = sentence.lower()
    words = sentence.split()
    vector = np.zeros(100)
    counter = 0
    for word in words:
        try:
            vector += model.get_vector(word)
            counter += 1
        except:
            pass
    if counter > 0:
        vector = vector / counter
    return vector


In [12]:
# test get_vector_sentence function
get_vector_sentence('Kids, You Tried Your Best And You Failed Miserably. The Lesson Is, Never Try.', model)

array([-4.16756973e-02,  2.20105797e-01,  4.33127097e-01, -2.99525002e-01,
       -2.36539803e-01,  2.56168398e-01, -1.82075300e-01,  9.35989976e-02,
       -2.26369873e-03, -1.62461992e-02,  3.50033002e-01,  3.96226041e-02,
        6.35688014e-02,  1.38804903e-01, -1.21304397e-01, -1.02700445e-01,
        1.70876680e-01,  2.78798301e-01, -6.20794398e-01,  3.39901002e-01,
        1.05124820e-01, -2.98347034e-02,  1.16399007e-01, -2.67723906e-01,
        1.87131000e-01,  1.26932808e-01, -4.64455090e-01, -6.56939998e-01,
        3.25197880e-01, -3.69602996e-01, -1.40659975e-02,  7.84993008e-01,
        6.52636515e-02,  1.44387382e-01,  2.03569971e-02,  2.58452199e-01,
       -2.33882001e-01,  2.03594780e-01,  2.36797002e-01, -3.10310204e-01,
       -3.41002197e-01, -1.36337866e-01,  7.51800984e-02, -5.39280000e-01,
       -3.15784391e-01,  1.40125496e-01,  1.16979796e-01, -4.33540998e-01,
        7.88982997e-02, -9.76410013e-01, -1.31981299e-01,  9.49864600e-02,
        7.50204956e-02,  

In [13]:
def get_sublist(list, n):
    return [list[i:i+n] for i in range(0, len(list), n)]



In [14]:
# extract vectors for all characters in groups of five sentences
def get_vectors_characters(characters: dict) -> dict:
    characters_vectors = {}
    for key, value in characters.items():
        if key not in characters_vectors:
            characters_vectors[key] = []
        sentences_big = get_sublist(value, 5)
        for sentences in sentences_big:
            sentences_counter = 0
            sentence_vector = np.zeros(100)
            for sentence in sentences:
                sentence_vector += get_vector_sentence(sentence, model)
                sentences_counter += 1
            if sentences_counter > 0:
                sentence_vector = sentence_vector / sentences_counter
            characters_vectors[key].append(sentence_vector)
    return characters_vectors

In [15]:
characters_vectors= get_vectors_characters(characters)



In [16]:
len(characters_vectors.keys())

1252

In [17]:
characters_selected = {}
for key, value in characters_vectors.items():
    if len(characters_vectors[key]) > 1000:
        characters_selected[key] = value

In [18]:
lineas = ([ len(characters_selected[key]) for key in characters_selected.keys()])

X = np.zeros((np.sum(lineas), 100))
for key, value in characters_selected.items():
    for i in range(len(value)):
        X[i*len(value):(i+1)*len(value), :] = value[i]

y = np.concatenate( [np.zeros(lineas[0]), np.ones(lineas[1]),np.ones(lineas[2])*2,np.ones(lineas[3])*3]).astype(int)

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [20]:
from scipy import misc  
from keras import layers  
from keras.layers import Flatten, Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D, Dropout  
from keras.models import Sequential, Model, load_model  
from keras.utils import layer_utils, np_utils  
from keras.utils.data_utils import get_file  
from keras.applications.imagenet_utils import preprocess_input  
from keras.utils.vis_utils import model_to_dot  
from keras.callbacks import ModelCheckpoint  
from sklearn.metrics import confusion_matrix, classification_report  
import tensorflow as tf  

## **Arquitectura 1**

In [21]:
def create_simple_nn():  
    model = Sequential()
    model.add(Dense(100,  activation='relu', name="Input_layer"))
    model.add(Dense(50, activation='relu', name="Hidden_layer_1"))
    model.add(Dense(50, activation='relu', name="Hidden_layer_2"))
    model.add(Dense(4, activation='softmax', name="Output_layer"))

    return model

In [22]:
snn_model = create_simple_nn()  
snn_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse'])  

In [23]:
def convert_to_categorical(a):
    b = np.zeros((a.size, a.max()+1))
    b[np.arange(a.size),a] = 1
    return b

y_train_categorical = convert_to_categorical(y_train)
y_test_categorical = convert_to_categorical(y_test)

In [24]:
snn = snn_model.fit(x=X_train, y=y_train_categorical, batch_size=100, epochs=10, validation_data=(X_test, y_test_categorical), shuffle=True)


Epoch 1/10


2022-07-12 22:44:05.051469: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
snn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 50)                5050      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 17,904
Trainable params: 17,904
Non-trainable params: 0
_________________________________________________________________


In [26]:
snn_model.evaluate(X_test, y_test_categorical)



[0.38870611786842346, 0.8919587731361389, 0.04840464890003204]

In [27]:
snn_pred = snn_model.predict(X_test, batch_size=100, verbose=1) 
snn_predicted = np.argmax(snn_pred, axis=1)  




In [28]:
characters_selected.keys()

dict_keys(['lisa simpson', 'bart simpson', 'homer simpson', 'marge simpson'])

In [29]:
snn_report = classification_report(np.argmax(y_test_categorical, axis=1), snn_predicted)  
print(snn_report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.90       423
           1       0.80      0.82      0.81       492
           2       0.93      0.90      0.92      1041
           3       1.00      0.85      0.92       469

    accuracy                           0.89      2425
   macro avg       0.89      0.89      0.89      2425
weighted avg       0.90      0.89      0.89      2425



## **Arquitectura 2**

In [30]:
def create_simple_nn():  
    model = Sequential()
    model.add(Dense(100,  activation='relu', name="Input_layer"))
    model.add(Dense(100, activation='relu', name="Hidden_layer_1"))
    model.add(Dense(100, activation='relu', name="Hidden_layer_2"))
    model.add(Dense(50, activation='relu', name="Hidden_layer_3"))
    model.add(Dense(4, activation='softmax', name="Output_layer"))

    return model

In [31]:
snn_model = create_simple_nn()  
snn_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse'])  

In [32]:
def convert_to_categorical(a):
    b = np.zeros((a.size, a.max()+1))
    b[np.arange(a.size),a] = 1
    return b

y_train_categorical = convert_to_categorical(y_train)
y_test_categorical = convert_to_categorical(y_test)

In [33]:
snn = snn_model.fit(x=X_train, y=y_train_categorical, batch_size=100, epochs=10, validation_data=(X_test, y_test_categorical), shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
snn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_3 (Dense)      (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 35,554
Trainable params: 35,554
Non-trainable params: 0
_________________________________________________________________


In [35]:
snn_model.evaluate(X_test, y_test_categorical)



[0.33636176586151123, 0.8919587731361389, 0.04549537971615791]

In [36]:
snn_pred = snn_model.predict(X_test, batch_size=100, verbose=1) 
snn_predicted = np.argmax(snn_pred, axis=1) 



In [37]:
snn_report = classification_report(np.argmax(y_test_categorical, axis=1), snn_predicted)  
print(snn_report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.90       423
           1       0.80      0.82      0.81       492
           2       0.93      0.90      0.92      1041
           3       1.00      0.85      0.92       469

    accuracy                           0.89      2425
   macro avg       0.89      0.89      0.89      2425
weighted avg       0.90      0.89      0.89      2425



## **Arquitectura 3**

In [38]:
def create_simple_nn():  
    model = Sequential()
    model.add(Dense(100,  activation='relu', name="Input_layer"))
    model.add(Dense(80, activation='relu', name="Hidden_layer_1"))
    model.add(Dense(80, activation='relu', name="Hidden_layer_2"))
    model.add(Dense(50, activation='relu', name="Hidden_layer_3"))
    model.add(Dense(4, activation='softmax', name="Output_layer"))

    return model

In [39]:
snn_model = create_simple_nn()  
snn_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse'])  

In [40]:
def convert_to_categorical(a):
    b = np.zeros((a.size, a.max()+1))
    b[np.arange(a.size),a] = 1
    return b

y_train_categorical = convert_to_categorical(y_train)
y_test_categorical = convert_to_categorical(y_test)

In [41]:
snn = snn_model.fit(x=X_train, y=y_train_categorical, batch_size=100, epochs=20, validation_data=(X_test, y_test_categorical), shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [42]:
snn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 80)                8080      
                                                                 
 Hidden_layer_2 (Dense)      (None, 80)                6480      
                                                                 
 Hidden_layer_3 (Dense)      (None, 50)                4050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 28,914
Trainable params: 28,914
Non-trainable params: 0
_________________________________________________________________


In [43]:
snn_model.evaluate(X_test, y_test_categorical)



[0.29747289419174194, 0.8919587731361389, 0.04478830099105835]

In [44]:
snn_pred = snn_model.predict(X_test, batch_size=100, verbose=1) 
snn_predicted = np.argmax(snn_pred, axis=1) 



In [45]:
snn_report = classification_report(np.argmax(y_test_categorical, axis=1), snn_predicted)  
print(snn_report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.90       423
           1       0.80      0.82      0.81       492
           2       0.93      0.90      0.92      1041
           3       1.00      0.85      0.92       469

    accuracy                           0.89      2425
   macro avg       0.89      0.89      0.89      2425
weighted avg       0.90      0.89      0.89      2425

