In [1]:
import re
import gensim
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [2]:
# Paths to model files and data (both are relative paths)
# CSV file that is read into `data_raw` further down
dataset_path = 'simpsons_dataset.csv'
# Directory containing the saved Word2Vec model files loaded in the next cell
models_path = './Modelos'

In [3]:
# Load all embedding models
# Three saved Word2Vec models are loaded from `models_path`; the rest of the
# notebook refers to them as the size 50 / 100 / 200 embeddings.
# NOTE(review): gensim's load() unpickles the file contents, so only load
# model files that come from a trusted source.
model_50 = gensim.models.Word2Vec.load(f'{models_path}/Simpsons_50_02.model')
model_100 = gensim.models.Word2Vec.load(f'{models_path}/Simpsons_100_02.model')
model_200 = gensim.models.Word2Vec.load(f'{models_path}/Simpsons_200_02.model')

In [4]:
# Read CSV data
# The columns used later are raw_character_text (speaker name) and
# spoken_words (the dialogue line). The bare last line displays the frame.
data_raw = pd.read_csv(dataset_path)
data_raw

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


# Preprocessing

In [5]:
def expand_contractions(document: str) -> str:
    """
    Expand English contractions in ``document`` into their long forms.

    The substitutions are applied one after another in the order listed
    below, so the specific cases ("won't", "can't") are handled before the
    generic "n't" rule. Matching is case-sensitive (all patterns are
    lowercase) and the patterns are not anchored to word boundaries, so an
    apostrophe followed by one of the listed letters is rewritten wherever
    it occurs. In particular every "'s" becomes " is", possessives included.

    Returns the rewritten string.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r"'cause", "because"),
        (r"o'clock", "of the clock"),
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        document = re.sub(pattern, expansion, document)
    return document

In [6]:
def replace_numbers(document: str) -> str:
    """
    Replace numeric expressions in ``document`` with the token 'number'.

    The rules run in sequence, each on the output of the previous one:
      1. tokens that mix letters and digits (e.g. "2nd"),
      2. decimal numbers (e.g. "2.1"),
      3. runs of digit groups separated by single whitespace (e.g. "220 888"),
      4. runs of adjacent 'number' tokens produced by the rules above,
         which are collapsed into a single 'number'.

    All patterns are raw strings so that the regex escapes for "." and
    whitespace are not interpreted as (invalid) Python string escapes.

    Returns the rewritten string.
    """
    # Case 1: Combination of numbers and letters (Eg. 2nd -> number)
    document = re.sub(r'[a-zA-Z]+[0-9]+[a-zA-Z]+', 'number', document)
    document = re.sub(r'[0-9]+[a-zA-Z]+|[a-zA-Z]+[0-9]+', 'number', document)
    # Case 2: Decimal numbers (Eg. 2.1 -> number)
    document = re.sub(r'[0-9]+\.+[0-9]+', 'number', document)
    # Case 3: Numbers between spaces (Eg. 220 888 -> number)
    document = re.sub(r'([0-9]+\s)*[0-9]+', 'number', document)
    # Case 4: One or more of the previous cases (Eg. number number -> number)
    document = re.sub(r'((number)+\s)*(number)+', 'number', document)
    return document

In [7]:
def preprocessing(document: str) -> list:
    """
    Normalise a raw sentence and split it into word tokens.

    Steps, in order: lowercase the text, expand contractions, replace
    numeric expressions with 'number', turn every run of non-letter
    characters into a single space, and split on whitespace.

    Returns the list of tokens (empty if no letters remain).
    """
    normalised = replace_numbers(expand_contractions(document.lower()))
    # Anything that is not an ASCII letter acts as a token separator.
    letters_only = re.sub('[^A-Za-z]+', ' ', normalised)
    return letters_only.split()

In [8]:
def group_sentences(data: pd.DataFrame, characters: list, sentences_per_group: int) -> pd.DataFrame:
    """
    Group sentences from the same character into longer texts.

    For every name in ``characters`` the rows of ``data`` whose
    'raw_character_text' equals that name are taken in their existing order
    and their 'spoken_words' are joined, ``sentences_per_group`` at a time,
    with single spaces. The last group of a character may be shorter.

    Parameters:
        data: frame with 'raw_character_text' and 'spoken_words' columns.
        characters: character names to keep, in output order.
        sentences_per_group: number of sentences joined per output row (>= 1).

    Returns:
        DataFrame with columns 'character' and 'sentences'. The index
        restarts at 0 for each character, so index labels repeat across
        characters. If ``characters`` is empty, an empty frame with the
        two columns is returned.

    Raises:
        ValueError: if ``sentences_per_group`` is smaller than 1.
    """
    if sentences_per_group < 1:
        raise ValueError("sentences_per_group must be a positive integer")

    # Collect one frame per character and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    frames = []
    for character in characters:
        spoken = data.loc[data['raw_character_text'] == character, 'spoken_words']
        groups = [
            ' '.join(spoken.iloc[start:start + sentences_per_group])
            for start in range(0, len(spoken), sentences_per_group)
        ]
        frames.append(pd.DataFrame({'character': character, 'sentences': groups}))

    if not frames:
        # Keep the expected columns so downstream column lookups still work.
        return pd.DataFrame(columns=['character', 'sentences'])
    return pd.concat(frames)

In [9]:
def words_in_embeddings(sentence: str):
    """
    Tell whether every token of ``sentence`` has an embedding.

    The sentence is tokenised with preprocessing() and each token is looked
    up in the vocabulary of model_50 only. Returns True when all tokens are
    present (also when there are no tokens at all), False otherwise.
    """
    vocabulary = model_50.wv.key_to_index
    return all(token in vocabulary for token in preprocessing(sentence))

In [10]:
# Remove null values and duplicate rows
# The result gets a new name, so data_raw itself is left unchanged.
data = data_raw.dropna().drop_duplicates()
data

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


In [11]:
# Remove sentences without vector representation
# Keeps only the rows for which words_in_embeddings() returns True, i.e. every
# preprocessed token of spoken_words is in model_50's vocabulary.
data_filtered = data[data['spoken_words'].apply(words_in_embeddings)]
data_filtered

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158308,Miss Hoover,"Good morning, Lisa."
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158312,Ralph Wiggum,Does that mean you were crazy?


In [12]:
# Get most common characters
# Number of remaining dialogue lines per character; used to pick the classes below.
data_filtered['raw_character_text'].value_counts()

Homer Simpson         22745
Marge Simpson         11015
Bart Simpson          10768
Lisa Simpson           8645
Moe Szyslak            2241
                      ...  
SPOILED 2-YEAR-OLD        1
Sun                       1
Applicants                1
Statue of Liberty         1
6th Graders               1
Name: raw_character_text, Length: 5548, dtype: int64

In [13]:
# Filter dataset leaving the four most common characters
# These four names are the classes the classifiers below are trained to predict.
characters = ['Homer Simpson','Marge Simpson','Bart Simpson','Lisa Simpson']
# `@characters` lets query() read the Python list defined on the previous line.
# NOTE(review): this rebinds data_filtered, so re-running the value_counts cell
# after this one would only show these four characters.
data_filtered = data_filtered.query('raw_character_text in @characters')
data_filtered

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?
...,...,...
158299,Lisa Simpson,Can we have wine?
158301,Lisa Simpson,Can I have wine?
158303,Lisa Simpson,Does Bart have to be there?
158305,Lisa Simpson,Can we do it this week?


In [14]:
# Create new dataset with groups of five sentences
# Each row of data_grouped holds up to five lines of one character joined into a
# single text; these grouped texts are the samples used for classification.
data_grouped = group_sentences(data_filtered, characters, 5)
data_grouped

Unnamed: 0,character,sentences
0,Homer Simpson,Never thrown a party? What about that big bash...
1,Homer Simpson,I don't think you realize what you're saying. ...
2,Homer Simpson,"No, no, no. I just wish I knew what to say. Al..."
3,Homer Simpson,"Come on, you're holding out on me. Think nothi..."
4,Homer Simpson,And is this Martin guy going to get to do anyt...
...,...,...
1724,Lisa Simpson,"No, no. You don't understand. When Mr. Bergstr..."
1725,Lisa Simpson,"Yes! Yes, Mr. Bergstrom? No. Homework's not my..."
1726,Lisa Simpson,When? Mr. Bergstrom! Ewww. Gross. Oh Lord. He ...
1727,Lisa Simpson,"Oh, Mom, that's wonderful. Can I find out his ..."


In [15]:
# Separate the data in training, test and validation sets
# First split: 20% of all samples are held out as the test set.
# Second split: 20% of the remaining 80% becomes validation, giving roughly
# 64% train / 16% validation / 20% test. random_state=0 fixes both splits.
# NOTE(review): no `stratify` argument is passed, so class proportions are not
# forced to match across the three sets.
x_train_val, x_test, y_train_val, y_test = train_test_split(data_grouped['sentences'], data_grouped['character'], test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2, random_state=0)

In [16]:
# Count each character's appearances in each set
# One row per split (train / validation / test), one column per character.
summary = pd.DataFrame([y_train.value_counts(), y_val.value_counts(), y_test.value_counts()], index=['train', 'validation', 'test'])
summary

Unnamed: 0,Homer Simpson,Marge Simpson,Bart Simpson,Lisa Simpson
train,2948,1416,1358,1084
validation,720,359,344,279
test,881,428,452,366


# Size 50 embeddings

In [17]:
def sentence_to_embedding_50(sentence: str):
    """
    Embed a sentence with model_50 by averaging its word vectors.

    The sentence is tokenised with preprocessing(), each token is looked up
    in model_50, and the element-wise mean over all tokens is returned.
    Assumes every token is in the model's vocabulary (the notebook filters
    sentences with words_in_embeddings beforehand).
    """
    word_vectors = model_50.wv[preprocessing(sentence)]
    return np.mean(word_vectors, axis=0)

In [18]:
# Transform sentences into vectors
# Every grouped text is replaced by its mean model_50 word vector, so each
# element of the resulting Series is one array per sample.
x_train_50 = x_train.apply(sentence_to_embedding_50)
x_val_50 = x_val.apply(sentence_to_embedding_50)
x_test_50 = x_test.apply(sentence_to_embedding_50)
x_train_50

1133    [0.08307874, -0.34154454, -0.21555215, -0.6717...
877     [0.0988574, -0.113601886, -0.051913228, -1.012...
3829    [0.29998192, -0.33769158, -0.27057227, -0.2832...
1182    [0.3900361, -0.3936474, -0.17824244, -0.492518...
1518    [0.24746819, -0.43653318, -0.015143045, -0.550...
                              ...                        
3035    [0.44571173, -0.38923872, -0.15169472, -0.2831...
3604    [0.2916982, -0.0005521466, -0.17822812, -0.224...
3425    [0.22261555, -0.087218, -0.025895413, -0.42048...
3717    [0.40869293, -0.290884, -0.4478682, -0.7300225...
3266    [0.18771069, 0.02331811, -0.011993099, -0.3344...
Name: sentences, Length: 6806, dtype: object

In [19]:
# One-Hot encode labels
# The encoder is fitted on the training labels only and then reused for the
# validation and test labels, so all three arrays share the same column order.
# The same encoder is used later to map predictions back to character names.
encoder = LabelBinarizer()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)
y_test_encoded = encoder.transform(y_test)
y_train_encoded

array([[0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

## Architecture 1

In [20]:
# Create the classification model
# Fully connected network: three ReLU layers (50, 25, 25 units) followed by a
# 4-unit softmax output, one unit per character.
# NOTE(review): "Input_layer" is itself a Dense layer; no input shape is
# declared in this cell.
model = Sequential()
model.add(Dense(50,  activation='relu', name="Input_layer"))
model.add(Dense(25, activation='relu', name="Hidden_layer_1"))
model.add(Dense(25, activation='relu', name="Hidden_layer_2"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [21]:
# Compile the model
# Categorical cross-entropy pairs with the one-hot encoded labels; training
# uses the 'sgd' optimizer and reports the 'acc' metric.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [22]:
# Train the model
# The Series of per-sample vectors is converted to a float32 array before it is
# handed to Keras. Trains for 20 epochs with batches of 32 shuffled samples;
# the validation set is passed for per-epoch monitoring.
history = model.fit(x=np.asarray(x_train_50.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_50.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 50)                2550      
                                                                 
 Hidden_layer_1 (Dense)      (None, 25)                1275      
                                                                 
 Hidden_layer_2 (Dense)      (None, 25)                650       
                                                                 
 Output_layer (Dense)        (None, 4)                 104       
                                                                 
Total params: 4,579
Trainable params: 4,579
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Return predictions - the chosen class is the one with the highest probability
# argmax picks the most probable of the 4 classes for each test sample;
# to_categorical turns that index back into a one-hot row so that
# encoder.inverse_transform() can map it to a character name in the next cell.
y_pred_50 = model.predict(np.asarray(x_test_50.to_list()).astype('float32'))
y_pred_50 = to_categorical(np.argmax(y_pred_50, axis=1), 4)
y_pred_50



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [25]:
# Per-character precision / recall / F1 on the test set: true names versus the
# one-hot predictions decoded back to names by the label encoder.
print(classification_report(y_test, encoder.inverse_transform(y_pred_50)))

               precision    recall  f1-score   support

 Bart Simpson       0.39      0.05      0.09       452
Homer Simpson       0.49      0.87      0.63       881
 Lisa Simpson       0.51      0.16      0.24       366
Marge Simpson       0.40      0.38      0.39       428

     accuracy                           0.47      2127
    macro avg       0.45      0.36      0.34      2127
 weighted avg       0.46      0.47      0.40      2127



## Architecture 2

In [26]:
# Create the classification model
# Deeper variant: four ReLU layers (50, 50, 50, 25 units) followed by a 4-unit
# softmax output. Rebinding `model` discards the previous architecture.
model = Sequential()
model.add(Dense(50,  activation='relu', name="Input_layer"))
model.add(Dense(50, activation='relu', name="Hidden_layer_1"))
model.add(Dense(50, activation='relu', name="Hidden_layer_2"))
model.add(Dense(25, activation='relu', name="Hidden_layer_3"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [27]:
# Compile the model
# Same loss / optimizer / metric settings as the other architectures, so the
# results differ only through the network shape and the embeddings.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [28]:
# Train the model
# Size 50 sentence vectors, converted to float32 arrays; 20 epochs with
# batches of 32 shuffled samples and the validation set passed for monitoring.
history = model.fit(x=np.asarray(x_train_50.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_50.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 50)                2550      
                                                                 
 Hidden_layer_1 (Dense)      (None, 50)                2550      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Hidden_layer_3 (Dense)      (None, 25)                1275      
                                                                 
 Output_layer (Dense)        (None, 4)                 104       
                                                                 
Total params: 9,029
Trainable params: 9,029
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Return predictions - the chosen class is the one with the highest probability
# The class probabilities are reduced to the argmax index and re-encoded as
# one-hot rows (4 classes) so the label encoder can decode them afterwards.
# This overwrites the y_pred_50 produced by the previous architecture.
y_pred_50 = model.predict(np.asarray(x_test_50.to_list()).astype('float32'))
y_pred_50 = to_categorical(np.argmax(y_pred_50, axis=1), 4)
y_pred_50



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [31]:
# Test-set precision / recall / F1 per character for this architecture
print(classification_report(y_test, encoder.inverse_transform(y_pred_50)))

               precision    recall  f1-score   support

 Bart Simpson       0.36      0.16      0.22       452
Homer Simpson       0.50      0.83      0.62       881
 Lisa Simpson       0.39      0.32      0.35       366
Marge Simpson       0.53      0.18      0.27       428

     accuracy                           0.47      2127
    macro avg       0.45      0.37      0.37      2127
 weighted avg       0.46      0.47      0.42      2127



## Architecture 3

In [32]:
# Create the classification model
# Tapered variant: four ReLU layers (50, 40, 40, 25 units) followed by a
# 4-unit softmax output. Rebinding `model` discards the previous architecture.
model = Sequential()
model.add(Dense(50,  activation='relu', name="Input_layer"))
model.add(Dense(40, activation='relu', name="Hidden_layer_1"))
model.add(Dense(40, activation='relu', name="Hidden_layer_2"))
model.add(Dense(25, activation='relu', name="Hidden_layer_3"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [33]:
# Compile the model
# Same loss / optimizer / metric settings as the other architectures.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [34]:
# Train the model
# Size 50 sentence vectors, converted to float32 arrays; 20 epochs with
# batches of 32 shuffled samples and the validation set passed for monitoring.
history = model.fit(x=np.asarray(x_train_50.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_50.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [35]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 50)                2550      
                                                                 
 Hidden_layer_1 (Dense)      (None, 40)                2040      
                                                                 
 Hidden_layer_2 (Dense)      (None, 40)                1640      
                                                                 
 Hidden_layer_3 (Dense)      (None, 25)                1025      
                                                                 
 Output_layer (Dense)        (None, 4)                 104       
                                                                 
Total params: 7,359
Trainable params: 7,359
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Return predictions - the chosen class is the one with the highest probability
# The class probabilities are reduced to the argmax index and re-encoded as
# one-hot rows (4 classes) so the label encoder can decode them afterwards.
# This overwrites the y_pred_50 produced by the previous architecture.
y_pred_50 = model.predict(np.asarray(x_test_50.to_list()).astype('float32'))
y_pred_50 = to_categorical(np.argmax(y_pred_50, axis=1), 4)
y_pred_50



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [37]:
# Test-set precision / recall / F1 per character for this architecture
print(classification_report(y_test, encoder.inverse_transform(y_pred_50)))

               precision    recall  f1-score   support

 Bart Simpson       0.38      0.14      0.20       452
Homer Simpson       0.51      0.80      0.62       881
 Lisa Simpson       0.42      0.22      0.29       366
Marge Simpson       0.42      0.38      0.40       428

     accuracy                           0.47      2127
    macro avg       0.43      0.38      0.38      2127
 weighted avg       0.45      0.47      0.43      2127



# Size 100 embeddings

In [38]:
def sentence_to_embedding_100(sentence: str):
    """
    Embed a sentence with model_100 by averaging its word vectors.

    The sentence is tokenised with preprocessing(), each token is looked up
    in model_100, and the element-wise mean over all tokens is returned.
    NOTE(review): the upstream filter (words_in_embeddings) checks the
    vocabulary of model_50 only; confirm model_100 shares that vocabulary.
    """
    word_vectors = model_100.wv[preprocessing(sentence)]
    return np.mean(word_vectors, axis=0)

In [39]:
# Transform sentences into vectors
# Same train / validation / test texts as before, now embedded with model_100.
x_train_100 = x_train.apply(sentence_to_embedding_100)
x_val_100 = x_val.apply(sentence_to_embedding_100)
x_test_100 = x_test.apply(sentence_to_embedding_100)
x_train_100

1133    [-0.19050829, 0.39607593, 0.18970492, 0.443213...
877     [-0.290738, 0.28118527, -0.20843536, 0.365131,...
3829    [-0.52621406, 0.64666843, 0.0261572, 0.3445141...
1182    [-0.3504744, 0.44219318, -0.16405015, 0.423895...
1518    [-0.13211192, 0.5161154, -0.07311637, 0.383568...
                              ...                        
3035    [0.039870866, 0.3191851, 0.14636753, 0.4339915...
3604    [-0.20343174, 0.56542456, -0.06081393, 0.50534...
3425    [-0.16546537, 0.32568675, 0.16236827, 0.265110...
3717    [-0.04070326, 0.59277475, -0.0049013845, 0.101...
3266    [-0.096700296, 0.23148116, -0.015666625, 0.432...
Name: sentences, Length: 6806, dtype: object

## Architecture 1

In [40]:
# Create the classification model
# Architecture 1 scaled for the size 100 embeddings: three ReLU layers
# (100, 50, 50 units) followed by a 4-unit softmax output.
model = Sequential()
model.add(Dense(100,  activation='relu', name="Input_layer"))
model.add(Dense(50, activation='relu', name="Hidden_layer_1"))
model.add(Dense(50, activation='relu', name="Hidden_layer_2"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [41]:
# Compile the model
# Same loss / optimizer / metric settings as the other architectures.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [42]:
# Train the model
# Size 100 sentence vectors, converted to float32 arrays; 20 epochs with
# batches of 32 shuffled samples and the validation set passed for monitoring.
history = model.fit(x=np.asarray(x_train_100.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_100.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 50)                5050      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 17,904
Trainable params: 17,904
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Return predictions - the chosen class is the one with the highest probability
# The class probabilities are reduced to the argmax index and re-encoded as
# one-hot rows (4 classes) so the label encoder can decode them afterwards.
y_pred_100 = model.predict(np.asarray(x_test_100.to_list()).astype('float32'))
y_pred_100 = to_categorical(np.argmax(y_pred_100, axis=1), 4)
y_pred_100



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [45]:
# Test-set precision / recall / F1 per character for this architecture
print(classification_report(y_test, encoder.inverse_transform(y_pred_100)))

               precision    recall  f1-score   support

 Bart Simpson       0.44      0.42      0.43       452
Homer Simpson       0.60      0.71      0.65       881
 Lisa Simpson       0.44      0.43      0.44       366
Marge Simpson       0.54      0.38      0.44       428

     accuracy                           0.53      2127
    macro avg       0.51      0.48      0.49      2127
 weighted avg       0.53      0.53      0.52      2127



## Architecture 2

In [46]:
# Create the classification model
# Architecture 2 scaled for the size 100 embeddings: four ReLU layers
# (100, 100, 100, 50 units) followed by a 4-unit softmax output.
model = Sequential()
model.add(Dense(100,  activation='relu', name="Input_layer"))
model.add(Dense(100, activation='relu', name="Hidden_layer_1"))
model.add(Dense(100, activation='relu', name="Hidden_layer_2"))
model.add(Dense(50, activation='relu', name="Hidden_layer_3"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [47]:
# Compile the model
# Same loss / optimizer / metric settings as the other architectures.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [48]:
# Train the model
# Size 100 sentence vectors, converted to float32 arrays; 20 epochs with
# batches of 32 shuffled samples and the validation set passed for monitoring.
history = model.fit(x=np.asarray(x_train_100.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_100.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_3 (Dense)      (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 35,554
Trainable params: 35,554
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Return predictions - the chosen class is the one with the highest probability
# The class probabilities are reduced to the argmax index and re-encoded as
# one-hot rows (4 classes) so the label encoder can decode them afterwards.
# This overwrites the y_pred_100 produced by the previous architecture.
y_pred_100 = model.predict(np.asarray(x_test_100.to_list()).astype('float32'))
y_pred_100 = to_categorical(np.argmax(y_pred_100, axis=1), 4)
y_pred_100



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [51]:
# Test-set precision / recall / F1 per character for this architecture
print(classification_report(y_test, encoder.inverse_transform(y_pred_100)))

               precision    recall  f1-score   support

 Bart Simpson       0.56      0.16      0.25       452
Homer Simpson       0.58      0.76      0.65       881
 Lisa Simpson       0.53      0.31      0.39       366
Marge Simpson       0.43      0.63      0.51       428

     accuracy                           0.53      2127
    macro avg       0.53      0.47      0.45      2127
 weighted avg       0.54      0.53      0.50      2127



## Architecture 3

In [52]:
# Create the classification model
# Architecture 3 scaled for the size 100 embeddings: four ReLU layers
# (100, 80, 80, 50 units) followed by a 4-unit softmax output.
model = Sequential()
model.add(Dense(100,  activation='relu', name="Input_layer"))
model.add(Dense(80, activation='relu', name="Hidden_layer_1"))
model.add(Dense(80, activation='relu', name="Hidden_layer_2"))
model.add(Dense(50, activation='relu', name="Hidden_layer_3"))
model.add(Dense(4, activation='softmax', name="Output_layer"))

In [53]:
# Compile the model
# Same loss / optimizer / metric settings as the other architectures.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])  

In [54]:
# Train the model
# Size 100 sentence vectors, converted to float32 arrays; 20 epochs with
# batches of 32 shuffled samples and the validation set passed for monitoring.
history = model.fit(x=np.asarray(x_train_100.to_list()).astype('float32'),
                    y=y_train_encoded,
                    batch_size=32, epochs=20,
                    validation_data=(np.asarray(x_val_100.to_list()).astype('float32'), y_val_encoded),
                    shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [55]:
# Print each layer's output shape and parameter count for the trained model
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 100)               10100     
                                                                 
 Hidden_layer_1 (Dense)      (None, 80)                8080      
                                                                 
 Hidden_layer_2 (Dense)      (None, 80)                6480      
                                                                 
 Hidden_layer_3 (Dense)      (None, 50)                4050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 28,914
Trainable params: 28,914
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Return predictions - the chosen class is the one with the highest probability
# The class probabilities are reduced to the argmax index and re-encoded as
# one-hot rows (4 classes) so the label encoder can decode them afterwards.
# This overwrites the y_pred_100 produced by the previous architecture.
y_pred_100 = model.predict(np.asarray(x_test_100.to_list()).astype('float32'))
y_pred_100 = to_categorical(np.argmax(y_pred_100, axis=1), 4)
y_pred_100



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [57]:
# Test-set precision / recall / F1 per character for this architecture
print(classification_report(y_test, encoder.inverse_transform(y_pred_100)))

               precision    recall  f1-score   support

 Bart Simpson       0.48      0.32      0.38       452
Homer Simpson       0.61      0.67      0.64       881
 Lisa Simpson       0.52      0.41      0.46       366
Marge Simpson       0.46      0.60      0.52       428

     accuracy                           0.54      2127
    macro avg       0.52      0.50      0.50      2127
 weighted avg       0.54      0.54      0.53      2127



# Size 200 embeddings

In [58]:
def sentence_to_embedding_200(sentence: str):
    """
    Returns the element-wise mean of the `model_200` embeddings that represent
    each word in a sentence.

    The sentence is tokenized with `preprocessing`. Tokens that are not in the
    vocabulary of `model_200` are ignored, because looking them up would raise.
    If no known token remains (empty sentence, or every token is out of
    vocabulary), a zero vector of the embedding size is returned so the caller
    always receives a fixed-length float32 vector.
    """
    token_list = preprocessing(sentence)
    # Keep only tokens the embedding model has a vector for
    known_tokens = [token for token in token_list if token in model_200.wv]
    if not known_tokens:
        # Nothing to average: fall back to a neutral all-zeros embedding
        return np.zeros(model_200.wv.vector_size, dtype=np.float32)
    return np.mean(model_200.wv[known_tokens], axis=0)

In [59]:
# Map every sentence of each split (train / validation / test) to its mean embedding vector
x_train_200, x_val_200, x_test_200 = (
    split.apply(sentence_to_embedding_200) for split in (x_train, x_val, x_test)
)
x_train_200

1133    [0.16244797, -0.007762286, -0.5801328, 0.44329...
877     [-0.3506315, 0.055515993, -0.6027178, 0.436567...
3829    [-0.122201025, 0.056866664, -0.8165569, 0.4008...
1182    [-0.03245688, -0.012517424, -0.5956196, 0.2783...
1518    [-0.060584597, -0.017061805, -0.63101166, 0.26...
                              ...                        
3035    [0.008413417, 0.15665813, -0.68438077, 0.27424...
3604    [-0.05165259, 0.30557153, -0.6113081, 0.275105...
3425    [0.01491911, -0.03480768, -0.5763194, 0.260999...
3717    [-0.06740085, -0.1158896, -0.5450491, 0.224405...
3266    [-0.030567076, 0.1971119, -0.54006016, 0.42608...
Name: sentences, Length: 6806, dtype: object

## Architecture 1

In [60]:
# Build the classifier: a stack of fully connected ReLU layers ending in a 4-way softmax
model = Sequential([
    Dense(200, activation='relu', name="Input_layer"),
    Dense(100, activation='relu', name="Hidden_layer_1"),
    Dense(100, activation='relu', name="Hidden_layer_2"),
    Dense(4, activation='softmax', name="Output_layer"),
])

In [61]:
# Configure training: SGD optimizer, categorical cross-entropy loss, accuracy as the reported metric
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [62]:
# Fit for 20 epochs on the 200-d training vectors, passing the validation split as validation_data
history = model.fit(
    np.asarray(x_train_200.to_list(), dtype='float32'),
    y_train_encoded,
    validation_data=(
        np.asarray(x_val_200.to_list(), dtype='float32'),
        y_val_encoded,
    ),
    epochs=20,
    batch_size=32,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [63]:
# Print each layer's name, output shape and parameter count for the current model
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 200)               40200     
                                                                 
 Hidden_layer_1 (Dense)      (None, 100)               20100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               10100     
                                                                 
 Output_layer (Dense)        (None, 4)                 404       
                                                                 
Total params: 70,804
Trainable params: 70,804
Non-trainable params: 0
_________________________________________________________________


In [64]:
# Predict on the test split, then one-hot encode the most probable class of each sample
y_pred_200 = model.predict(np.asarray(x_test_200.to_list(), dtype='float32'))
y_pred_200 = to_categorical(y_pred_200.argmax(axis=1), num_classes=4)
y_pred_200



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]], dtype=float32)

In [65]:
# Per-class precision / recall / F1 on the test split; one-hot predictions are mapped back to labels
print(
    classification_report(
        y_true=y_test,
        y_pred=encoder.inverse_transform(y_pred_200),
    )
)

               precision    recall  f1-score   support

 Bart Simpson       0.59      0.26      0.36       452
Homer Simpson       0.56      0.85      0.68       881
 Lisa Simpson       0.64      0.30      0.41       366
Marge Simpson       0.53      0.53      0.53       428

     accuracy                           0.57      2127
    macro avg       0.58      0.48      0.49      2127
 weighted avg       0.58      0.57      0.53      2127



## Architecture 2

In [66]:
# Build the classifier: a deeper stack of fully connected ReLU layers ending in a 4-way softmax
model = Sequential([
    Dense(200, activation='relu', name="Input_layer"),
    Dense(200, activation='relu', name="Hidden_layer_1"),
    Dense(200, activation='relu', name="Hidden_layer_2"),
    Dense(100, activation='relu', name="Hidden_layer_3"),
    Dense(4, activation='softmax', name="Output_layer"),
])

In [67]:
# Configure training: SGD optimizer, categorical cross-entropy loss, accuracy as the reported metric
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [68]:
# Fit for 20 epochs on the 200-d training vectors, passing the validation split as validation_data
history = model.fit(
    np.asarray(x_train_200.to_list(), dtype='float32'),
    y_train_encoded,
    validation_data=(
        np.asarray(x_val_200.to_list(), dtype='float32'),
        y_val_encoded,
    ),
    epochs=20,
    batch_size=32,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
# Print each layer's name, output shape and parameter count for the current model
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 200)               40200     
                                                                 
 Hidden_layer_1 (Dense)      (None, 200)               40200     
                                                                 
 Hidden_layer_2 (Dense)      (None, 200)               40200     
                                                                 
 Hidden_layer_3 (Dense)      (None, 100)               20100     
                                                                 
 Output_layer (Dense)        (None, 4)                 404       
                                                                 
Total params: 141,104
Trainable params: 141,104
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Predict on the test split, then one-hot encode the most probable class of each sample
y_pred_200 = model.predict(np.asarray(x_test_200.to_list(), dtype='float32'))
y_pred_200 = to_categorical(y_pred_200.argmax(axis=1), num_classes=4)
y_pred_200



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [71]:
# Per-class precision / recall / F1 on the test split; one-hot predictions are mapped back to labels
print(
    classification_report(
        y_true=y_test,
        y_pred=encoder.inverse_transform(y_pred_200),
    )
)

               precision    recall  f1-score   support

 Bart Simpson       0.60      0.29      0.39       452
Homer Simpson       0.56      0.86      0.68       881
 Lisa Simpson       0.68      0.26      0.37       366
Marge Simpson       0.53      0.53      0.53       428

     accuracy                           0.57      2127
    macro avg       0.59      0.48      0.49      2127
 weighted avg       0.58      0.57      0.53      2127



## Architecture 3

In [72]:
# Build the classifier: fully connected ReLU layers narrowing from 200 to 100 units, then a 4-way softmax
model = Sequential([
    Dense(200, activation='relu', name="Input_layer"),
    Dense(160, activation='relu', name="Hidden_layer_1"),
    Dense(160, activation='relu', name="Hidden_layer_2"),
    Dense(100, activation='relu', name="Hidden_layer_3"),
    Dense(4, activation='softmax', name="Output_layer"),
])

In [73]:
# Configure training: SGD optimizer, categorical cross-entropy loss, accuracy as the reported metric
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [74]:
# Fit for 20 epochs on the 200-d training vectors, passing the validation split as validation_data
history = model.fit(
    np.asarray(x_train_200.to_list(), dtype='float32'),
    y_train_encoded,
    validation_data=(
        np.asarray(x_val_200.to_list(), dtype='float32'),
        y_val_encoded,
    ),
    epochs=20,
    batch_size=32,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [75]:
# Print each layer's name, output shape and parameter count for the current model
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input_layer (Dense)         (None, 200)               40200     
                                                                 
 Hidden_layer_1 (Dense)      (None, 160)               32160     
                                                                 
 Hidden_layer_2 (Dense)      (None, 160)               25760     
                                                                 
 Hidden_layer_3 (Dense)      (None, 100)               16100     
                                                                 
 Output_layer (Dense)        (None, 4)                 404       
                                                                 
Total params: 114,624
Trainable params: 114,624
Non-trainable params: 0
_________________________________________________________________


In [76]:
# Predict on the test split, then one-hot encode the most probable class of each sample
y_pred_200 = model.predict(np.asarray(x_test_200.to_list(), dtype='float32'))
y_pred_200 = to_categorical(y_pred_200.argmax(axis=1), num_classes=4)
y_pred_200



array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [77]:
# Per-class precision / recall / F1 on the test split; one-hot predictions are mapped back to labels
print(
    classification_report(
        y_true=y_test,
        y_pred=encoder.inverse_transform(y_pred_200),
    )
)

               precision    recall  f1-score   support

 Bart Simpson       0.58      0.28      0.38       452
Homer Simpson       0.54      0.90      0.68       881
 Lisa Simpson       0.65      0.23      0.34       366
Marge Simpson       0.61      0.44      0.51       428

     accuracy                           0.56      2127
    macro avg       0.60      0.46      0.48      2127
 weighted avg       0.58      0.56      0.52      2127

