In [1]:
import re
import spacy
import gensim
import logging
import pandas as pd
import numpy as np
# remove warnings
logging.basicConfig(level=logging.ERROR)
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# lectura de la data y eliminación de registros repetidos y nulos
documents = pd.read_csv('./datos/simpsons_dataset.csv').dropna().drop_duplicates()
documents = documents.reset_index(drop=True)
print(documents.shape)
display(documents.head())

(126646, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
# Filtrado por personajes principales
main_characters =  ["Lisa Simpson", "Bart Simpson", "Homer Simpson", "Marge Simpson"]
documents = documents[documents["raw_character_text"].isin(main_characters)]
print(documents.shape)
display(documents.head())

(60610, 2)


Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
8,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
10,Lisa Simpson,Do you know where I could find him?


In [4]:
def get_sublist(list:list, n:int) -> list:
    """
    Obtener sublistas de una lista
    """
    return [list[i:i+n] for i in range(0, len(list), n)]


In [5]:
def get_vectors_characters(characters: pd.DataFrame) -> pd.DataFrame:
    """
    Extraer dialogos para todos los personajes en grupos de 5 oraciones
    """
    sentences_big = get_sublist(characters['spoken_words'].tolist(),5)
    df = pd.DataFrame({ 'spoken_words': sentences_big})
    for index,dialogue in df['spoken_words'].items():
        df.loc[index,"spoken_words_concatenated"] = ' '.join(dialogue)
    return df

In [6]:
documents_dialogue_concatenated = documents.groupby("raw_character_text")  \
                                    .apply(lambda x: get_vectors_characters(x)) \
                                    .reset_index("raw_character_text")

In [7]:
def convert_label(label:str) -> int:
    """ 
    Categoriza los labels
    """
    if label == "bart simpson":
        return 0
    elif label == "lisa simpson":
        return 1
    elif label == "homer simpson":
        return 2
    else : 
        return 3
   

In [8]:
documents_dialogue_concatenated["raw_character_text"] = documents_dialogue_concatenated["raw_character_text"].apply(lambda x: x.lower())
documents_dialogue_concatenated["raw_character_text"] = documents_dialogue_concatenated["raw_character_text"].apply(convert_label)
documents_dialogue_concatenated["spoken_words_concatenated"] = documents_dialogue_concatenated["spoken_words_concatenated"].apply(lambda x: x.lower())
documents_dialogue_concatenated.head()

Unnamed: 0,raw_character_text,spoken_words,spoken_words_concatenated
0,0,"[Victory party under the slide!, Hey, thanks f...","victory party under the slide! hey, thanks for..."
1,0,"[Somebody must have voted., Uh oh., I demand a...",somebody must have voted. uh oh. i demand a re...
2,0,"[Ah, Dad, if just me, Milhouse and Lewis had v...","ah, dad, if just me, milhouse and lewis had vo..."
3,0,"[Please Dad., What?, Yes sir., They're fightin...",please dad. what? yes sir. they're fighting in...
4,0,"[Dad, I have as much respect for you as I ever...","dad, i have as much respect for you as i ever ..."


In [9]:

sentences = documents_dialogue_concatenated['spoken_words_concatenated'].values
y = documents_dialogue_concatenated['raw_character_text'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1)

y_train_categorical = tf.one_hot(y_train,4)
y_test_categorical = tf.one_hot(y_test,4)


train_dataset = tf.data.Dataset.from_tensor_slices((sentences_train,y_train_categorical))
test_dataset = tf.data.Dataset.from_tensor_slices((sentences_test,y_test_categorical))

text_dataset = tf.data.Dataset.from_tensor_slices((sentences_train))

In [10]:

print('Number of rows in the total set: {}'.format(sentences.shape[0]))
print('Number of rows of bart class: {}'.format(y[y == 0].shape[0]))
print('Number of rows of lisa class: {}'.format(y[y == 1].shape[0]))
print('Number of rows of homer class: {}'.format(y[y == 2].shape[0]))
print('Number of rows of marge class: {}'.format(y[y == 3].shape[0]))
print('---------------------------------------------')

print('Number of rows in the training set: {}'.format(sentences_train.shape[0]))
print('Number of rows of bart class: {}'.format(y_train[y_train == 0].shape[0]))
print('Number of rows of lisa class: {}'.format(y_train[y_train == 1].shape[0]))
print('Number of rows of homer class: {}'.format(y_train[y_train == 2].shape[0]))
print('Number of rows of marge class: {}'.format(y_train[y_train == 3].shape[0]))
print('---------------------------------------------')

print('Number of rows in the test set: {}'.format(sentences_test.shape[0]))

print('Number of rows of bart class: {}'.format(y_test[y_test == 0].shape[0]))
print('Number of rows of lisa class: {}'.format(y_test[y_test == 1].shape[0]))
print('Number of rows of homer class: {}'.format(y_test[y_test == 2].shape[0]))
print('Number of rows of marge class: {}'.format(y_test[y_test == 3].shape[0]))


Number of rows in the total set: 12123
Number of rows of bart class: 2414
Number of rows of lisa class: 2029
Number of rows of homer class: 5195
Number of rows of marge class: 2485
---------------------------------------------
Number of rows in the training set: 9698
Number of rows of bart class: 1913
Number of rows of lisa class: 1598
Number of rows of homer class: 4171
Number of rows of marge class: 2016
---------------------------------------------
Number of rows in the test set: 2425
Number of rows of bart class: 501
Number of rows of lisa class: 431
Number of rows of homer class: 1024
Number of rows of marge class: 469


# Arquitectura 1

## Vectorize Layer Binary

In [11]:
vectorize_layer_binary = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='binary', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_binary.adapt(text_dataset)


In [12]:
vocab_size = len(vectorize_layer_binary.get_vocabulary())
vocab_size

200

In [13]:
embedding_dim = 100

In [14]:
modelsequ1 = Sequential()
modelsequ1.add(vectorize_layer_binary)
modelsequ1.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ1.add(layers.GlobalAveragePooling1D())
modelsequ1.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ1.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', 'mse']) 


In [15]:
batch_size = 100
epochs = 10

# Fit the model using the train and test datasets.
modelsequ1.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c6114d640>

In [16]:
modelsequ1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 200)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 25,254
Trainable params: 25,254
Non-traina

In [17]:
modelsequ1.evaluate(sentences_test,y_test_categorical)



[1.3071997165679932, 0.4222680330276489, 0.1765207052230835]

In [18]:
pred = modelsequ1.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize Layer count

In [19]:
vectorize_layer_count = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='count', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_count.adapt(text_dataset)

In [20]:
embedding_dim = 100
vocab_size = len(vectorize_layer_count.get_vocabulary())
vocab_size

200

In [21]:
modelsequ2 = Sequential()
modelsequ2.add(vectorize_layer_count)
modelsequ2.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ2.add(layers.GlobalAveragePooling1D())
modelsequ2.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ2.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 


In [22]:
# Fit the model using the train and test datasets.
batch_size = 100
epochs = 10

modelsequ2.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c62167e20>

In [23]:
modelsequ2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 25,254
Trainable params: 25,254
Non-trai

In [24]:
modelsequ2.evaluate(sentences_test,y_test_categorical)



[1.3163774013519287, 0.4222680330276489, 0.17770437896251678]

In [25]:
pred = modelsequ2.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize layer int

In [26]:
vectorize_layer_int = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='int', output_sequence_length=None,
)

vectorize_layer_int.adapt(text_dataset)

In [27]:
embedding_dim = 100
vocab_size = len(vectorize_layer_int.get_vocabulary())
vocab_size

200

In [28]:
modelsequ3 = Sequential()
modelsequ3.add(vectorize_layer_int)
modelsequ3.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ3.add(layers.GlobalAveragePooling1D())
modelsequ3.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ3.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ3.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 

In [29]:
# Fit the model using the train and test datasets.
modelsequ3.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c62311760>

In [30]:

modelsequ3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, None, 100)         20000     
                                                                 
 global_average_pooling1d_2   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                 204       
                                                                 
Total params: 25,254
Trainable params: 25,254
Non-trai

In [31]:
modelsequ3.evaluate(sentences_test,y_test_categorical)



[1.314957857131958, 0.4222680330276489, 0.177505761384964]

In [32]:
pred = modelsequ3.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Arquitectura 2

## Vectorize Layer Binary

In [33]:
vectorize_layer_binary = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='binary', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_binary.adapt(text_dataset)


In [34]:
vocab_size = len(vectorize_layer_binary.get_vocabulary())
vocab_size

200

In [35]:
embedding_dim = 100

In [36]:
modelsequ1 = Sequential()
modelsequ1.add(vectorize_layer_binary)
modelsequ1.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ1.add(layers.GlobalAveragePooling1D())
modelsequ1.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ1.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ1.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', 'mse']) 


In [37]:
batch_size = 100
epochs = 10

# Fit the model using the train and test datasets.
modelsequ1.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c67cdfb50>

In [38]:
modelsequ1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d_3   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [39]:
modelsequ1.evaluate(sentences_test,y_test_categorical)



[1.3105006217956543, 0.4222680330276489, 0.1768207550048828]

In [40]:
pred = modelsequ1.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize Layer count

In [41]:
vectorize_layer_count = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='count', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_count.adapt(text_dataset)

In [42]:
embedding_dim = 100
vocab_size = len(vectorize_layer_count.get_vocabulary())
vocab_size

200

In [43]:
modelsequ2 = Sequential()
modelsequ2.add(vectorize_layer_count)
modelsequ2.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ2.add(layers.GlobalAveragePooling1D())
modelsequ2.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ2.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ2.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 


In [44]:
# Fit the model using the train and test datasets.
batch_size = 100
epochs = 10

modelsequ2.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c69e4aa60>

In [45]:
modelsequ2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d_4   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [46]:
modelsequ2.evaluate(sentences_test,y_test_categorical)



[1.3160678148269653, 0.4222680330276489, 0.17766143381595612]

In [47]:
pred = modelsequ2.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize layer int

In [48]:
vectorize_layer_int = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='int', output_sequence_length=None,
)

vectorize_layer_int.adapt(text_dataset)

In [49]:
embedding_dim = 100
vocab_size = len(vectorize_layer_int.get_vocabulary())
vocab_size

200

In [50]:
modelsequ3 = Sequential()
modelsequ3.add(vectorize_layer_int)
modelsequ3.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ3.add(layers.GlobalAveragePooling1D())
modelsequ3.add(layers.Dense(50, activation='relu', name="Hidden_layer"))
modelsequ3.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ3.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ3.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 

In [51]:
# Fit the model using the train and test datasets.
modelsequ3.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c6afcf5b0>

In [52]:

modelsequ3.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_5 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, None, 100)         20000     
                                                                 
 global_average_pooling1d_5   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 50)                5050      
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                2550      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [53]:
modelsequ3.evaluate(sentences_test,y_test_categorical)



[1.3156100511550903, 0.4222680330276489, 0.17761465907096863]

In [54]:
pred = modelsequ3.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Arquitectura 3

## Vectorize Layer Binary

In [55]:
vectorize_layer_binary = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='binary', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_binary.adapt(text_dataset)


In [56]:
vocab_size = len(vectorize_layer_binary.get_vocabulary())
vocab_size

200

In [57]:
embedding_dim = 100

In [58]:
modelsequ1 = Sequential()
modelsequ1.add(vectorize_layer_binary)
modelsequ1.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ1.add(layers.GlobalAveragePooling1D())
modelsequ1.add(layers.Dense(100, activation='relu', name="Hidden_layer"))
modelsequ1.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ1.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc', 'mse']) 


In [59]:
batch_size = 100
epochs = 10

# Fit the model using the train and test datasets.
modelsequ1.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c6248c640>

In [60]:
modelsequ1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_6 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d_6   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 100)               10100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [61]:
modelsequ1.evaluate(sentences_test,y_test_categorical)



[1.3101179599761963, 0.4222680330276489, 0.17675313353538513]

In [62]:
pred = modelsequ1.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize Layer count

In [63]:
vectorize_layer_count = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='count', output_sequence_length=None, pad_to_max_tokens=True,
)

vectorize_layer_count.adapt(text_dataset)

In [64]:
embedding_dim = 100
vocab_size = len(vectorize_layer_count.get_vocabulary())
vocab_size

200

In [65]:
modelsequ2 = Sequential()
modelsequ2.add(vectorize_layer_count)
modelsequ2.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ2.add(layers.GlobalAveragePooling1D())
modelsequ2.add(layers.Dense(100, activation='relu', name="Hidden_layer"))
modelsequ2.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ2.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 


In [66]:
# Fit the model using the train and test datasets.
batch_size = 100
epochs = 10

modelsequ2.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c6256db20>

In [67]:
modelsequ2.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_7 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 200, 100)          20000     
                                                                 
 global_average_pooling1d_7   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 100)               10100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [68]:
modelsequ2.evaluate(sentences_test,y_test_categorical)



[1.3160150051116943, 0.4222680330276489, 0.17766331136226654]

In [69]:
pred = modelsequ2.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Vectorize layer int

In [70]:
vectorize_layer_int = TextVectorization(
    ngrams=None, 
    max_tokens=200, vocabulary=None,
    output_mode='int', output_sequence_length=None,
)

vectorize_layer_int.adapt(text_dataset)

In [71]:
embedding_dim = 100
vocab_size = len(vectorize_layer_int.get_vocabulary())
vocab_size

200

In [72]:
modelsequ3 = Sequential()
modelsequ3.add(vectorize_layer_int)
modelsequ3.add(layers.Embedding(vocab_size, embedding_dim, name="embedding"))
modelsequ3.add(layers.GlobalAveragePooling1D())
modelsequ3.add(layers.Dense(100, activation='relu', name="Hidden_layer"))
modelsequ3.add(layers.Dense(50, activation='relu', name="Hidden_layer_2"))
modelsequ3.add(layers.Dense(4, activation='softmax', name="Output_layer"))
modelsequ3.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc', 'mse']) 

In [73]:
# Fit the model using the train and test datasets.
modelsequ3.fit(
    train_dataset.batch(batch_size),
    validation_data=test_dataset.batch(batch_size),
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25c6e49ac70>

In [74]:

modelsequ3.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_8 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, None, 100)         20000     
                                                                 
 global_average_pooling1d_8   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 Hidden_layer (Dense)        (None, 100)               10100     
                                                                 
 Hidden_layer_2 (Dense)      (None, 50)                5050      
                                                                 
 Output_layer (Dense)        (None, 4)                

In [75]:
modelsequ3.evaluate(sentences_test,y_test_categorical)



[1.3148270845413208, 0.4222680330276489, 0.17748606204986572]

In [76]:
pred = modelsequ3.predict(sentences_test, batch_size=100, verbose=1) 
predicted = np.argmax(pred, axis=1)  
report = classification_report(np.argmax(y_test_categorical, axis=1), predicted)  
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       501
           1       0.00      0.00      0.00       431
           2       0.42      1.00      0.59      1024
           3       0.00      0.00      0.00       469

    accuracy                           0.42      2425
   macro avg       0.11      0.25      0.15      2425
weighted avg       0.18      0.42      0.25      2425



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
