In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import time
import random
from helper_funcs import create_tensorboard_callback, create_checkpoint_callback, plot_loss_curves, compare_historys, unzip_data, calculate_results
import tensorflow_hub as hub

In [5]:
# !wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
!curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
unzip_data("nlp_getting_started.zip", path_name='nlp_getting_started/')
!rm nlp_getting_started.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  593k  100  593k    0     0  1814k      0 --:--:-- --:--:-- --:--:-- 1813k


In [6]:
train_df = pd.read_csv('nlp_getting_started/train.csv').sample(frac=1, random_state=42)
test_df = pd.read_csv('nlp_getting_started/test.csv')

In [7]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [8]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [9]:
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [10]:
len(train_df), len(test_df)

(7613, 3263)

In [11]:
random_index = random.randint(0, len(train_df))
target = train_df['target'][random_index]
print(f"Index: {random_index}")
print(f"Target: {target}, {' (disaster)' if(target == 1) else ' (not a real disaster)'}")
print(f"Text:{train_df['text'][random_index]}\n")

Index: 3048
Target: 1,  (disaster)
Text:'There was a small earthquake in LA but don't worry Emily Rossum is fine' #difficultpeople is great



In [12]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'].to_numpy(), train_df['target'].to_numpy(), test_size=0.1, random_state=42)

In [13]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [14]:
text_vectorization_default = tf.keras.layers.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=None,
)
text_vectorization_default.adapt(train_sentences)

2024-01-18 15:31:16.739491: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-01-18 15:31:16.739518: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-01-18 15:31:16.739527: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-01-18 15:31:16.739591: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-18 15:31:16.739825: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-01-18 15:31:16.854103: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [15]:
len(text_vectorization_default.get_vocabulary())

21056

In [16]:
max([len(i.split()) for i in train_sentences])

31

In [17]:
max_vocab_length = 10000
max_length = 15

text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [18]:
text_vectorization.adapt(train_sentences)

In [19]:
sample_sentence = "There's a flood in my street!"
text_vectorization([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [20]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
        \n\nVectorized version:")
text_vectorization([random_sentence])

Original text:
@lizbon @KidicalMassDC It's more of a structural breakdown. Or maybe a patience failure on their part.        

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  1,   1,  37,  51,   6,   3, 384,   1,  53, 680,   3, 999, 320,
         11, 131]])>

In [21]:
text_vectorization.get_vocabulary()[:10]

['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']

In [22]:
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

In [23]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
        \n\nEmbedded version:")
embedding(text_vectorization([random_sentence]))

Original text:
I entered to #win the ENTIRE set of butterLONDON Lip Crayons via @be_ram0s. - Go enter! #bbloggers http://t.co/DsB3lDfuxU        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04233202, -0.03212961,  0.00704069, ..., -0.04695761,
          0.02042797, -0.02041524],
        [ 0.0066098 ,  0.00506943, -0.03385796, ...,  0.02842617,
          0.03705411, -0.04995061],
        [ 0.02861372,  0.04672357, -0.02850316, ..., -0.02490436,
         -0.00039848, -0.00875305],
        ...,
        [-0.02548779, -0.02951611,  0.04010404, ...,  0.0011493 ,
          0.01746242, -0.03864621],
        [ 0.04847072, -0.03766227, -0.0496028 , ..., -0.02253153,
          0.03967238, -0.04563466],
        [ 0.01242206, -0.03003116,  0.03178943, ..., -0.00290171,
          0.00995159, -0.02689688]]], dtype=float32)>

### Model 0: Naive Bayes (baseline)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

model_0.fit(train_sentences, train_labels)

In [26]:
model_0.score(val_sentences, val_labels)

0.7926509186351706

In [27]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [28]:
model_0_result = calculate_results(val_labels, baseline_preds)
model_0_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [29]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_1"),
        create_checkpoint_callback("checkpoints", "08_model_1"),
    ],
)


Saving TensorBoard log files to: tensorboard/08_model_1/20240118-153246
Saving model checkpoints to: checkpoints/08_model_1/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
model_1.evaluate(val_sentences, val_labels)



[0.4795877933502197, 0.7860892415046692]

In [27]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10]



array([[0.377892  ],
       [0.6727631 ],
       [0.9976393 ],
       [0.10367664],
       [0.17349124],
       [0.93533397],
       [0.91008604],
       [0.9928075 ],
       [0.9630222 ],
       [0.26276284]], dtype=float32)

In [28]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [29]:
model_1_result = calculate_results(val_labels, model_1_preds)
model_1_result

{'accuracy': 78.60892388451444,
 'precision': 0.7907394181632303,
 'recall': 0.7860892388451444,
 'f1': 0.7831536805930754}

In [30]:
embed_weights = model_1.layers[2].get_weights()[0]
embed_weights.shape

(10000, 128)

In [31]:
vocabulary = text_vectorizer.get_vocabulary()
vocabulary

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'be',
 'are',
 'was',
 'have',
 'like',
 'as',
 'up',
 'so',
 'just',
 'but',
 'me',
 'im',
 'your',
 'not',
 'amp',
 'out',
 'its',
 'will',
 'an',
 'no',
 'has',
 'fire',
 'after',
 'all',
 'when',
 'we',
 'if',
 'now',
 'via',
 'new',
 'more',
 'get',
 'or',
 'about',
 'what',
 'he',
 'people',
 'news',
 'been',
 'over',
 'one',
 'how',
 'dont',
 'they',
 'who',
 'into',
 'were',
 'do',
 'us',
 '2',
 'can',
 'video',
 'emergency',
 'there',
 'disaster',
 'than',
 'police',
 'would',
 'his',
 'still',
 'her',
 'some',
 'body',
 'storm',
 'crash',
 'burning',
 'suicide',
 'back',
 'man',
 'california',
 'why',
 'time',
 'them',
 'had',
 'buildings',
 'rt',
 'first',
 'cant',
 'see',
 'got',
 'day',
 'off',
 'our',
 'going',
 'nuclear',
 'know',
 'world',
 'bomb',
 'fires',
 'love',
 'killed',
 'go',
 'attack',
 'youtube',
 'dead

In [32]:
import io

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocabulary):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()


## Model 2: LSTM

In [33]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64)(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

model_2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_2 = model_2.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_2"),
        create_checkpoint_callback("checkpoints", "08_model_2"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_2/20240118-145002
Saving model checkpoints to: checkpoints/08_model_2/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]




array([[0.00574437],
       [0.782921  ],
       [0.9996877 ],
       [0.0323944 ],
       [0.00319531],
       [0.9997781 ],
       [0.96810436],
       [0.99984884],
       [0.9997718 ],
       [0.12086073]], dtype=float32)

In [35]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]


<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [36]:
model_2_result = calculate_results(val_labels, model_2_preds)
model_2_result

{'accuracy': 76.50918635170603,
 'precision': 0.7664434345240916,
 'recall': 0.7650918635170604,
 'f1': 0.7630272521222509}

## Model 3: GRU

In [37]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(64)(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

model_3.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_3 = model_3.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_3"),
        create_checkpoint_callback("checkpoints", "08_model_3"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_3/20240118-145022
Saving model checkpoints to: checkpoints/08_model_3/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]



array([[0.6367205 ],
       [0.9633442 ],
       [0.9949713 ],
       [0.03364241],
       [0.00682322],
       [0.99315506],
       [0.89855814],
       [0.99770504],
       [0.9949767 ],
       [0.2781296 ]], dtype=float32)

In [39]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [40]:
model_3_result = calculate_results(val_labels, model_3_preds)
model_3_result

{'accuracy': 77.03412073490814,
 'precision': 0.7711671866902318,
 'recall': 0.7703412073490814,
 'f1': 0.7686901866564684}

## Model 4: Bidirectional RNN model

In [41]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_4 = tf.keras.Model(inputs, outputs)

model_4.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_4 = model_4.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_4"),
        create_checkpoint_callback("checkpoints", "08_model_4"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_4/20240118-145042
Saving model checkpoints to: checkpoints/08_model_4/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[3.0838463e-02],
       [5.6173795e-01],
       [9.9640393e-01],
       [1.0960973e-01],
       [6.8185700e-04],
       [9.9276817e-01],
       [5.0994414e-01],
       [9.9913341e-01],
       [9.9856335e-01],
       [2.0979431e-01]], dtype=float32)

In [43]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [44]:
model_4_result = calculate_results(val_labels, model_4_preds)
model_4_result

{'accuracy': 75.8530183727034,
 'precision': 0.7625739943565455,
 'recall': 0.7585301837270341,
 'f1': 0.7549664582215014}

## Model 5: Cov1D

In [45]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_5 = tf.keras.Model(inputs, outputs)

model_5.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_5 = model_5.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_5"),
        create_checkpoint_callback("checkpoints", "08_model_5"),
    ],
)

model_5_preds = tf.squeeze(tf.round(model_5.predict(val_sentences)))
model_5_result = calculate_results(val_labels, model_5_preds)
model_5_result

Saving TensorBoard log files to: tensorboard/08_model_5/20240118-145110
Saving model checkpoints to: checkpoints/08_model_5/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.77165354330708,
 'precision': 0.768489862704666,
 'recall': 0.7677165354330708,
 'f1': 0.7660466459325422}

## Model 6: TensorFlow Hub Pretrained Sentence Encoder

In [46]:
embed = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2"
)
embeddings = embed(
    [
        "The quick brown fox jumps over the lazy dog.",
        "I am a sentence for which I would like to get its embedding",
    ]
)
embeddings

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[-0.03133017, -0.06338633, -0.01607502, ..., -0.0324278 ,
        -0.04575741,  0.05370455],
       [ 0.05080861, -0.01652432,  0.01573779, ...,  0.00976656,
         0.0317012 ,  0.01788118]], dtype=float32)>

In [25]:
sentence_encoder_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    input_shape=[],
    dtype=tf.string,
    trainable=False,
    name="USE",
)



model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
], name="model_6_USE")

model_6.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_6 = model_6.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_6"),
        create_checkpoint_callback("checkpoints", "08_model_6"),
    ],
)

model_6_preds = tf.squeeze(tf.round(model_6.predict(val_sentences)))
model_6_result = calculate_results(val_labels, model_6_preds)
model_6_result

Saving TensorBoard log files to: tensorboard/08_model_6/20240118-153133
Saving model checkpoints to: checkpoints/08_model_6/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 81.36482939632546,
 'precision': 0.8152128321955759,
 'recall': 0.8136482939632546,
 'f1': 0.8123558654721007}

In [30]:
model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


## Model 7: TransferLearning with 10% of training data

In [31]:
# train_data_10 = train_df[["text", "target"]].sample(frac=0.1, random_state=0)
# len(train_data_10)

train_data_10_split = int(0.1 * len(train_df))
train_sentences_10 = train_sentences[:train_data_10_split]
train_labels_10 = train_labels[:train_data_10_split]


In [32]:
pd.Series(train_labels_10).value_counts()

0    444
1    317
Name: count, dtype: int64

In [33]:
sentence_encoder_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    input_shape=[],
    dtype=tf.string,
    trainable=False,
    name="USE",
)

model_7 = tf.keras.Sequential([
    sentence_encoder_layer,
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
], name="model_7_USE")

model_7.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_7 = model_7.fit(
    x=train_sentences_10,
    y=train_labels_10,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_7"),
        create_checkpoint_callback("checkpoints", "08_model_7"),
    ],
)

model_7_preds = tf.squeeze(tf.round(model_7.predict(val_sentences)))
model_7_result = calculate_results(val_labels, model_7_preds)
model_7_result

Saving TensorBoard log files to: tensorboard/08_model_7/20240118-153309
Saving model checkpoints to: checkpoints/08_model_7/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 78.08398950131233,
 'precision': 0.7813169326988488,
 'recall': 0.7808398950131233,
 'f1': 0.7795856165810638}

In [34]:
all_model_results = pd.DataFrame(
    {
        "0_baseline": model_0_result,
        "1_dense": model_1_result,
        "2_LSTM": model_2_result,
        "3_GRU": model_3_result,
        "4_Bidirectional": model_4_result,
        "5_Conv1D": model_5_result,
        "6_USE": model_6_result,
        "7_USE_10_percent": model_7_result,
    }
).transpose()
all_model_results["accuracy"] = all_model_results["accuracy"] / 100
all_model_results = all_model_results.sort_values("accuracy", ascending=False)
all_model_results

NameError: name 'model_1_result' is not defined

In [35]:
all_model_results.plot.bar(figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));

NameError: name 'all_model_results' is not defined

In [36]:
from helper_funcs import pred_timer

model_6_total_time, model_6_time_per_pred = pred_timer(model_6, val_sentences)
model_6_total_time, model_6_time_per_pred



(7.2265689999985625, 9.48368635170415)

In [37]:
model_0_total_time, model_0_time_per_pred = pred_timer(model_0, val_sentences)
model_0_total_time, model_0_time_per_pred

(0.04147825000109151, 0.05443339895156366)

In [47]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_8 = tf.keras.Model(inputs, outputs)

model_8.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_8 = model_8.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_8"),
        create_checkpoint_callback("checkpoints", "08_model_8"),
    ],
)

model_8_preds = tf.squeeze(tf.round(model_8.predict(val_sentences)))
model_8_result = calculate_results(val_labels, model_8_preds)
model_8_result

Saving TensorBoard log files to: tensorboard/08_model_8/20240118-183630
Saving model checkpoints to: checkpoints/08_model_8/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 75.06561679790026,
 'precision': 0.7555035947814924,
 'recall': 0.7506561679790026,
 'f1': 0.7464335197069463}

In [48]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_9 = tf.keras.Model(inputs, outputs)

model_9.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_9 = model_9.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_9"),
        create_checkpoint_callback("checkpoints", "08_model_9"),
    ],
)

model_9_preds = tf.squeeze(tf.round(model_9.predict(val_sentences)))
model_9_result = calculate_results(val_labels, model_9_preds)
model_9_result

Saving TensorBoard log files to: tensorboard/08_model_9/20240118-183717
Saving model checkpoints to: checkpoints/08_model_9/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.50918635170603,
 'precision': 0.7648238707003729,
 'recall': 0.7650918635170604,
 'f1': 0.7642522657439998}

In [49]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_10 = tf.keras.Model(inputs, outputs)

model_10.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_10 = model_10.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_10"),
        create_checkpoint_callback("checkpoints", "08_model_10"),
    ],
)

model_10_preds = tf.squeeze(tf.round(model_10.predict(val_sentences)))
model_10_result = calculate_results(val_labels, model_10_preds)
model_10_result

Saving TensorBoard log files to: tensorboard/08_model_10/20240118-183750
Saving model checkpoints to: checkpoints/08_model_10/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.77165354330708,
 'precision': 0.7677367335034279,
 'recall': 0.7677165354330708,
 'f1': 0.766597011860758}

In [51]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_11 = tf.keras.Model(inputs, outputs)

model_11.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_11 = model_11.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_11"),
        create_checkpoint_callback("checkpoints", "08_model_11"),
    ],
)

model_11_preds = tf.squeeze(tf.round(model_11.predict(val_sentences)))
model_11_result = calculate_results(val_labels, model_11_preds)
model_11_result

Saving TensorBoard log files to: tensorboard/08_model_11/20240118-183843
Saving model checkpoints to: checkpoints/08_model_11/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.64041994750657,
 'precision': 0.7665895370389821,
 'recall': 0.7664041994750657,
 'f1': 0.7651213533864446}

In [53]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_12 = tf.keras.Model(inputs, outputs)

model_12.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_12 = model_12.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_12"),
        create_checkpoint_callback("checkpoints", "08_model_12"),
    ],
)

model_12_preds = tf.squeeze(tf.round(model_12.predict(val_sentences)))
model_12_result = calculate_results(val_labels, model_12_preds)
model_12_result

Saving TensorBoard log files to: tensorboard/08_model_12/20240118-183924
Saving model checkpoints to: checkpoints/08_model_12/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.24671916010499,
 'precision': 0.7629611993882945,
 'recall': 0.7624671916010499,
 'f1': 0.7608791530897157}

In [54]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_13 = tf.keras.Model(inputs, outputs)

model_13.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_13 = model_13.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_13"),
        create_checkpoint_callback("checkpoints", "08_model_13"),
    ],
)

model_13_preds = tf.squeeze(tf.round(model_13.predict(val_sentences)))
model_13_result = calculate_results(val_labels, model_13_preds)
model_13_result

Saving TensorBoard log files to: tensorboard/08_model_13/20240118-184002
Saving model checkpoints to: checkpoints/08_model_13/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 75.8530183727034,
 'precision': 0.7622010920991931,
 'recall': 0.7585301837270341,
 'f1': 0.7551340463755609}

In [55]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=512,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_14 = tf.keras.Model(inputs, outputs)

model_14.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_14 = model_14.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_14"),
        create_checkpoint_callback("checkpoints", "08_model_14"),
    ],
)

model_14_preds = tf.squeeze(tf.round(model_14.predict(val_sentences)))
model_14_result = calculate_results(val_labels, model_14_preds)
model_14_result

Saving TensorBoard log files to: tensorboard/08_model_14/20240118-184031
Saving model checkpoints to: checkpoints/08_model_14/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 76.50918635170603,
 'precision': 0.7752788386481628,
 'recall': 0.7650918635170604,
 'f1': 0.7594804747767185}

In [57]:
max_vocab_length = 10000
max_length = 25
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=512,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_15 = tf.keras.Model(inputs, outputs)

model_15.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_15 = model_15.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_15"),
        create_checkpoint_callback("checkpoints", "08_model_15"),
    ],
)

model_15_preds = tf.squeeze(tf.round(model_15.predict(val_sentences)))
model_15_result = calculate_results(val_labels, model_15_preds)
model_15_result

Saving TensorBoard log files to: tensorboard/08_model_15/20240118-184152
Saving model checkpoints to: checkpoints/08_model_15/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 74.54068241469817,
 'precision': 0.7463476580352829,
 'recall': 0.7454068241469817,
 'f1': 0.7456923847878935}