In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import time
import random
from helper_funcs import create_tensorboard_callback, create_checkpoint_callback, plot_loss_curves, compare_historys, unzip_data, calculate_results

In [3]:
# !wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
!curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
unzip_data("nlp_getting_started.zip", path_name='nlp_getting_started/')
!rm nlp_getting_started.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  593k  100  593k    0     0   306k      0  0:00:01  0:00:01 --:--:--  306k


In [3]:
train_df = pd.read_csv('nlp_getting_started/train.csv').sample(frac=1, random_state=42)
test_df = pd.read_csv('nlp_getting_started/test.csv')

In [4]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [7]:
len(train_df), len(test_df)

(7613, 3263)

In [8]:
random_index = random.randint(0, len(train_df))
target = train_df['target'][random_index]
print(f"Index: {random_index}")
print(f"Target: {target}, {' (disaster)' if(target == 1) else ' (not a real disaster)'}")
print(f"Text:{train_df['text'][random_index]}\n")

Index: 4765
Target: 0,  (not a real disaster)
Text:#NowPlaying 'The Lightning Strike' de Snow Patrol de A Hundred Million Suns ? http://t.co/GrzcHkDF37



In [9]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df['text'].to_numpy(), train_df['target'].to_numpy(), test_size=0.1, random_state=42)

In [10]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [11]:
text_vectorization_default = tf.keras.layers.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=None,
)
text_vectorization_default.adapt(train_sentences)

2024-01-18 01:51:22.870336: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-01-18 01:51:22.870385: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-01-18 01:51:22.870414: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-01-18 01:51:22.870618: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-18 01:51:22.870851: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-01-18 01:51:23.107170: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [12]:
len(text_vectorization_default.get_vocabulary())

21056

In [13]:
max([len(i.split()) for i in train_sentences])

31

In [14]:
max_vocab_length = 10000
max_length = 15

text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [15]:
text_vectorization.adapt(train_sentences)

In [16]:
sample_sentence = "There's a flood in my street!"
text_vectorization([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [17]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
        \n\nVectorized version:")
text_vectorization([random_sentence])

Original text:
@DannyRaynard not bad personally I'd get rid of either hazard or aguero for a better striker than berahino        

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1,   34,  281, 4931,  508,   52, 3464,    6, 1416,  423,   53,
           1,   10,    3,  441]])>

In [18]:
text_vectorization.get_vocabulary()[:10]

['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']

In [19]:
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

In [20]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
        \n\nEmbedded version:")
embedding(text_vectorization([random_sentence]))

Original text:
Two giant cranes holding a bridge collapse into nearby homes http://t.co/OQpsvrGbJc        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04246541, -0.03204773,  0.04435645, ...,  0.04761431,
         -0.01852838,  0.03693471],
        [-0.01181426,  0.03521517,  0.0448539 , ...,  0.00038818,
         -0.00744265,  0.00199524],
        [ 0.03775359, -0.01973156,  0.04260187, ..., -0.04509607,
         -0.03660854,  0.01308036],
        ...,
        [-0.01545919,  0.04323978, -0.02930453, ...,  0.04829386,
          0.04872984,  0.01113299],
        [-0.01545919,  0.04323978, -0.02930453, ...,  0.04829386,
          0.04872984,  0.01113299],
        [-0.01545919,  0.04323978, -0.02930453, ...,  0.04829386,
          0.04872984,  0.01113299]]], dtype=float32)>

### Model 0: Naive Bayes (baseline)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

model_0.fit(train_sentences, train_labels)

In [106]:
model_0.score(val_sentences, val_labels)

0.7926509186351706

In [22]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [23]:
model_0_result = calculate_results(val_labels, baseline_preds)
model_0_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [149]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_1 = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_1"),
        create_checkpoint_callback("checkpoints", "08_model_1"),
    ],
)


Saving TensorBoard log files to: tensorboard/08_model_1/20240117-173030
Saving model checkpoints to: checkpoints/08_model_1/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [150]:
model_1.evaluate(val_sentences, val_labels)



[0.4817487299442291, 0.7860892415046692]

In [151]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[:10]



array([[0.2945783 ],
       [0.7794208 ],
       [0.99821305],
       [0.12164141],
       [0.11060482],
       [0.9374037 ],
       [0.9179274 ],
       [0.9927249 ],
       [0.9626024 ],
       [0.28204492]], dtype=float32)

In [152]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [172]:
model_1_result = calculate_results(val_labels, model_1_preds)
model_1_result

{'accuracy': 78.60892388451444,
 'precision': 0.7916211438846743,
 'recall': 0.7860892388451444,
 'f1': 0.782856754224963}

In [160]:
embed_weights = model_1.layers[2].get_weights()[0]
embed_weights.shape

(10000, 128)

In [162]:
vocabulary = text_vectorizer.get_vocabulary()
vocabulary

['',
 '[UNK]',
 'the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 'you',
 'my',
 'with',
 'it',
 'that',
 'at',
 'by',
 'this',
 'from',
 'be',
 'are',
 'was',
 'have',
 'like',
 'as',
 'up',
 'so',
 'just',
 'but',
 'me',
 'im',
 'your',
 'not',
 'amp',
 'out',
 'its',
 'will',
 'an',
 'no',
 'has',
 'fire',
 'after',
 'all',
 'when',
 'we',
 'if',
 'now',
 'via',
 'new',
 'more',
 'get',
 'or',
 'about',
 'what',
 'he',
 'people',
 'news',
 'been',
 'over',
 'one',
 'how',
 'dont',
 'they',
 'who',
 'into',
 'were',
 'do',
 'us',
 '2',
 'can',
 'video',
 'emergency',
 'there',
 'disaster',
 'than',
 'police',
 'would',
 'his',
 'still',
 'her',
 'some',
 'body',
 'storm',
 'crash',
 'burning',
 'suicide',
 'back',
 'man',
 'california',
 'why',
 'time',
 'them',
 'had',
 'buildings',
 'rt',
 'first',
 'cant',
 'see',
 'got',
 'day',
 'off',
 'our',
 'going',
 'nuclear',
 'know',
 'world',
 'bomb',
 'fires',
 'love',
 'killed',
 'go',
 'attack',
 'youtube',
 'dead

In [163]:
import io

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocabulary):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()


## Model 2: LSTM

In [166]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(64)(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

model_2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_2 = model_2.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_2"),
        create_checkpoint_callback("checkpoints", "08_model_2"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_2/20240117-182633
Saving model checkpoints to: checkpoints/08_model_2/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [167]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]




array([[0.01292462],
       [0.6812362 ],
       [0.99875116],
       [0.02365888],
       [0.00188603],
       [0.99676514],
       [0.10164104],
       [0.99936837],
       [0.99930346],
       [0.17050791]], dtype=float32)

In [168]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]


<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 0., 1., 1., 0.], dtype=float32)>

In [173]:
model_2_result = calculate_results(val_labels, model_2_preds)
model_2_result

{'accuracy': 77.42782152230971,
 'precision': 0.779472762048361,
 'recall': 0.7742782152230971,
 'f1': 0.770786705353106}

## Model 3: GRU

In [24]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(64)(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

model_3.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_3 = model_3.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_3"),
        create_checkpoint_callback("checkpoints", "08_model_3"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_3/20240118-015140
Saving model checkpoints to: checkpoints/08_model_3/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]



array([[0.07007099],
       [0.8953155 ],
       [0.99963236],
       [0.05023453],
       [0.00461387],
       [0.9992599 ],
       [0.9809295 ],
       [0.9995956 ],
       [0.99926335],
       [0.05231824]], dtype=float32)

In [26]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [27]:
model_3_result = calculate_results(val_labels, model_3_preds)
model_3_result

{'accuracy': 77.42782152230971,
 'precision': 0.7771676205309502,
 'recall': 0.7742782152230971,
 'f1': 0.7716950935205196}

## Model 4: Bidirectional RNN model

In [33]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

text_vectorizer.adapt(train_sentences)

embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model_4 = tf.keras.Model(inputs, outputs)

model_4.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

history_4 = model_4.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback("tensorboard", "08_model_4"),
        create_checkpoint_callback("checkpoints", "08_model_4"),
    ],
)

Saving TensorBoard log files to: tensorboard/08_model_4/20240118-020006
Saving model checkpoints to: checkpoints/08_model_4/checkpoint.ckpt
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[0.5253565 ],
       [0.89554745],
       [0.9991524 ],
       [0.0903252 ],
       [0.00521479],
       [0.97653097],
       [0.5513199 ],
       [0.9997787 ],
       [0.99946445],
       [0.2187153 ]], dtype=float32)

In [35]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [36]:
model_4_result = calculate_results(val_labels, model_4_preds)
model_4_result

{'accuracy': 75.98425196850394,
 'precision': 0.7618096125081139,
 'recall': 0.7598425196850394,
 'f1': 0.7573149475055201}

In [None]:
## Model 5: 