# Exercise 1
Rebuild , compile and train `model_1`, `model_2` and `model_5` using the `Keras Sequential API` instead of `Functional API`.

## Check the GPU

In [1]:
# Check for the GPU
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



## Get the helper functions

In [2]:
# Download the helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback

--2023-03-26 19:41:35--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2023-03-26 19:41:35 (92.8 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Get a text dataset

In [3]:
# Download the data
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

# Unzip data
unzip_data('nlp_getting_started.zip')

--2023-03-26 19:41:39--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.142.128, 74.125.195.128, 173.194.202.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.142.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-03-26 19:41:39 (114 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [4]:
import pandas as pd

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [5]:
# Shuffling the test dataset
train_data_shuffled = train_data.sample(frac=1, random_state=42)
train_data_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


### Split the data into training/validations sets

In [6]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_data_shuffled['text'],
                                                                            train_data_shuffled['target'],
                                                                            test_size=0.1,
                                                                            random_state=42)

In [7]:
# Check the lengths
len(train_sentences) == len(train_labels), len(val_sentences) == len(val_labels)

(True, True)

## Converting text data into numerical data

### Tokenization

In [8]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [9]:
# Setup the vectorization variables
max_vocab_length = 10000
max_length = 15

In [10]:
# Setup the TextVectorization 
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int', 
                                    output_sequence_length=max_length)

In [11]:
# Fit the text vectorizer to the training set
text_vectorizer.adapt(train_sentences)

In [12]:
# Create a sample set and vectorize it
sample_sentence = "We'd love the aliens to visit us!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[1924,  110,    2, 6238,    5, 1742,   69,    0,    0,    0,    0,
           0,    0,    0,    0]])>

In [13]:
# Checking the vocabulary
vocab = text_vectorizer.get_vocabulary()
print(f"Top 5 most used words: {vocab[:5]}")
print(f"Top 5 least used words: {vocab[-5:]}")

Top 5 most used words: ['', '[UNK]', 'the', 'a', 'in']
Top 5 least used words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Embedding

In [14]:
from tensorflow.keras import layers

# Create embedding layer
embedding = layers.Embedding(input_dim=max_vocab_length, # input shape
                             output_dim=128, # output shape
                             input_length=max_length) # input length

embedding

<keras.layers.core.embedding.Embedding at 0x7f1d351a5d00>

In [15]:
# Check if it works
print(f"Sample sentence: \n{sample_sentence}")
print(f"\nSample sentence vectorized: \n{text_vectorizer([sample_sentence])}")
print(f"\nSample sentence embedded: \n{embedding(text_vectorizer([sample_sentence]))}")

Sample sentence: 
We'd love the aliens to visit us!

Sample sentence vectorized: 
[[1924  110    2 6238    5 1742   69    0    0    0    0    0    0    0
     0]]

Sample sentence embedded: 
[[[ 0.00788132 -0.04829688 -0.01385916 ...  0.01228603 -0.01670926
    0.01227896]
  [ 0.04368892 -0.02601178  0.01097583 ... -0.02443379 -0.03107102
   -0.01516576]
  [ 0.02757699  0.0175313   0.03094289 ... -0.00958041 -0.03032207
   -0.0256814 ]
  ...
  [-0.00117085  0.03885223 -0.03981144 ...  0.03352464  0.02280668
   -0.02000021]
  [-0.00117085  0.03885223 -0.03981144 ...  0.03352464  0.02280668
   -0.02000021]
  [-0.00117085  0.03885223 -0.03981144 ...  0.03352464  0.02280668
   -0.02000021]]]


## Helper function to evaluate the models

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  # Calculate model accuracy
  accuracy = accuracy_score(y_true, y_pred)

  # Calculate precision, recall and f1-score (in 'weighted' mode)
  precision, recall, f1, _ = precision_recall_fscore_support(y_true,
                                                             y_pred,
                                                             average='weighted')
  
  results = {'accuracy': accuracy,
             'precision': precision,
             'recall': recall,
             'f1-score': f1}

  return results

In [17]:
# Create TensorBoard callback
from helper_functions import create_tensorboard_callback

SAVE_DIR = 'exercise_model_logs'

## Model 1 reproduction - simple dense model





In [18]:
# Create the model with Sequential API
model_1 = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    embedding,
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
], name='model_1_dense')

model_1.summary()

# Compile the model
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Fit the model 
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                    'model_1_dense_Sequential')],
                              verbose=2)

Saving TensorBoard log files to: exercise_model_logs/model_1_dense_Sequential/20230326-194142
Epoch 1/5
215/215 - 5s - loss: 0.6115 - accuracy: 0.6892 - val_loss: 0.5379 - val_accuracy: 0.7467 - 5s/epoch - 25ms/step
Epoch 2/5
215/215 - 3s - loss: 0.4423 - accuracy: 0.8206 - val_loss: 0.4727 - val_accuracy: 0.7848 - 3s/epoch - 14ms/step
Epoch 3/5
215/215 - 2s - loss: 0.3474 - accuracy: 0.8612 - val_loss: 0.4574 - val_accuracy: 0.7927 - 2s/epoch - 11ms/step
Epoch 4/5
215/215 - 2s - loss: 0.2846 - accuracy: 0.8905 - val_loss: 0.4688 - val_accuracy: 0.7940 - 2s/epoch - 11ms/step
Epoch 5/5
215/215 - 2s - loss: 0.2377 - accuracy: 0.9104 - val_loss: 0.4806 - val_accuracy: 0.7808 - 2s/epoch - 9ms/step


In [20]:
# Function to evaluate and predict
def evaluate_and_pred(model, data_to_predict, labels_to_predict):
  """
  Evaluates on data_to_predict and returns predictions as 1 or 0.
  """

  # Evaluate the model
  eval = model.evaluate(data_to_predict, labels_to_predict)
  print(f"Evaluation metrics: {eval}\n")

  # Get the predictions 
  pred_probs = model.predict(data_to_predict)
  preds = tf.squeeze(tf.round(pred_probs))

  print(f"Preds overview: {preds[:10]}")

  return preds

In [21]:
# Evaluation and preds
model_1_preds = evaluate_and_pred(model_1, val_sentences, val_labels)

Evaluation metrics: [0.4805794954299927, 0.7808399200439453]

Preds overview: [0. 1. 1. 0. 0. 1. 1. 1. 1. 0.]


In [23]:
# Calculate the results
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results


{'accuracy': 0.7808398950131233,
 'precision': 0.7809067257781027,
 'recall': 0.7808398950131233,
 'f1-score': 0.7798778043941109}

## Model 2 - LSTM

In [25]:
from tensorflow.keras import layers

# Create the model
model_2 = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    embedding,
    layers.LSTM(units=64),
    layers.Dense(1, activation='sigmoid')
], name='model_2_lstm')

# Compile the model
model_2.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

# Check the summary
model_2.summary()

Model: "model_2_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Fit the model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     'model_2_lstm')],
                              verbose=2)

Saving TensorBoard log files to: exercise_model_logs/model_2_lstm/20230326-194835
Epoch 1/5
215/215 - 7s - loss: 0.2257 - accuracy: 0.9219 - val_loss: 0.5779 - val_accuracy: 0.7769 - 7s/epoch - 31ms/step
Epoch 2/5
215/215 - 4s - loss: 0.1572 - accuracy: 0.9415 - val_loss: 0.6731 - val_accuracy: 0.7835 - 4s/epoch - 17ms/step
Epoch 3/5
215/215 - 4s - loss: 0.1317 - accuracy: 0.9514 - val_loss: 0.6542 - val_accuracy: 0.7848 - 4s/epoch - 16ms/step
Epoch 4/5
215/215 - 4s - loss: 0.1062 - accuracy: 0.9588 - val_loss: 0.8094 - val_accuracy: 0.7769 - 4s/epoch - 18ms/step
Epoch 5/5
215/215 - 3s - loss: 0.0864 - accuracy: 0.9672 - val_loss: 0.8007 - val_accuracy: 0.7743 - 3s/epoch - 16ms/step


In [28]:
# Evaluate and predict
model_2_preds = evaluate_and_pred(model_2, val_sentences, val_labels)

Evaluation metrics: [0.8007400631904602, 0.7742782235145569]

Preds overview: [0. 1. 1. 0. 0. 1. 0. 1. 1. 0.]


In [30]:
# Calculate results
model_2_results = calculate_results(val_labels, model_2_preds)
model_2_results

{'accuracy': 0.7742782152230971,
 'precision': 0.7855294750736606,
 'recall': 0.7742782152230971,
 'f1-score': 0.7687856172080995}

## Model 5 - 1D Convolutional NN

In [32]:
# Create the model
model_5 = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    embedding,
    layers.Conv1D(filters=64, 
                  kernel_size=5,
                  strides=1,
                  activation='relu',
                  padding='same'),
    layers.GlobalMaxPool1D(),
    layers.Dense(1, activation='sigmoid')
], name='model_5_conv1d')

# Compile the model
model_5.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

# Check the summary
model_5.summary()

Model: "model_5_conv1d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 15, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,321,089
Trainable params: 1,321,089


In [33]:
# Fit the model
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     'model_5_conv1d')])

Saving TensorBoard log files to: exercise_model_logs/model_5_conv1d/20230326-195827
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
# Evaluate and predict
model_5_preds = evaluate_and_pred(model_5, val_sentences, val_labels)

Evaluation metrics: [1.0885984897613525, 0.7795275449752808]

Preds overview: [0. 1. 1. 0. 0. 1. 1. 1. 1. 1.]


In [36]:
# Calculate results
model_5_results = calculate_results(val_labels, model_5_preds)
model_5_results

{'accuracy': 0.7795275590551181,
 'precision': 0.780644760213944,
 'recall': 0.7795275590551181,
 'f1-score': 0.7778858484546237}

# Exercise 2
Retrain the baseline model with 10% of the training data. How does it perform to the Universal Sentence Encoder model with 10% of the training data?

In [37]:
# Getting the right amount of data for the split
train_10_percent_split = int(0.1 * len(train_sentences))

# Recreate the 10% training data split
train_sentences_10_percent = train_sentences[:train_10_percent_split]
train_labels_10_percent = train_labels[:train_10_percent_split]

# Check the split
print(f"Length of base set: {len(train_sentences)}")
print(f"Length of 10 percent set: {len(train_sentences_10_percent)}")

Length of base set: 6851
Length of 10 percent set: 685


In [40]:
# Recreating the baseline model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create pipeline with tokenization
model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline to 10% dataset
model_0.fit(train_sentences_10_percent, train_labels_10_percent)

In [43]:
# Evaluate and pred
model_0_score = model_0.score(val_sentences, val_labels)
print(f"Baseline model score: {model_0_score}")

model_0_preds = model_0.predict(val_sentences)
model_0_preds[:10]

Baseline model score: 0.7020997375328084


array([1, 0, 1, 0, 0, 1, 0, 0, 1, 0])

In [49]:
# Calculate results
model_0_results = calculate_results(val_labels, model_0_preds)
model_0_results

{'accuracy': 0.7020997375328084,
 'precision': 0.7599524002753854,
 'recall': 0.7020997375328084,
 'f1-score': 0.6736831571468213}

How did the model perform compared to USE model from `08` notebook?
* The USE model trained on 10% of the data did get about 77,5%.
* The retrained baseline model from `sklearn` with `MultinomialNB` classifier, also trained on about 70,2% accuracy.

**This shows that the smaller the dataset, maybe using the pretrained model from TF Hub is a good idea to include in the experiment workflow.**


# Exercise 3
Try fine-tuning the TF Hub USE (`model_6`) by setting `training=True` when instentiating it as a Keras layer.

In [50]:
# Recreate the USE model used in 08