In [None]:
#We need to install tf version 2.15 for compatibility with this notebook (written in March 2024).
!pip uninstall tensorflow
!pip install tensorflow==2.15.0
!tensorflow --version

# Introduction to NLP fundamentals in Tensorflow

NLP has the goal of deriving information out of natural language (which could be sequences of text or speech).

Another common term for NLP problems is sequence to sequence (seq2seq)

In [None]:
!nvidia-smi -L

/bin/bash: line 1: nvidia-smi: command not found


## Get helper functions


In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-10-29 00:31:25--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-10-29 00:31:25 (63.0 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [None]:
# Import a series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys, make_confusion_matrix

## Get a text dataset

The dataset we are going to be using is Kaggle's introduction to NLP dataset.  (Text samples of tweets labelled as disaster or not disaster).  

See the original source here: https://www.kaggle.com/c/nlp-getting-started/



In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

unzip_data("nlp_getting_started.zip")

--2024-10-29 00:31:43--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.65.123, 172.217.15.251, 172.217.164.27, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.65.123|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-10-29 00:31:43 (75.8 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



## Visualizing a text dataset

To visualize our text samples, we first have to read them in.  https://realpython.com/read-write-files-python/

One way to do so would be to use Python, but I prefer to get visual straight away.

Another way is to use Pandas

In [None]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
#dads_df = pd.read_csv("tweets.csv", header=None)
#train_df.head()
#test_df.head()
#dads_list = dads_df.values.tolist()
#dads_list


In [None]:
train_df["text"][0],test_df["text"][0]

('Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Just happened a terrible car crash')

In [None]:
# Shuffle training dataframe
# frac=1 samples 100% of the rows (i.e. the whole frame in random order);
# random_state=42 makes that order repeatable between runs.
train_df_shuffled = train_df.sample(frac=1, random_state=42)
# Not the last expression of the cell, so this head() is evaluated but not displayed.
train_df_shuffled.head()

# Shuffle test dataframe
# Same approach and seed as for the training frame above.
test_df_shuffled = test_df.sample(frac=1, random_state=42)
test_df_shuffled.head()

Unnamed: 0,id,keyword,location,text
2406,8051,refugees,,Refugees as citizens - The Hindu http://t.co/G...
134,425,apocalypse,Currently Somewhere On Earth,@5SOStag honestly he could say an apocalypse i...
411,1330,blown%20up,Scout Team,If you bored as shit don't nobody fuck wit you...
203,663,attack,,@RealTwanBrown Yesterday I Had A Heat Attack ?...
889,2930,danger,Leeds,The Devil Wears Prada is still one of my favou...


In [None]:
# What does the test dataframe look like?
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# How many examples of each class?
train_df.target.value_counts()
#test_df.text.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [None]:
# How many total samples?
len(train_df), len(test_df)

(7613, 3263)

In [None]:
# Print five consecutive rows of the shuffled training data, starting at a random offset
import random
random_index = random.randint(0, len(train_df) - 5)  # leave room for a 5-row window before the end

window = train_df_shuffled.iloc[random_index:random_index + 5]
for text, target in zip(window["text"], window["target"]):
  verdict = "(real disaster)" if target > 0 else "(not real disaster)"
  print(f"Target: {target}", verdict)
  print(f"Text:\n{text}\n")
  print(f"---\n")

Target: 0 (not real disaster)
Text:
Downtown Emergency Service Center is hiring a #Chemical #Dependency Counselor or Intern apply now! #Seattle #jobs http://t.co/SKQPWSNOin

---

Target: 1 (real disaster)
Text:
Watch This Airport Get Swallowed Up By A Sandstorm In Under A Minute http://t.co/mkWyvM3i8r

---

Target: 0 (not real disaster)
Text:
I don't doubt it. But it was his implicit statement in doing it that makes me want him flattened by a bus. https://t.co/5hlJUcxI0S

---

Target: 0 (not real disaster)
Text:
Wow. #FIFA16 has Pre Season Tournaments in Career Mode. Bloody hell evacuate the building #whocares

---

Target: 0 (not real disaster)
Text:
@RVacchianoNYDN The only surprise is that they aren't ALL injured.

---



In [None]:
# Print five consecutive tweets from the shuffled TEST data, starting at a random offset
# (the test set has no labels, so only the text is shown)
import random
random_index = random.randint(0, len(test_df) - 5)

window = test_df_shuffled.iloc[random_index:random_index + 5]
for text in window["text"]:
  print(f"Text:\n{text}\n")
  print(f"---\n")

Text:
@EddieTrunk Blizzard of Ozz

---

Text:
hijack

---

Text:
Suicide bomber kills 15 in Saudi security site mosque - Reuters http://t.co/KCObrZBVDs http://t.co/y62HSFVIAQ

---

Text:
True strength is forgiveness. Love the most powerful weapon. .@vickysuewrites' Broken Circle'

#giveaway #boyxboy http://t.co/Zgc3EsLNPS

---

Text:
@justgetawayx everything will turn out fine!! I went to lp/om&amp;m alone and survived it and so can you

---



### Split data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Use train_test_split to split the data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, # use 10% of training data for validation split
                                                                            random_state=42)

In [None]:
# predict on the TEST sentences
#my_prediction_probs = model_6.predict(test_df_shuffled["text"].to_numpy())


In [None]:
#my_prediction_probs

In [None]:
# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [None]:
# Check the first 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

## Converting text into numbers

When dealing with a text problem, one of the first things you will have to do before you can build a model, is to convert your text into numbers.  There are a few ways to do this, namely:
* Tokenization: direct mapping of token (a token could be a word or character) to a number
* Embedding: create an embedding of feature vectors for each token (the size of the feature vector can be defined and this embedding can be learned)

### Text vectorization (tokenization)

In [None]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [None]:
import tensorflow as tf
# Import from the public `layers` namespace; `layers.experimental.preprocessing` is the
# older, deprecated location of the same layer.
from tf_keras.layers import TextVectorization

# Spell out the main TextVectorization arguments explicitly.
# Note that max_tokens and pad_to_max_tokens are NOT left at their defaults here.
text_vectorizer = TextVectorization(max_tokens=10000,  # how many words in the vocabulary? (automatically add <OOV>)
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None, # create groups of n-words
                                    output_mode="int", # how to map tokens to number
                                    output_sequence_length=None,  # how long do you want your sequences to be?
                                    pad_to_max_tokens=True)

In [None]:
len(train_sentences[0].split())

7

In [None]:
# Find the average number of tokens (words) in the training tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [None]:
# Setup text vectorization variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (eg. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

# Create text vectorizer for TEST data set
text_vectorizer_test = TextVectorization(max_tokens = max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [None]:
# Fit the text vectorizer to the TEST text
#text_vectorizer_test.adapt(test_df_shuffled["text"].to_numpy())

In [None]:
# Create a sample sentence and tokenize it
sample_sentence = "This is a sample sjdfhakdjfasf sadfj asdkfjh askdjf sakjfh aks asdf asdf asdf asdf safdsadfsadf "
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  19,    9,    3, 8839,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1]])>

In [None]:
# Choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text: {random_sentence}\n\
Vectorized version: {text_vectorizer([random_sentence])}")

Original text: Ted Cruz fires back at Jeb &amp; Bush: ÛÏWe lose because of Republicans like Jeb &amp; Mitt.Û [Video] -  http://t.co/KCofF6BmiE
Vectorized version: [[2264 1534  109   88   17 1828   35  657 3227 1505  152    6 2004   25
  1828]]


In [None]:
# Choose a random sentence from the TEST dataset and tokenize it
#random_sentence_test = random.choice(test_df_shuffled["text"].to_numpy())
#print(f"Original text: {random_sentence_test}\n\
#Vectorized version: {text_vectorizer_test([random_sentence_test])}")

In [None]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary() # Get all of the unique words in our training data
top_5_words = words_in_vocab[:5] # get the most common words
bottom_5_words = words_in_vocab[-5:] # get the least common words
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Most common 5 words: {top_5_words}")
print(f"Least common 5 words: {bottom_5_words}")

Number of words in vocab: 10000
Most common 5 words: ['', '[UNK]', 'the', 'a', 'in']
Least common 5 words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [None]:
sentences = test_df["text"]
sentences

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [None]:
# Get the unique words in the vocabulary (TEST)
# NOTE(review): the adapt() call for text_vectorizer_test is commented out earlier in this
# notebook, so this layer has never seen any text. The vocabulary printed below therefore
# only contains the two reserved tokens ('' for padding and '[UNK]').
words_in_vocab_test = text_vectorizer_test.get_vocabulary()
top_5_words_test = words_in_vocab_test[:5]  # first entries of the vocabulary list
bottom_5_words_test = words_in_vocab_test[-5:]  # last entries of the vocabulary list
print(f"Number of words in vocab: {len(words_in_vocab_test)}")
print(f"Most common 5 words {top_5_words_test}")
print(f"Least common 5 words: {bottom_5_words_test}")

Number of words in vocab: 2
Most common 5 words ['', '[UNK]']
Least common 5 words: ['', '[UNK]']


### Creating an embedding using an embedding layer

To make our embedding, we're going to use TensorFlow's embedding layer:
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

The parameters we care most about for our embedding layer:
* `input_dim` = the size of our vocabulary
* `output_dim` = the size of the output embedding vector, for example, a value of 100 would mean each token gets represented by a vector 100 long
* `input_length` = length of the sequences being passed to the embedding layer

In [None]:
from tf_keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,  # set input shape
                             output_dim=128,    # output shape
                             input_length=max_length)    #how long is each input

embedding

<tf_keras.src.layers.core.embedding.Embedding at 0x7c5944447520>

In [None]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"Original text: {random_sentence}\n\
      Embedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text: @SirTitan45  Mega mood swing on a 24 hr schedule. Isn't that how structural failure occurs?
      Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.0035265 , -0.00192551, -0.00049866, ..., -0.03761262,
         -0.00232654,  0.00720401],
        [-0.01537702, -0.0029817 ,  0.00035368, ..., -0.01856076,
         -0.02273378,  0.03584871],
        [-0.01172631, -0.01688895,  0.04851237, ..., -0.0493596 ,
          0.03826752, -0.02759098],
        ...,
        [-0.03915339, -0.00063073,  0.00277761, ...,  0.04219371,
         -0.0355489 ,  0.0086624 ],
        [ 0.02828783, -0.02011579,  0.01987657, ..., -0.03309675,
          0.01295484, -0.02917149],
        [-0.04228786,  0.00597602,  0.04027947, ...,  0.00243164,
          0.03805712, -0.02725112]]], dtype=float32)>

In [None]:
# Check out a single token's embedding
sample_embed[0][0], sample_embed[0][0].shape, random_sentence[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.0035265 , -0.00192551, -0.00049866,  0.02369261, -0.04725368,
         0.03907185, -0.03496747,  0.00751869,  0.024002  , -0.04037524,
        -0.01281999, -0.01699594,  0.00696826,  0.00430919, -0.04398051,
         0.01848395,  0.04423733, -0.04345018,  0.00499805, -0.00670303,
         0.0337593 ,  0.04039404,  0.00048391,  0.01323633,  0.01760931,
         0.03256628,  0.04968696, -0.02814697, -0.00420803,  0.02528762,
         0.01410453, -0.02271535,  0.03196447,  0.03234461, -0.04269974,
        -0.03544481,  0.04603864,  0.03866308, -0.01587731,  0.04028252,
         0.00488121,  0.0450185 ,  0.00535149,  0.01476162, -0.02452957,
         0.03055633,  0.01507142, -0.00794442, -0.02879271,  0.00304198,
         0.04369305, -0.00062605,  0.0189379 , -0.04961009,  0.01405681,
        -0.01739249,  0.04697261, -0.04259024,  0.04902686, -0.02689931,
        -0.01898794,  0.01933109,  0.00152041, -0.04510957, -0.04238583,
  

## Modeling a text dataset (running a series of experiments)

Now we've got a way to turn our text sequences into numbers, it's time to start building a series of modeling experiments.  We'll start with a baseline and move on from there.

* Model 0: Naive Bayes (baseline) - this is from Scikit ML map: https://scikit-learn.org/stable/modules/naive_bayes.html https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* Model 1: Feed-forward neural network (dense model)
* Model 2: LSTM model (RNN)
* Model 3: GRU model (RNN)
* Model 4: Bidirectional-LSTM model (RNN)
* Model 5: 1D Convolutional Neural Network (CNN)
* Model 6: TensorFlow Hub Pre-trained Feature Extractor (using transfer learning for NLP)
* Model 7: Same as model 6 with 10% of the training data.

How are we going to approach all of these?

Use the standard steps in modelling with TensorFlow:

* Create a model
* Build a model
* Fit model
* Evaluate



### Model 0: Getting a baseline

As with all machine learning modelling experiments, it's important to create a baseline model so you've got a benchmark for future experiments to build upon.

To create our baseline we'll use sklearn's Multinomial Naive Bayes, using the TF-IDF formula to convert our words to numbers.

> 🔑 Note: it's common practice to use non-DL algorithms as a baseline because of their speed and then later using DL to improve upon them.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_sentences)
X_train_counts.shape

(6851, 20076)

In [None]:
count_vect.vocabulary_.get(u'algorithm')

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(6851, 20076)

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6851, 20076)

In [None]:
train_sentences[0]

'@mogacola @zamtriossu i screamed after hitting tweet'

In [None]:
#from sklearn.naive_bayes import MultinomialNB
#model_0 = MultinomialNB.fit(X=train_sentences, y=train_labels)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modeling pipeline
model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()), # convert words into numbers using Tfidf
    ("clf", MultinomialNB())  # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [None]:
# Create tokenization and modeling pipeline (model 0 with 10 percent of training data)
#model_0_10_percent = Pipeline([
#    ("tfidf", TfidfVectorizer()), # convert words into numbers using Tfidf
#    ("clf", MultinomialNB())  # model the text
#])

# Fit the pipeline to the training data
#model_0_10_percent.fit(train_sentences_10_percent, train_labels_10_percent)

In [None]:
# Evaluate our baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f} %")

Our baseline model achieves an accuracy of: 79.27 %


In [None]:
# Evaluate our basline model with 10 percent training data
#baseline_score_10_percent = model_0_10_percent.score(val_sentences, val_labels)
#print(f"Our baseline 10 percent model achieves an accuracy of: {baseline_score_10_percent*100:.2f} %")

In [None]:
# Make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [None]:
train_labels

array([0, 0, 1, ..., 1, 1, 0])

In [None]:
# Create a function that takes in y_true and y_preds and returns a dictionary of the 4 evaluation metrics. (accuracy, precision, recall, F1-score)
import numpy as np
from sklearn.metrics import f1_score

def compare_preds(y_trues, y_preds):
  """
  Scores binary predictions against ground-truth labels.

  Args:
    y_trues: array-like of true labels (0 or 1).
    y_preds: array-like of predicted labels (0 or 1), same length as y_trues.

  Returns:
    A dictionary with the keys "accuracy", "precision", "recall" and "F1score",
    every value being a plain Python float.

  Note:
    Precision and recall come from the Keras metrics, while "F1score" is
    scikit-learn's macro-averaged F1 (unweighted mean of the per-class F1 scores).
    The F1 value is therefore not the harmonic mean of the precision and recall
    reported alongside it.
  """
  accuracy = tf.keras.metrics.Accuracy()
  accuracy.update_state(y_trues, y_preds)

  precision = tf.keras.metrics.Precision()
  precision.update_state(y_trues, y_preds)

  recall = tf.keras.metrics.Recall()
  recall.update_state(y_trues, y_preds)

  F1score = f1_score(y_trues, y_preds, average="macro")

  # Convert the scalar tensors returned by the Keras metrics to Python floats so that
  # all four values share one type and can be compared, rounded or tabulated directly.
  scores = {
      "accuracy": float(accuracy.result()),
      "precision": float(precision.result()),
      "recall": float(recall.result()),
      "F1score": float(F1score)
  }
  return scores

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarises a binary classifier's predictions as accuracy, precision, recall and f1 score.

  Accuracy is reported as a percentage (0-100). Precision, recall and f1 are the
  "weighted" averages computed by scikit-learn and stay on the 0-1 scale.

  Args:
    y_true: array-like of ground-truth labels.
    y_pred: array-like of predicted labels.

  Returns:
    dict with the keys "accuracy", "precision", "recall" and "f1score".
  """
  accuracy_pct = accuracy_score(y_true, y_pred) * 100

  # The helper returns (precision, recall, fscore, support); the support entry is not used.
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")

  return {
      "accuracy": accuracy_pct,
      "precision": weighted_scores[0],
      "recall": weighted_scores[1],
      "f1score": weighted_scores[2],
  }

In [None]:
#baseline_preds
compare_preds(val_labels, baseline_preds)
#baseline_preds, val_labels

{'accuracy': <tf.Tensor: shape=(), dtype=float32, numpy=0.79265094>,
 'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.88617885>,
 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.62643677>,
 'F1score': 0.7820571304442272}

In [None]:
# NOTE(review): this import rebinds the name `calculate_results`, replacing the function of
# the same name defined earlier in this notebook for this call and every later one.
# Keep only one of the two definitions to avoid confusion about which is in use.
from helper_functions import calculate_results
# Score the Naive Bayes baseline on the validation split; later models are compared against this.
baseline_results = calculate_results(val_labels, baseline_preds)

### Model 1: A Simple Dense Model



In [None]:
"""
# TensorBoard alternate loading (because site is offline)

%load_ext tensorboard

import datetime
import os
def create_tensorboard_callback(experiment_name):
  logfit_dir = "logs/fit/"
  path_experiment = os.path.join(logfit_dir, experiment_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=path_experiment
  )
  print(f"Saving TensorBoard log files to: {logfit_dir}")
  return tensorboard_callback
  """

'\n# TensorBoard alternate loading (because site is offline)\n\n%load_ext tensorboard\n\nimport datetime\nimport os\ndef create_tensorboard_callback(experiment_name):\n  logfit_dir = "logs/fit/"\n  path_experiment = os.path.join(logfit_dir, experiment_name, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))\n  tensorboard_callback = tf.keras.callbacks.TensorBoard(\n      log_dir=path_experiment\n  )\n  print(f"Saving TensorBoard log files to: {logfit_dir}")\n  return tensorboard_callback\n  '

In [None]:
# Create a TensorBoard callback (need to create a new one for each model)
from helper_functions import create_tensorboard_callback

# Create a directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [None]:
# Build model with the functional API
import tf_keras
from tf_keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)  # inputs are 1-dimensional strings
x = text_vectorizer(inputs)  # turn the input text into numbers
x = embedding(x)  # create an embedding of the numberized inputs
x = layers.GlobalAveragePooling1D()(x)  # condense the feature vector for each token to one vector
outputs = layers.Dense(1, activation="sigmoid")(x)  # binary output, so use the sigmoid activation function

# The Model class must come from the same Keras package as the layers above (tf_keras).
# Wrapping tf_keras tensors in `tf.keras.Model` fails with
# "All `inputs` values must be KerasTensors" because the two packages use different classes.
model_1 = tf_keras.Model(inputs, outputs, name="model_1_dense")

ValueError: All `inputs` values must be KerasTensors. Received: inputs=[<KerasTensor: shape=(None, 1) dtype=string (created by layer 'input_6')>] including invalid value KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.string, name='input_6'), name='input_6', description="created by layer 'input_6'") of type <class 'tf_keras.src.engine.keras_tensor.KerasTensor'>

In [None]:
model_1.summary()

NameError: name 'model_1' is not defined

In [None]:

# Build model with the sequential API (same architecture as the functional model_1)
import tf_keras

# Sequential must come from the same Keras package (tf_keras) as the layers it contains,
# otherwise Keras rejects them with "Only instances of `keras.Layer` can be added".
model_1_sequential = tf_keras.Sequential([
    tf_keras.Input(shape=(1,), dtype=tf.string),  # raw strings go straight into the vectorizer
    text_vectorizer,                   # text -> integer token ids
    embedding,                         # token ids -> dense vectors
    layers.GlobalAveragePooling1D(),   # average over the tokens so there is one vector per tweet
    layers.Dense(1, activation="sigmoid", name="output_layer")
], name="model_1_sequential")


ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tf_keras.src.layers.core.dense.Dense object at 0x7c5944800fa0> (of type <class 'tf_keras.src.layers.core.dense.Dense'>)

In [None]:
# Compile model
# The optimizer is given by name so it is created by the same Keras package that built the
# model (an optimizer object from a different Keras package is not accepted by compile).
# "adam" is Adam with its default settings. Metrics are passed as a list, as the API documents.
model_1.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

# Fit the model
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_1_dense")])

In [None]:
# Check the results
model_1.evaluate(val_sentences, val_labels)

In [None]:
# Make some predictions and evaluate those
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape

In [None]:
# look at a single prediction
model_1_pred_probs[0]

In [None]:
# look at first 10 predictions
model_1_pred_probs[:10]

In [None]:
# Convert model prediction probabilities to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

In [None]:
# Convert model prediction probabilities to label format (on TEST dataset)
#my_predictions = tf.squeeze(tf.round(my_prediction_probs))
#my_predictions[:10]

In [None]:
# Calculate our model_1 results
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)
model_1_results

In [None]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

## Visualizing learned embeddings



In [None]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

In [None]:
# Model 1 Summary
model_1.summary()

Now that we've got the embedding matrix our model has learned to represent our tokens, let's see how we can visualize it.

To do so, TensorFlow has a handy tool called the Embedding Projector (https://projector.tensorflow.org), and TensorFlow also has an incredible guide on word embeddings themselves (https://www.tensorflow.org/text/guide/word_embeddings).

In [None]:
# Get the weight matrix of embedding layer
# These are the numerical representations of each token in our training data, which have been learned for 5 epochs
# get_weights() returns a list of arrays; index 0 is taken as the embedding matrix.
embed_weights = model_1.get_layer("embedding").get_weights()[0]
print(embed_weights.shape)  # same size as vocab size and embedding_dim (output dim of our embedding layer)

In [None]:
# Create embedding files (we got this from the TF word embeddings documentation)
# vectors.tsv  : one tab-separated embedding vector per line
# metadata.tsv : the word belonging to the vector on the same line
import io

# `with` guarantees both files are flushed and closed, even if a write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(words_in_vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

In [None]:
# Download files from Colab to upload to projector
#try:
#  from google.colab import files
#  files.download('vectors.tsv')
#  files.download('metadata.tsv')
#except Exception:
#  pass

Jay Alammar illustrated word2vec: https://jalammar.github.io/illustrated-word2vec/

## Recurrent Neural Network - RNN

RNNs are useful for sequence data.

The premise of a RNN is to use the representation of a previous input to aid the representation of a later input.  

** Resources: **
For more info> http://introtodeeplearning.com/

Chris Olah's intro to LSTMs: https://colah.github.io/posts/2015-08-Understanding-LSTMs/

Andrej Karpathy's The Unreasonable Effectiveness of Recurrent Neural Networks: https://karpathy.github.io/2015/05/21/rnn-effectiveness/

### Model 2: LSTM

LSTM - Long Short-Term Memory (one of the most popular RNN cells)

Our structure of an RNN typically looks like this:
```
Input (text) -> Tokenize -> Embedding -> Layers (RNNs/dense) -> Output (label probability)
```

In [None]:
# Create an LSTM model
import tf_keras
from tf_keras import layers

# Give this experiment its own, freshly initialised embedding. Reusing the `embedding`
# layer from model_1 would start this model from weights model_1 has already trained,
# which makes the experiments impossible to compare fairly.
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     input_length=max_length,
                                     name="embedding_2")

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_2_embedding(x)     # token ids -> dense vectors
# To stack recurrent layers, every LSTM except the last one needs return_sequences=True
# so that it passes on one output per token instead of only the final one.
x = layers.LSTM(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# The Model class must come from the same Keras package (tf_keras) as the layers above.
model_2 = tf_keras.Model(inputs, outputs, name="model_2_LSTM")

In [None]:
# Get a summary
model_2.summary()

In [None]:
# Compile the model
# The optimizer is given by name so it is created by the same Keras package that built the
# model; "adam" is Adam with its default settings. Metrics are passed as a list.
model_2.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [None]:
# Fit the model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences,val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,"model_2_LSTM")])

In [None]:
# Make predictions with LSTM model
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs

In [None]:
# Convert model_2 pred probs to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
print(model_2_preds.shape)
model_2_preds[:10]

In [None]:
# Calculate model_2 results
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

In [None]:
baseline_results

### Model 3: GRU

Another popular and effective RNN component is GRU (Gated Recurrent Unit)

The GRU cell has similar features to the LSTM cell but has fewer parameters.

In [None]:
# Build an RNN using the GRU cell
import tf_keras
from tf_keras import layers

# Give this experiment its own, freshly initialised embedding. Reusing the shared
# `embedding` layer would start this model from weights the earlier models already trained.
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     input_length=max_length,
                                     name="embedding_3")

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_3_embedding(x)     # token ids -> dense vectors
# If you want to stack recurrent layers on top of each other, every recurrent layer
# except the last one needs return_sequences=True.
x = layers.GRU(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# The Model class must come from the same Keras package (tf_keras) as the layers above.
model_3 = tf_keras.Model(inputs, outputs, name="model_3_GRU")

In [None]:
model_3.summary()

In [None]:
# Compile the model
# The optimizer is given by name so it is created by the same Keras package that built the
# model; "adam" is Adam with its default settings. Metrics are passed as a list.
model_3.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

# Fit the model
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "model_3_GRU")])

In [None]:
# Make some predictions with our GRU model
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]

In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

In [None]:
# Calculate model 3 results
model_3_results = calculate_results(val_labels, model_3_preds)
model_3_results

### Model 4: a bidirectional RNN

Normal RNNs go from left to right, like reading English. However a bidirectional RNN goes from right to left + left to right.

In [None]:
# Build bidirectional RNN in TensorFlow

import tf_keras
from tensorflow import keras
from tf_keras import layers

# Give this experiment its own, freshly initialised embedding. Reusing the shared
# `embedding` layer would start this model from weights the earlier models already trained.
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     input_length=max_length,
                                     name="embedding_4")

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)  # text -> integer token ids
x = model_4_embedding(x)     # token ids -> dense vectors
# Bidirectional wraps the LSTM so the sequence is processed in both directions.
# To stack another recurrent layer on top, give the wrapped LSTM return_sequences=True.
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# The Model class must come from the same Keras package (tf_keras) as the layers above.
model_4 = tf_keras.Model(inputs, outputs, name="model_4_bidirectional")

In [None]:
# Compile the bidirectional model for binary classification, then print its architecture
model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)
model_4.summary()

In [None]:
# Train the bidirectional model for 5 epochs, logging to TensorBoard
model_4_history = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_4_BIDIRECTIONAL")],
)

In [None]:
# Make predictions with our bidirectional model
model_4_pred_probs = model_4.predict(val_sentences)
# Preview the first 10 probabilities only (as done for the other models)
# instead of dumping the whole prediction array into the cell output.
model_4_pred_probs[:10]

In [None]:
# Convert prediction probabilities to 0/1 labels and drop the size-1 dimensions
model_4_rounded_probs = tf.round(model_4_pred_probs)
model_4_preds = tf.squeeze(model_4_rounded_probs)
model_4_preds[:10]

In [None]:
# Score the bidirectional model's predictions against the validation labels
model_4_results = calculate_results(y_true=val_labels, y_pred=model_4_preds)
model_4_results

In [None]:
# Re-display the baseline metrics so they can be compared with model_4_results
baseline_results

## Convolutional Neural Networks for Text (and other types of sequences)

We've used CNNs for images, but images are typically 2D (height x width), whereas our text data is 1D.

Previously we've used Conv2D for our image data, but now we will use Conv1D.

The typical structure of a Conv1D model for sequences (in our case text) looks like this:

```
Inputs (text) -> Tokenization -> Embedding -> Layer(s) (typically Conv1D + Pooling) -> Outputs (class probabilities)
```

### Model 5: Conv1D ###

For different explanations of parameters see:
* https://poloclub.github.io/cnn-explainer/ (this is for 2d but can relate to 1d data)
* Difference between "same" and "valid" padding: https://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t

In [None]:
# Sanity-check the shapes produced by embedding -> Conv1D -> global max pooling
from tf_keras import layers

test_tokens = text_vectorizer(["this is a test sentence"])
embedding_test = embedding(test_tokens)  # turn target sequence into an embedding

conv_1d = layers.Conv1D(
    filters=32,
    kernel_size=5,  # also referred to as an ngram of 5 (it looks at 5 words at a time)
    padding="valid",
    activation="relu",
)
conv_1d_output = conv_1d(embedding_test)  # pass test embedding through the Conv1D layer

max_pool = layers.GlobalMaxPool1D()
# Equivalent to "get the most important feature", i.e. the feature with the highest value
max_pool_output = max_pool(conv_1d_output)

embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

In [None]:
# Inspect the embedded test sentence
embedding_test

In [None]:
# Inspect the Conv1D layer's output for the embedded test sentence
conv_1d_output

In [None]:
# Inspect the global-max-pooled Conv1D output
max_pool_output

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[0.07606642, 0.12893276, 0.04685416, 0.03963244, 0.03266621,
        0.02844372, 0.03377763, 0.05978166, 0.07423814, 0.04644558,
        0.07084233, 0.06454812, 0.05894979, 0.08935377, 0.06154132,
        0.02082836, 0.10778599, 0.01146116, 0.1356203 , 0.05557969,
        0.05341431, 0.12952252, 0.0616392 , 0.08001035, 0.05808549,
        0.09503245, 0.06640893, 0.06298555, 0.05060241, 0.00689121,
        0.07780851, 0.03054681]], dtype=float32)>
      

In [None]:
# Create 1-dimensional convolutional layer to model sequences
from tf_keras import layers

# Pipeline: raw string -> token ids -> embedding -> Conv1D -> global max pool -> sigmoid
inputs = layers.Input(shape=(1,), dtype="string")
x = embedding(text_vectorizer(inputs))
x = layers.Conv1D(
    filters=64,
    kernel_size=5,
    strides=1,
    padding="valid",
    activation="relu",
)(x)
x = layers.GlobalMaxPool1D()(x)
print(x.shape)
outputs = layers.Dense(units=1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_5_Conv1D")

# Compile Conv1D
model_5.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)

# Get a summary
model_5.summary()

In [None]:
# Train the Conv1D model for 5 epochs, logging to TensorBoard
model_5_history = model_5.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_5_CONV1D")],
)

In [None]:
# Predict probabilities for the validation sentences with the Conv1D model
model_5_pred_probs = model_5.predict(x=val_sentences)
model_5_pred_probs[:10]

In [None]:
# Round the probabilities to 0/1 labels, then drop the size-1 dimensions
model_5_rounded_probs = tf.round(model_5_pred_probs)
model_5_preds = tf.squeeze(model_5_rounded_probs)
model_5_preds[:10]

In [None]:
# Calculate model_5 results and display them, as is done for every other model
model_5_results = calculate_results(val_labels, model_5_preds)
model_5_results

## Model 6: TensorFlow Hub Pretrained Sentence Encoder

Now that we've built a few of our own models, let's try and use transfer learning for NLP, specifically using TensorFlowHub's universal sentence encoder.

In [None]:
# Show the sample sentence that is embedded with the Universal Sentence Encoder in the next cell
sample_sentence

In [None]:
import tensorflow_hub as hub

# Load the pretrained Universal Sentence Encoder (USE)
embed = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2"
)

# Embed our sample sentence together with two reference sentences
sentences_to_embed = [
    sample_sentence,
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding",
]
embeddings = embed(sentences_to_embed)

# Show the first 50 values of the first sentence's embedding
print(embeddings[0][:50])

# The following are example embedding output of 512 dimensions per sentence
# Embedding for: The quick brown fox jumps over the lazy dog.
# [-0.03133016 -0.06338634 -0.01607501, ...]
# Embedding for: I am a sentence for which I would like to get its embedding.
# [0.05080863 -0.0165243   0.01573782, ...]


In [None]:
# Check the shape of the first sentence's embedding vector
embeddings[0].shape

In [None]:
# Wrap the pretrained USE model from TensorFlow Hub as a Keras layer.
# trainable=False keeps the pretrained weights frozen.
sentence_encoder_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    name="USE",
    trainable=False,
    dtype=tf.string,
    input_shape=[],
)



In [None]:
# Create model using the sequential API: frozen USE encoder + small dense classifier head
model_6 = tf.keras.Sequential(name="model_6_USE")
model_6.add(sentence_encoder_layer)
model_6.add(layers.Dense(64, activation="relu"))
model_6.add(layers.Dense(64, activation="relu"))
model_6.add(layers.Dense(1, activation="sigmoid", name="output_layer"))

# Compile the model
model_6.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)

In [None]:
# Print the layer-by-layer summary of the USE-based model
model_6.summary()

In [None]:
# Train a classifier on top of USE parameters

In [None]:
# Fit the model on the training split only
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "model_6_encoder")])

# NOTE: the model must NOT also be fitted on (val_sentences, val_labels).
# The validation split is what model_6 is evaluated on below, so training on it
# leaks the evaluation data into the model and inflates every reported metric.

In [None]:
# Predict probabilities for the validation sentences with the USE TF Hub model
model_6_pred_probs = model_6.predict(x=val_sentences)
model_6_pred_probs[:10]

In [None]:
# Round the probabilities to 0/1 labels, then drop the size-1 dimensions
model_6_rounded_probs = tf.round(model_6_pred_probs)
model_6_preds = tf.squeeze(model_6_rounded_probs)
model_6_preds[:10]

In [None]:
# Score the model_6 predictions against the validation labels
model_6_results = calculate_results(y_true=val_labels, y_pred=model_6_preds)
model_6_results

In [None]:
# Re-display the baseline metrics so they can be compared with model_6_results
baseline_results

## Model 7: TF Hub Pretrained USE but with 10% of training data

Transfer learning really helps when you don't have a large data set.  
To see how our model performs on a small data set, let's replicate `model_6` except we'll train it on 10% of the data.

In [None]:
## NOTE: Making data splits like below leads to data leakage (model_7 trained on 10% data outperforms model_6 trained on 100% data)
## DO NOT MAKE DATA SPLITS THAT LEAK DATA FROM VALIDATION/TEST SETS INTO TRAINING SET

# Randomly sample 10% of the rows of train_df_shuffled (seeded for reproducibility).
# Presumably the validation split was carved out of this same DataFrame, which is
# why sampling from it directly can pull validation rows into the training subset.
train_10_percent = train_df_shuffled[["text", "target"]].sample(frac=0.1, random_state=42)

# Separate the sampled rows into plain Python lists of sentences and labels
train_sentences_10_percent = train_10_percent["text"].to_list()
train_labels_10_percent = train_10_percent["target"].to_list()
len(train_sentences_10_percent), len(train_labels_10_percent)

> 🔑 **Note:** Be very careful when making train/val/test splits that you don't leak data across the datasets. Otherwise your model evaluation metrics will be wrong.  
If something looks too good to be true (for example, a model trained on 10% of the data outperforming the same model trained on 100% of the data), trust your gut and go back through your code to find where the error may lie.

In [None]:
# Making a better dataset split (no data leakage):
# take the first 10% of the existing training split, so no validation sample can be included
train_10_percent_split = int(0.1 * len(train_sentences))
first_10_percent = slice(0, train_10_percent_split)
train_sentences_10_percent = train_sentences[first_10_percent]
train_labels_10_percent = train_labels[first_10_percent]
len(train_labels_10_percent)

In [None]:
# Count how many samples of each class are in the 10% training labels
train_labels_10_percent_series = pd.Series(np.asarray(train_labels_10_percent))
train_labels_10_percent_series.value_counts()

In [None]:
# Preview the first 10 rows of the randomly sampled train_10_percent DataFrame
train_10_percent.head(10)

In [None]:
# Check the number of targets in the randomly sampled train_10_percent DataFrame.
# Note: this is the frame sampled from train_df_shuffled, not the sliced
# train_sentences_10_percent / train_labels_10_percent split used to fit model_7.
train_10_percent.loc[:, "target"].value_counts()

## Model 7:

In [None]:
# Create model 7: the frozen USE encoder followed by a small dense classifier head
model_7_layers = [
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid", name="output_layer"),
]
model_7 = tf.keras.Sequential(model_7_layers, name="model_7_USE_10_percent")

# Compile model 7
model_7.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)

model_7.summary()

In [None]:
# Train model 7 on the 10% training subset, validating on the full validation split
history_model_7 = model_7.fit(
    train_sentences_10_percent,
    train_labels_10_percent,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback(SAVE_DIR,
                                    "tf_hub_sentence_encoder_10_percent_correct_split")
    ],
)

In [None]:
#cloned model 7
#model_7_clone = tf.keras.models.clone_model(model_6)

# Compile model 7 clone
#model_7_clone.compile(loss="binary_crossentropy",
#                optimizer=tf.keras.optimizers.Adam(),
#                metrics="accuracy")

#model_7_clone.summary()



In [None]:
# Fit the cloned model
#history_model_7_clone = model_7_clone.fit(train_sentences_10_percent,
#                              train_labels_10_percent,
#                              epochs=5,
#                              validation_data=(val_sentences, val_labels),
#                              callbacks=[create_tensorboard_callback(SAVE_DIR,
#                                                                     "model_7_cloned_encoder")])


In [None]:
# Get predictions model 7
#model_7_clone_preds = model_7_clone.predict(val_sentences)
#model_7_clone_preds[:10]

In [None]:
#model_7_clone_pred_probs = tf.squeeze(tf.round(model_7_clone_preds))
#model_7_clone_pred_probs[:10]

In [None]:
# Turn pred probs into labels
#model_7_clone_results = calculate_results(val_labels, model_7_clone_pred_probs)
#model_7_clone_results

In [None]:
# Re-display the baseline metrics for reference
baseline_results

## Comparing the performance of each of our models ##


In [None]:
# Score model_7 on the validation set so it gets a real row in the comparison table.
# (A placeholder string here would be broadcast across every metric and turn the
# whole table into non-numeric data, which breaks scaling, sorting and plotting.)
model_7_pred_probs = model_7.predict(val_sentences)
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_results = calculate_results(val_labels, model_7_preds)

# Combine model results into a dataframe (one column per model),
# then transpose so each model becomes a row and each metric a column
all_model_results = pd.DataFrame({"0_baseline": baseline_results,
                                  "1_simple_dense": model_1_results,
                                  "2_lstm": model_2_results,
                                  "3_gru": model_3_results,
                                  "4_bidirectional": model_4_results,
                                  "5_conv1d": model_5_results,
                                  "6_tf_hub_use_encoder": model_6_results,
                                  "7_tf_hub_use_encoder_10_percent": model_7_results,
                                  }).transpose()
all_model_results

In [None]:
# Reduce the accuracy to the same scale as the other metrics
#all_model_results["accuracy"] = all_model_results["accuracy"] / 100
#all_model_results

In [None]:
# Plot and compare all of the model results
#all_model_results.plot(kind="bar", figsize=(10,7)).legend(bbox_to_anchor=(1.0,1.0))

In [None]:
# Sort model results by F1-score
#all_model_results.sort_values("f1", ascending=False)["f1"].plot(kind="bar", figsize=(8,4))

## Uploading our model training logs to TensorBoard.Dev

We can further inspect our model's performance using tensorboard.dev

In [None]:
# Load the TensorBoard notebook extension (needed before TensorBoard can be used from notebook cells)
%load_ext tensorboard

In [None]:
import datetime
import os

# Timestamped log directory so that each run writes into its own folder under "logs"
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_timestamp)

# TensorBoard callback that writes to the timestamped directory (histogram_freq=1)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

In [None]:
# View TensorBoard logs of tranfer learning modelling experiments (plus all other models)
# Upload TensorBoard.Dev records
!tensorboard dev upload

Now that I've run the cell above, my modelling experiments are visible in TensorBoard.  

**📃 Note:** TensorBoard is good for quickly showing experiments, but for larger projects consider using Weights and Biases: https://wandb.ai/site

## Saving and loading a trained model: ##

There are two main formats to save a model to in TensorFlow:
1. HDF5 format
2. The `SavedModel` format (this is the default when using TensorFlow)



In [None]:
# Show model_6's metrics as a reference point before saving and reloading the model
model_6_results

In [None]:
# Save our TF Hub Sentence Encoder model to a single HDF5 file
model_6.save(filepath="model_6.h5")

In [None]:
# Load model with custom Hub Layer (requires HDF5 format)
import tensorflow_hub as hub

# Map the saved layer class name to hub.KerasLayer so load_model can rebuild the USE layer
hub_custom_objects = {"KerasLayer": hub.KerasLayer}
loaded_model_6 = tf.keras.models.load_model("model_6.h5",
                                            custom_objects=hub_custom_objects)

In [None]:
# Evaluate the reloaded HDF5 model on the validation split
loaded_model_6.evaluate(x=val_sentences, y=val_labels)

In [None]:
# Compare against the metrics of the original (in-memory) model_6
model_6_results

Now let's save to the `SavedModel` format... (more on this here:
https://www.tensorflow.org/tutorials/keras/save_and_load)

In [None]:
# Save the TF Hub Sentence Encoder model in the default SavedModel format
model_6.save(filepath="model_6_SavedModel_format")

In [None]:
# Reload the model that was saved in the SavedModel format
loaded_model_6_savedmodel = tf.keras.models.load_model(filepath="model_6_SavedModel_format")

In [None]:
# Evaluate the reloaded SavedModel on the validation split
loaded_model_6_savedmodel.evaluate(x=val_sentences, y=val_labels)

## Find the most wrong examples

* If our best model still isn't perfect, which examples is it still getting wrong? And of these wrong examples, which ones is it getting *most* wrong?
These are the ones with a prediction probability closest to the opposite class.
For example, a sample that should have a label of 0 but for which our model predicts a probability of 0.999 (really close to 1), and vice versa.

In [None]:
# Plot a confusion matrix of model_6's validation predictions against the true labels
make_confusion_matrix(y_true=val_labels, y_pred=model_6_preds)

In [None]:
# Download a pretrained model from Google Storage
!wget https://storage.googleapis.com/ztm_tf_course/08_model_6_USE_feature_extractor.zip
!unzip 08_model_6_USE_feature_extractor.zip

In [None]:
# Load the downloaded pretrained model and evaluate it on the validation split
model_6_daniels = tf.keras.models.load_model(filepath="08_model_6_USE_feature_extractor")
model_6_daniels.evaluate(x=val_sentences, y=val_labels)

In [None]:
# Predict probabilities with the downloaded model, then round them to 0/1 labels
model_6_daniels_pred_probs = model_6_daniels.predict(x=val_sentences)
model_6_daniels_rounded_probs = tf.round(model_6_daniels_pred_probs)
model_6_daniels_preds = tf.squeeze(model_6_daniels_rounded_probs)
model_6_daniels_preds[:10]

In [None]:
# Build a DataFrame with one row per validation sample: the sentence, its true label,
# the downloaded model's predicted label and the predicted probability
import pandas as pd

max_preds_disaster_imported = pd.DataFrame(
    data={
        "text": val_sentences,
        "y_true": val_labels,
        "y_pred": model_6_daniels_preds,
        "pred_conf": tf.squeeze(model_6_daniels_pred_probs),
    }
)
max_preds_disaster_imported

In [None]:
# Create a dataframe with TEST sentences and predictions

#test_preds_disaster = pd.DataFrame({"text":test_df_shuffled["text"].to_numpy(),
#                                    "pred":my_predictions,
#                                    "prob":tf.squeeze(my_prediction_probs)})
#test_preds_disaster

In [None]:
# Flag every row where the predicted label matches the true label
labels_match = max_preds_disaster_imported["y_true"].eq(max_preds_disaster_imported["y_pred"])
max_preds_disaster_imported["pred_correct"] = labels_match
max_preds_disaster_imported

In [None]:
# Keep only the wrong predictions and rank them by predicted probability, highest first
wrong_preds = max_preds_disaster_imported[~max_preds_disaster_imported["pred_correct"]]

# Take (up to) the first 100 rows. .copy() makes an independent frame, so the column
# assignments below do not write into a slice (which raises SettingWithCopyWarning).
top_100_wrong = wrong_preds.sort_values("pred_conf", ascending=False)[:100].copy()

# Replace the numeric labels with readable class names
label_names = {0: "not disaster",
               1: "disaster"}
top_100_wrong["y_true"] = top_100_wrong["y_true"].replace(label_names)
top_100_wrong["y_pred"] = top_100_wrong["y_pred"].replace(label_names)
top_100_wrong

In [None]:
#test_preds_disaster_converted = test_preds_disaster.sort_values("prob", ascending=False)
#test_preds_disaster_converted["pred"] = test_preds_disaster["pred"].replace({0:"not disaster",
#                                                                             1:"disaster"})
#test_preds_disaster_converted

In [None]:
# Check the highest probability disasters from test data set
#for row in test_preds_disaster_converted[:10].itertuples():
#  _, text, pred, prob = row
#  print(f"Pred: {pred}, Prob: {prob}")
#  print(f"Text: \n{text}\n")
#  print("----\n")

In [None]:
# Check the lowest probability disasters from test data set
#for row in test_preds_disaster_converted[:-10].itertuples():
#  _, text, pred, prob = row
#  print(f"Pred: {pred}, Prob: {prob}")
#  print(f"Text: \n{text}\n")
#  print("----\n")

In [None]:
import pandas as pd

# Collect model_6's validation labels, predicted labels and the row-wise maximum
# of its predicted probabilities in one DataFrame
max_preds_disaster = pd.DataFrame(
    data={
        "y_true": val_labels,
        "y_pred": model_6_preds,
        "pred_conf": model_6_pred_probs.max(axis=1),
    }
)
max_preds_disaster

In [None]:
# Check the false positives (model predicted 1 when should have been 0)
for row in top_100_wrong[:10].itertuples():
  # Each tuple is (index, text, y_true, y_pred, pred_conf, pred_correct)
  _, text, y_true, y_pred, pred_conf, _ = row
  # NOTE: use {y_pred}, not {y_pred,}: the trailing comma formats a 1-tuple such as ('disaster',)
  print(f"Label: {y_true}, Pred: {y_pred}, Prob: {pred_conf}")
  print(f"Text: \n{text}\n")
  print("----\n")

In [None]:
# Check the false negatives (model predicted 0 when should have been 1)
# top_100_wrong only keeps the 100 highest-probability mistakes, so its last rows are not
# guaranteed to be the lowest-probability ones. Rank every wrong prediction instead and
# take the final 10 rows (the mistakes with the lowest predicted probability).
label_names = {0: "not disaster", 1: "disaster"}
all_wrong = (max_preds_disaster_imported[max_preds_disaster_imported["pred_correct"] == False]
             .sort_values("pred_conf", ascending=False))
for row in all_wrong[-10:].itertuples():
  # Each tuple is (index, text, y_true, y_pred, pred_conf, pred_correct)
  _, text, y_true, y_pred, pred_conf, _ = row
  print(f"Label: {label_names[int(y_true)]}, Pred: {label_names[int(y_pred)]}, Prob: {pred_conf}")
  print(f"Text: \n{text}\n")
  print("----\n")

## Your challenge: Predicting on tweets from the wild

Pass the tweets through the model... is the tweet a disaster or not a disaster?

# New Section

In [None]:
# 1. Prep data: a handful of unseen sentences to classify as disaster / not disaster
random_tweets = [
    "Pakistan responded by striking Iranian territory.",
    "Only a handful of mostly middle-ranking police and other officials were indicted on criminal negligence and similar charges last year, while top government officials, like the home minister, were cleared of wrongdoing.",
    "On the court, Andreeva is a series of beguiling contradictions.",
    "It said that its priority was to protect civilians and, through diplomacy with American, Arab and African partners, to seek a peaceful solution to the conflict.",
    "The trouble, critics concurred, was that Mr. Schickele was a victim of his own prodigious ability as a pasticheur.",
    "As my breath returned to its regular rate, Haas told me that he valued my music, but that I would need to start believing in myself.",
]


In [None]:
# Predict probabilities for all of the custom sentences in one batch
custom_prediction_probs = model_6.predict(x=random_tweets)
custom_prediction_probs

In [None]:
# Predict every tweet in a single batch: calling predict() once per tweet inside
# the loop is far slower than one batched call.
random_tweet_pred_probs = model_6.predict(random_tweets).squeeze(axis=-1)

for random_tweet, pred_prob in zip(random_tweets, random_tweet_pred_probs):
  # Round the probability to a plain 0/1 label so it prints as a number, not a tensor repr
  pred = int(np.round(pred_prob))
  print(f"Pred: {pred}, Prob: {pred_prob:.4f}")
  print(f"Text: {random_tweet}")
  print(f"---------")

## Extracurricular

In [None]:
# Build model 1 with the sequential API, adding one layer at a time:
# string input -> text vectorizer -> embedding -> average pooling -> sigmoid output
model_1_sequential = tf.keras.Sequential(name="model_1_sequential")
model_1_sequential.add(tf.keras.Input(shape=(1,), dtype=tf.string, name="input_layer0"))
model_1_sequential.add(text_vectorizer)
model_1_sequential.add(embedding)
model_1_sequential.add(layers.GlobalAveragePooling1D())
model_1_sequential.add(layers.Dense(1, activation="sigmoid", name="output_layer"))

model_1_sequential.summary()

In [None]:
# Compile the sequential version of model 1 for binary classification
model_1_sequential.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)

# Train for 5 epochs, validating on the validation split and logging to TensorBoard
model_1_sequential.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_1_sequential")],
)

In [None]:
# Create an LSTM model - Model 2 with Sequential
# The model name belongs on the Sequential model itself, not on its Dense layer;
# the output layer is named "output_layer" to match the other sequential models.
model_2_sequential = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype="string"),
    text_vectorizer,
    embedding,
    layers.LSTM(64),
    layers.Dense(1, activation="sigmoid", name="output_layer")
], name="model_2_LSTM_sequential")

In [None]:
# Compile the sequential LSTM model for binary classification
model_2_sequential.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)

# Train for 5 epochs, validating on the validation split and logging to TensorBoard
model_2_sequential.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_2_sequential_log")],
)

In [None]:
# Model 5 rebuilt with the Sequential API:
# string input -> text vectorizer -> embedding -> Conv1D -> global max pool -> sigmoid output
model_5_sequential_layers = [
    layers.Input(shape=(1,), dtype=tf.string, name="input_layer"),
    text_vectorizer,
    embedding,
    layers.Conv1D(filters=64, kernel_size=5, strides=1, padding="valid", activation="relu"),
    layers.GlobalMaxPool1D(),
    layers.Dense(1, activation="sigmoid"),
]
model_5_sequential = tf.keras.Sequential(model_5_sequential_layers)

# Compile for binary classification and print the architecture
model_5_sequential.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics="accuracy",
)
model_5_sequential.summary()

# Train for 5 epochs, validating on the validation split and logging to TensorBoard
model_5_sequential.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_5_sequential_logs")],
)

In [None]:
# Create Keras layer using the pretrained USE layer from TensorFlow Hub with Trainable=TRUE
#sentence_encoder_layer_trainable = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
#                                        input_shape=[],
#                                        dtype=tf.string,
#                                        trainable=True,
#                                        name="USE")

In [None]:
# Create model using sequential API and USE layer is trainable now
#model_6_trainable = tf.keras.Sequential([
#    sentence_encoder_layer_trainable,
#    layers.Dense(64, activation="relu"),
#    layers.Dense(64, activation="relu"),
#    layers.Dense(1, activation="sigmoid", name="output_layer"),
#    ],
#                                        name="model_6_USE_trainable")

In [None]:
# compile model 6 trainable
#model_6_trainable.compile(loss="binary_crossentropy",
#                          optimizer=tf.keras.optimizers.Adam(),
#                          metrics="accuracy")

#model_6_trainable_history = model_6_trainable.fit(train_sentences,
#                                                  train_labels,
#                                                  epochs=5,
#                                                  validation_data=(val_sentences, val_labels),
#                                                  callbacks=[create_tensorboard_callback(SAVE_DIR,
#                                                                                         "model_6_trainable_logs")])

In [None]:
#model_6_trainable.evaluate(val_sentences, val_labels)

In [None]:
# Extracurricular exercise 4:  Train the best model on the whole training data and not on split

# Pull the full set of sentences and labels out of the shuffled training DataFrame
all_train_sentences = train_df_shuffled["text"].to_numpy()
all_train_labels = train_df_shuffled["target"].to_numpy()

# test_size=1 is an integer, i.e. an absolute count: exactly one sample is held out as
# "validation", so practically all of the data ends up in the training arrays
train_sentences2, val_sentences2, train_labels2, val_labels2 = train_test_split(
    all_train_sentences,
    all_train_labels,
    test_size=1,
    random_state=42,
)

len(train_sentences2), len(train_labels2), len(val_sentences2), len(val_labels2)




In [None]:
# Build the model
#model_6_full = tf.keras.Sequential([
#    sentence_encoder_layer,
#    layers.Dense(64, activation="relu"),
#    layers.Dense(64, activation="relu"),
#    layers.Dense(1, activation="sigmoid", name="output_layer"),
#], name="model_6_USE_full_training")

#model_6_full.compile(loss="binary_crossentropy",
#                     optimizer=tf.keras.optimizers.Adam(),
#                     metrics="accuracy")

#history_model_6_full = model_6_full.fit(train_sentences2,
#                                        train_labels2,
#                                        epochs=5,
#                                        callbacks=[create_tensorboard_callback(SAVE_DIR,
#                                                                               "model_6_full_training_logs")])

In [None]:
#model_6_full.evaluate(val_sentences, val_labels)

In [None]:
# Make predictions on the test dataset
#model_6_full_pred_probs = model_6_full.predict(test_df_shuffled["text"].to_numpy())
#model_6_full_pred_probs

In [None]:
#model_6_full_preds = tf.squeeze(tf.round(model_6_full_pred_probs))
#model_6_full_preds

In [None]:
# Inspect the test DataFrame
test_df_shuffled

In [None]:
# convert model_6_full_preds into ints
#model_6_full_preds_ints = [int(item) for item in model_6_full_preds]
#print(model_6_full_preds_ints)

# adding the preds column from model_6_full to format it into the sample_submission.csv format

#test_df_shuffled["preds"] = model_6_full_preds
#test_df_shuffled

In [None]:
# drop unnecessary columns
#test_df_shuffled = test_df_shuffled.drop(columns=["keyword", "location", "text"])


#test_df_shuffled = test_df_shuffled.rename(columns={"preds":"target"})
#test_df_shuffled

In [None]:
# convert the target column to ints because pandas made it a float for some reason
#test_df_shuffled["target"] = test_df_shuffled["target"].astype(int)
#sorted_df = test_df_shuffled.sort_values("id", ascending=True)
#sorted_df

In [None]:
# Create CSV from pandas df
#sorted_df.to_csv("my_submission.csv",index=False)

In [None]:
#len(sorted_df)

In [None]:
# Extra curriculum challenge 5 - Use ensemble predictions to get the majority vote (mode) of all the models
import sklearn
from sklearn.ensemble import VotingClassifier

# NOTE: VotingClassifier clones and re-fits its estimators, so it only accepts scikit-learn
# estimators; an already-trained Keras model such as model_1 cannot be placed inside it.
# Instead, take the majority vote over the validation predictions the trained models
# have already produced.
ensemble_member_preds = {
    "2_lstm": model_2_preds,
    "3_gru": model_3_preds,
    "4_bidirectional": model_4_preds,
    "5_conv1d": model_5_preds,
    "6_tf_hub_use_encoder": model_6_preds,
}

# One row per model, one column per validation sample; every entry is a 0/1 label
stacked_preds = np.stack([np.asarray(preds) for preds in ensemble_member_preds.values()])

# Majority vote: predict 1 when more than half of the models predict 1
# (with an odd number of members there can never be a tie)
ensemble_preds = (stacked_preds.sum(axis=0) > len(ensemble_member_preds) / 2).astype(int)

ensemble_results = calculate_results(val_labels, ensemble_preds)
ensemble_results

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# The three classifiers that vote on each sample (hard voting, i.e. majority label)
estimators = []
log_reg = LogisticRegression(solver='liblinear')
estimators.append(('Logistic', log_reg))

tree = DecisionTreeClassifier()
estimators.append(('Tree', tree))

svm_clf = SVC(gamma='scale')
estimators.append(('SVM', svm_clf))

# These classifiers need numeric features and cannot be fitted on raw strings, so the
# sentences are first converted to TF-IDF vectors inside a Pipeline. `voting` keeps the
# usual fit / predict / score interface.
voting = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('voting', VotingClassifier(estimators=estimators)),
])
voting.fit(train_sentences, train_labels)

In [None]:
# Extracurricular challenge #6: plot a confusion matrix comparing the best performing
# model's validation predictions with the validation ground truth labels.
make_confusion_matrix(y_true=val_labels, y_pred=model_6_preds)