# Introduction to NLP fundamentals in TensorFlow

Natural Language Processing (NLP) aims to derive information from natural language, which is typically represented as sequences (e.g. of words or characters).

Another common term for NLP problems is sequence to sequence problems (seq2seq)

In [1]:
## Check for GPU
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-1a884416-69cd-a6f0-4b15-94e83038fbb2)


## GET helper functions

In [2]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-06-09 10:59:43--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.3’


2024-06-09 10:59:43 (116 MB/s) - ‘helper_functions.py.3’ saved [10246/10246]



In [3]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

## DATASET:
Kaggle intro to NLP dataset
Binary Classification (disaster or non disaster)

In [4]:
# Fetch the Kaggle "NLP getting started" archive so the notebook runs on a fresh runtime.
# The request is skipped when the archive, or the train.csv read further down, is already
# on disk, so re-running the cell does not download anything twice.
import os
import urllib.request

DATA_URL = "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"
DATA_ZIP = "nlp_getting_started.zip"

if not (os.path.exists(DATA_ZIP) or os.path.exists("train.csv")):
    urllib.request.urlretrieve(DATA_URL, DATA_ZIP)

In [5]:
# Extract the archive only when it is present and train.csv has not been extracted yet,
# which keeps the cell safe to re-run and harmless when the data is already in place.
# NOTE(review): assumes the archive unpacks train.csv/test.csv into the working directory — confirm.
import os

if os.path.exists("nlp_getting_started.zip") and not os.path.exists("train.csv"):
    unzip_data("nlp_getting_started.zip")

## Visualize a text dataset

In [6]:
# The dataset is small, so pandas is enough to load and inspect it;
# a larger dataset would call for a different loading approach.

import pandas as pd

# Both CSV files are read from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [7]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
train_df["text"][10], train_df["target"][10]

('Three people died from the heat wave so far', 1)

In [9]:
## Shuffle training dataframe

train_df_shuffled = train_df.sample(frac=1, random_state=2)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
3190,4579,emergency%20plan,North Hastings Ontario,Practice your families fire escape plan so eve...,0
6171,8801,sirens,"Nomad, USA",Fuck Sleeping With Sirens.,0
1196,1722,buildings%20burning,,'i'm a Gemini' *children screaming buildings b...,1
680,982,blazing,"Pig Symbol, Alabama",Montgomery come for the blazing hot weather......,1
3358,4808,evacuated,,I got evacuated from the cinema 30 mins throug...,0


In [10]:
## Test dataframe

test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [11]:
## How many examples of each class are there?
## check whether we need to handle the imbalance dataset or not.


train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [12]:
## How many total samples?

len(train_df), len(test_df)

(7613, 3263)

In [13]:
## Print five consecutive tweets, starting at a random position in the shuffled data

import random

# randint is inclusive on both ends, so the 5-row window always fits inside the frame
random_index = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[["text", "target"]][random_index:random_index + 5]

for row in sample_rows.itertuples():
  _, text, target = row  # first field of each tuple is the dataframe index
  label_note = "(real disaster)" if target > 0 else "(not real disaster)"
  print(f"Target: {target}", label_note)
  print(f"Text:\n{text}\n")
  print("_______________________________________________________________")

Target: 1 (real disaster)
Text:
Read an eyewitness account from #Hiroshima from this day in 1945 http://t.co/njAffyjaRz http://t.co/1xHSuEwQn4 #LIFE

_______________________________________________________________
Target: 1 (real disaster)
Text:
.@karijobe and her band killed it tonight.  It was almost loud enough to drown out the tambourine behind me..... @codycarnes @AG_USA

_______________________________________________________________
Target: 0 (not real disaster)
Text:
.@jimmyfallon I crushed squirrel bones with a mortar and pestle for my school's bio dept. not really sure why #WorstSummerJob

_______________________________________________________________
Target: 1 (real disaster)
Text:
@Sport_EN Just being linked to Arsenal causes injury.

_______________________________________________________________
Target: 1 (real disaster)
Text:
CONFIRMED: Sanchez Hazard and Bolasie will be out for the rest of the season. https://t.co/7Ct01nEptL

___________________________________________

In [14]:
### Split data into training and validation dataset

from sklearn.model_selection import train_test_split

In [15]:
## Hold out 10% of the shuffled training data as a validation set

tweet_texts = train_df_shuffled["text"].to_numpy()
tweet_targets = train_df_shuffled["target"].to_numpy()

# random_state is fixed so the same split is produced on every run
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts, tweet_targets, test_size=0.1, random_state=2
)

In [16]:
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape

((6851,), (762,), (6851,), (762,))

In [17]:
## check the dataset

train_sentences[2], train_labels[2]

('On the sneak America has us spoiled. A natural disaster will humble niggas.',
 1)

## Text Representation

In [18]:
train_sentences[:5]

array(['We have different moral systems. Mine rejects the mass murder of innocents yours explicitly endorses such behavior. https://t.co/qadRKEJZ9T',
       '@tsunami_esh ?? hey Esh',
       'On the sneak America has us spoiled. A natural disaster will humble niggas.',
       '~ More wicked weather rolls through Calgary and surrounding areas http://t.co/SxwJyR3K3l http://t.co/aEWGlVqReH',
       '@b24fowler I see that! Crazy how this line blew up.'],
      dtype=object)

In [19]:
## Tokenization

import tensorflow as tf
# TextVectorization is imported from `keras.layers`; the older
# `layers.experimental.preprocessing` path is deprecated and missing from newer Keras releases.
from tensorflow.keras.layers import TextVectorization

# Spell out the layer's default parameters to show what each one controls

text_vectorizer = TextVectorization(
    max_tokens = None, # cap on vocabulary size (None = no cap); the vocabulary includes the OOV token '[UNK]'
    standardize = "lower_and_strip_punctuation", # lowercase the text and remove punctuation
    split = "whitespace", # tokens are separated on whitespace
    ngrams = None, # optionally group tokens into n-grams
    output_mode = "int", # map each token to an integer index
    output_sequence_length = None, # pad/truncate every sequence to this length (None = keep as is)
    # pad_to_max_tokens is deliberately not set: it only applies to the
    # "multi_hot", "count" and "tf_idf" output modes, not to "int".
)

In [20]:
split_1 = train_sentences[0].split()

In [21]:
len(split_1)

18

In [22]:
# Mean tweet length in whitespace-separated tokens, rounded to the nearest whole number

token_counts = [len(sentence.split()) for sentence in train_sentences]
round(sum(token_counts) / len(train_sentences))

15

In [23]:
# setup text vectorization variables

max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (matches the average tweet length computed above)

# output_mode is left at its default ("int"). Padding/truncating each tweet to max_length
# is done by output_sequence_length; pad_to_max_tokens was removed because it only affects
# the "multi_hot", "count" and "tf_idf" output modes and had no effect here.
text_vectorizer = TextVectorization(
    max_tokens = max_vocab_length,
    output_sequence_length = max_length
)

In [24]:
# Fit the text vectorizer to the training text

text_vectorizer.adapt(train_sentences)

In [25]:
# Create a sample sentence and tokenize it

sample_sentence = "There's a flood in my street."
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[299,   3, 228,   4,  13, 734,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [26]:
sample_roman_nepali = ["Mero ghar agaadi baadi aayo.", "k chha khabar"]
text_vectorizer(sample_roman_nepali)

<tf.Tensor: shape=(2, 15), dtype=int64, numpy=
array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [27]:
random_sentence = random.choice(train_sentences)
print(f"Original text:\n {random_sentence}\n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
 @mallelis have you gotten to the post-battle we're-on-a-desolate-planet below-the-Mason-Dixon-Line style electro violin playing yet?

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[   1,   24,   12, 2444,    5,    2, 4837, 6815,    1, 1972, 5712,
        6999,  837,  533,    0]])>

In [28]:
words_in_vocab = text_vectorizer.get_vocabulary()

top_10_words = words_in_vocab[:10]
bottom_10_words = words_in_vocab[-10:]

In [29]:
words_in_vocab[0], words_in_vocab[13], words_in_vocab[100]

('', 'my', 'got')

In [30]:
print(f"Number of words in vocab: {len(words_in_vocab)}")

print(f"Top 10 vocab words: {top_10_words}")
print(f"Bottom 10 vocab words: {bottom_10_words}")

Number of words in vocab: 10000
Top 10 vocab words: ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']
Bottom 10 vocab words: ['pams', 'pampered', 'pampalmater', 'palmoil', 'palmer', 'palm', 'palinfoen', 'palestinian\x89Û', 'paleface', 'pale']


## Creating an Embedding using an Embedding Layer

The parameters for embedding layer are:
* input_dim = the size of our vocabulary
* output_dim = the size of the output embedding vector, e.g. a value of 100 means each token is represented by a vector of length 100
* input_length = length of the sequence being passed to the embedding layer

In [31]:
from tensorflow.keras import layers

# Embedding layer that turns each integer token id into a 128-dimensional vector.
# The same layer object is reused further down when the dense model is built.
embedding = layers.Embedding(
    input_dim = max_vocab_length, # size of the vocabulary
    output_dim = 128, # length of the vector representing each token
    input_length = max_length # number of tokens per input sequence (15)
)

In [32]:
embedding

<keras.src.layers.core.embedding.Embedding at 0x784a59cf3340>

In [33]:
# Get a random sentence from the training set

random_sentence = random.choice(train_sentences)

print(f"Original text:\n {random_sentence}\n\n")
print("Embedded Version:\n")

# Embed the random sentence into embedding vector (dense vector of fixed size)
# the text should be represented in integer first (tokenized first)
sample_embed = embedding(
    text_vectorizer([random_sentence])
)
sample_embed

Original text:
 What's wrong with just a lil smoke and good conversation ????


Embedded Version:



<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04726757,  0.01670973,  0.04904727, ..., -0.01227682,
         -0.03573707, -0.04593468],
        [ 0.02221776, -0.04749314, -0.02378759, ...,  0.03347215,
         -0.01512201,  0.00281183],
        [-0.00900116,  0.03317356,  0.00401207, ...,  0.0275516 ,
          0.03668603, -0.02313507],
        ...,
        [ 0.00279291, -0.04649302, -0.01142583, ..., -0.01914542,
          0.01583065, -0.04471234],
        [ 0.00279291, -0.04649302, -0.01142583, ..., -0.01914542,
          0.01583065, -0.04471234],
        [ 0.00279291, -0.04649302, -0.01142583, ..., -0.01914542,
          0.01583065, -0.04471234]]], dtype=float32)>

In [34]:
sample_embed.shape

# 1 sample
# 15 tokens
# 128 vector each

TensorShape([1, 15, 128])

In [35]:
## checkout single token's embedding

sample_embed[0][0], sample_embed[0][0].shape, random_sentence.split()[0]

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.04726757,  0.01670973,  0.04904727, -0.04685175, -0.01963326,
        -0.00907912, -0.02427278,  0.03266587, -0.0338637 ,  0.01050285,
        -0.00965906,  0.02987982,  0.03331106, -0.03346853,  0.00928337,
        -0.02656658, -0.0335229 ,  0.01912672,  0.02131991,  0.02984195,
         0.03741455, -0.02174586, -0.02459011,  0.01945672,  0.01218174,
        -0.04193943, -0.01439489,  0.04658927,  0.02675501,  0.00709198,
        -0.03154166, -0.00891539,  0.03908256, -0.01491009,  0.01347896,
        -0.01935532, -0.04824799, -0.04408915,  0.00118147,  0.02766338,
        -0.0180411 ,  0.00195096, -0.02583572,  0.00742159, -0.04240733,
         0.02009088,  0.02519837,  0.02009239, -0.01995727,  0.02233947,
        -0.00239402, -0.04499523,  0.00160396, -0.03606279, -0.02254009,
         0.04367675, -0.00795473,  0.01061885,  0.0380662 ,  0.0373693 ,
        -0.01061829,  0.03230781,  0.03144096, -0.01316042, -0.04544898,
  

## Modelling a text dataset (running series of experiments)

Now that our text data has been converted into numbers, we can run a series of modelling experiments.

* Model 0: Naive Bayes
* Model 1: Feed-Forward neural network (dense model)
* Model 2: LSTM
* Model 3: GRU
* Model 4: Bidirectional-LSTM
* Model 5: 1D Convolutional Neural Network
* Model 6: TensorFlow Hub Pretrained Feature Extractor (using transfer learning for NLP)
* Model 7: Same as model 6 with 10% of training data

## MODEL 0: Naive Bayes (baseline model)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF text representation feeding a multinomial Naive Bayes classifier
baseline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
model_0 = Pipeline(baseline_steps)

# Train on the raw training sentences; the pipeline handles vectorization itself
model_0.fit(train_sentences, train_labels)

In [37]:
# Score the baseline pipeline on the held-out validation split

baseline_score = model_0.score(val_sentences, val_labels)
print(f"Baseline model accuracy: {baseline_score*100:.2f}%")

Baseline model accuracy: 78.48%


In [38]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [39]:
# Make predictions

baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [40]:
## Creating a function to track evaluation methods
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_preds(y_true, y_pred):
  """Summarise classification quality as percentages.

  Args:
    y_true: true labels
    y_pred: predicted labels

  Returns:
    dict with the keys "accuracy", "precision", "recall" and "f1", each
    multiplied by 100. Precision, recall and F1 are computed with
    average="weighted".
  """
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average = "weighted")
  precision, recall, f1 = weighted_scores[:3]  # fourth element (support) is not reported

  return {
      "accuracy": accuracy_score(y_true, y_pred) * 100,
      "precision": precision * 100,
      "recall": recall * 100,
      "f1": f1 * 100
  }

In [41]:
model_0_results = evaluate_preds(val_labels, baseline_preds)
model_0_results

{'accuracy': 78.4776902887139,
 'precision': 80.69823266996366,
 'recall': 78.4776902887139,
 'f1': 77.77653657404298}

## Model 1: Feed-Forward Neural Network (dense model)

In [42]:
# Create a tensorboard callback

from helper_functions import create_tensorboard_callback

# Create a directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [43]:
from tensorflow.keras import layers

# Functional-API model: raw string -> token ids -> embeddings -> mean over tokens -> sigmoid output.
# Reuses the text_vectorizer and embedding objects created in earlier cells.
inputs = layers.Input(shape=(1,), dtype = tf.string) # each example is a single string
x = text_vectorizer(inputs) # turn the input text into integer token ids
x = embedding(x) # look up an embedding vector for every token id
x = layers.GlobalAveragePooling1D()(x) # average the token vectors into one vector per example
outputs = layers.Dense(1, activation = "sigmoid")(x) # single sigmoid unit for the binary target

model_1 = tf.keras.Model(inputs, outputs, name = "model_1_dense")



In [44]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [45]:
# compiling the model

model_1.compile(
    loss = "binary_crossentropy",
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ["accuracy"]
)

In [46]:
train_labels[:10]

array([1, 0, 1, 1, 0, 1, 0, 1, 0, 0])

In [47]:
# Fit the model on the training data, validating on the held-out split.
# The TensorBoard callback prepared above (create_tensorboard_callback + SAVE_DIR) is now
# actually passed to fit, so this run's logs are written under SAVE_DIR for later comparison.
model_1_history = model_1.fit(
    x = train_sentences,
    y = train_labels,
    epochs = 10,
    validation_data = (val_sentences, val_labels),
    callbacks = [
        create_tensorboard_callback(
            dir_name = SAVE_DIR,
            experiment_name = "model_1_dense"
        )
    ]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
model_1.evaluate(val_sentences, val_labels)



[0.6548654437065125, 0.7650918364524841]

In [51]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[0]



array([0.9998834], dtype=float32)

## Learn about GlobalAveragePooling1D()

In [52]:
# Convert model prediction probabilities to label formats

model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))

In [53]:
print(evaluate_preds(val_labels, model_1_preds))

{'accuracy': 76.50918635170603, 'precision': 76.52465539245115, 'recall': 76.50918635170603, 'f1': 76.41969683634184}
