# Introduction to NLP Fundamentals in TensorFlow
NLP has the goal of deriving information out of natural language (which could be sequences of text or speech).
Another common term for NLP problems is sequence-to-sequence problems (seq2seq).

In [None]:
!nvidia-smi -L

## Get helper functions

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/refs/heads/main/extras/helper_functions.py

In [None]:
from helper_functions import create_tensorboard_callback,unzip_data,plot_loss_curves,compare_historys

## Get a text dataset
The dataset we're going to be using is Kaggle's introduction to NLP dataset (text samples of tweets labelled as disaster or not disaster).

In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

In [None]:
# unzip the data
unzip_data("nlp_getting_started.zip")

In [None]:
import tensorflow as tf
import pandas as pd


In [None]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
train_df.head()

In [None]:
# shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1,random_state=42)

In [None]:
train_df_shuffled.head()

In [None]:
# what does the tesst dataframe look like
test_df.head()

In [None]:
# how many examples of each class?
train_df.target.value_counts()

In [None]:
len(train_df),len(test_df)

In [None]:
# Show five consecutive rows of the shuffled training set, starting at a random offset
import random
random_index = random.randint(0, len(train_df) - 5)  # leave room for a 5-row window
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]
for text, target in sample_rows.itertuples(index=False):
    label_note = "(real diaster)" if target > 0 else "(not real diaster)"
    print(f"target: {target}", label_note)
    print(f"Text:\n{text}\n")
    print("__" * 10, "\n")

### Split data into training and validation datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the shuffled training data for validation
all_sentences = train_df_shuffled["text"].to_numpy()
all_labels = train_df_shuffled["target"].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [None]:
len(train_sentences),len(train_labels),len(val_sentences),len(val_labels)

In [None]:
# check the first ten samples
train_sentences[:10],train_labels[:10]

## Converting text into numbers (machine learning models don't understand raw text)
When dealing with a text problem, one of the first things you'll have to do before you can build a model is to convert your text to numbers.
There are a few ways to do this, namely:
* *Tokenization* - direct mapping of a token (a token could be a word, a character, or something in between) to a number.
* *Embedding* - create a matrix of feature vectors, one for each token (the size of the feature vector can be defined, and this embedding can be learned).

### Text vectorization (tokenization)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
# Walk through the main TextVectorization arguments.
# This instance is re-created with tuned settings a few cells further down.
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (an <OOV> token is added automatically)
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None, # create groups of n words
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None ,# how long you want your sequences to be
                                    )

In [None]:
# Find the average number of tokens (words) per training tweet, rounded to the nearest integer.
# The division has to happen inside round(): rounding the token total (already an integer)
# first is a no-op and leaves the average unrounded.
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

In [None]:
# Set up text vectorization variables
max_vocab_length =10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (e.g. how many words from a tweet does the model see?)


In [None]:
# Re-create the vectorizer with a capped vocabulary and a fixed output length
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [None]:
# Fit the text vectorizer to the training sentence
text_vectorizer.adapt(train_sentences)

In [None]:
# Create a sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

In [None]:
# Pick a random training sentence and show it next to its tokenized form
random_sentence = random.choice(train_sentences)
random_sentence_vectorized = text_vectorizer(random_sentence)  # vectorize once, reuse below
print(f"Original text:\n {random_sentence} \n\nVectorized Version: {random_sentence_vectorized} ")
random_sentence_vectorized

In [None]:
# check if token is a sentence have the same int value across different sentences
sample_sentence_two = "schools are the best Western in Lit lit litterally.. LiTTErALLy.."
text_vectorizer(sample_sentence_two)

In [None]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary() # get all of the unique words in the vocabulary
top_5_words = words_in_vocab[:5] # get the 5 most common words (the first 5 entries)
bottom_5_words = words_in_vocab[-5:] # get the 5 least common words (the last 5 entries)
# Display both slices together with the total vocabulary size
top_5_words,bottom_5_words,len(words_in_vocab)

### Creating an Embedding using an Embedding Layer
To make our embedding, we're going to use TensorFlow's embedding layer.
The parameters we care most about for our embedding layer:
* `input_dim` = the size of our vocabulary
* `output_dim` = the size of the output embedding vector; for example, a value of 100 would mean each token gets represented by a vector 100 long
* `input_length` = length of the sequences being passed to the embedding layer

In [None]:
 from tensorflow.keras import layers
embedding = layers.Embedding(input_dim=max_vocab_length,
                            output_dim=128,
                             input_length=max_length
                            )


In [None]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"original text: \n{random_sentence} ")
# Tokenize the raw string, then look up an embedding vector for each token id
random_sentence_vectorized = text_vectorizer(random_sentence)
sample_embed = embedding(random_sentence_vectorized)
# Display the token ids next to their embeddings
random_sentence_vectorized,sample_embed

In [None]:
# Check out a single token's embedding: the first token's vector, that vector's shape,
# and the id of the same (first) token. Previously the id shown was that of token 14,
# which did not correspond to the embedding being displayed.
sample_embed[0], sample_embed[0].shape, random_sentence_vectorized[0]

## Modeling a text dataset (running a series of experiments)
Now that we've got a way to turn our text sequences into numbers, it's time to start building a series of modelling experiments.
We'll start with a baseline and move on from there.

* Model 0: Naive Bayes (baseline)
* Model 1: Feed-forward neural network (dense model)
* Model 2: LSTM model (RNN)
* Model 3: GRU model (RNN)
* Model 4: Bidirectional-LSTM model (RNN)
* Model 5: 1D Convolutional Neural Network (CNN)
* Model 6: TensorFlow Hub pretrained feature extractor (using transfer learning for NLP)
* Model 7: Same as model 6 with 10% of training data

  How we are going to approach all of these?

  Use the standard steps in modelling with tensorflow:
  * Create a model
  * Build a model
  * Fit a model
  * Evaluate a model
  

### Model 0: Getting a baseline 
As with all machine learning modelling experiments, it's important to create a baseline model so you've got a benchmark for future experiments to build on.

In [None]:
# !pip install scikit-learn
import sklearn
print(sklearn.__version__)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier
baseline_steps = [
    ("tfidf", TfidfVectorizer()),  # text -> TF-IDF weighted numeric features
    ("clf", MultinomialNB()),      # classifier trained on those features
]
model_0 = Pipeline(steps=baseline_steps)

# Train the whole pipeline on the raw training sentences
model_0_history = model_0.fit(train_sentences, train_labels)

In [None]:
model_0_history

In [None]:
# Evaluate our baseline model
baseline_score = model_0.score(val_sentences,val_labels)
baseline_score

In [None]:
print(f"Our baseline model achievs an accuracy of: {baseline_score}")
train_df.target.value_counts()

In [None]:
# make predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:2],val_sentences[:2]

### Creating an evaluation function for our model experiments
We could evaluate all of our models' predictions with different metrics every time; however, this would be cumbersome and can easily be fixed by using a function.
Let's create one to compare our model's predictions with the truth labels using the following metrics:
* Accuracy
* Precision
* Recall
* F1-score

In [None]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
def calculate_results(y_true, y_preds):
    """
    Score predictions from a binary classification model against the true labels.

    Args:
        y_true: ground-truth labels.
        y_preds: predicted labels, aligned with y_true.

    Returns:
        dict with keys "accuracy" (scaled to a percentage), "precision",
        "recall" and "f1" (the latter three computed with average="weighted").
    """
    # The fourth value returned (support) is not needed here.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_preds, average="weighted"
    )
    return {
        "accuracy": accuracy_score(y_true, y_preds) * 100,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [None]:
baseline_results = calculate_results(y_true=val_labels,y_preds=baseline_preds)

In [None]:
baseline_results

### Model 1: A simple dense model

In [None]:
# Directory where TensorBoard logs are saved
SAVE_DIR = "model_logs"

# Model 1 (Functional API): raw string -> token ids -> embeddings -> average over tokens -> sigmoid
from tensorflow.keras import layers
inputs = layers.Input(shape=(), dtype=tf.string, name="input_layer")
token_ids = text_vectorizer(inputs)         # turn the input text into numbers
token_embeddings = embedding(token_ids)     # embed the numberized inputs
pooled = layers.GlobalAveragePooling1D(name="globalAverage_layer")(token_embeddings)
outputs = layers.Dense(1, activation="sigmoid", name="output_layer")(pooled)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [None]:
model_1.summary()

In [None]:
# compile the model
model_1.compile(loss="binary_crossentropy",
               optimizer=tf.keras.optimizers.Adam(),
               metrics=["accuracy"])

In [None]:
# Fit the model
# NOTE(review): the logs go under "TensorBoard" here, not SAVE_DIR ("model_logs") which the
# later models pass; the %tensorboard cell further down reads this "TensorBoard/model_1_dense"
# location, so change both together if this is ever unified.
model_1_history = model_1.fit(x=train_sentences,
                             y=train_labels,
                             epochs=5,
                             validation_data=(val_sentences,val_labels),
                             callbacks=[create_tensorboard_callback("TensorBoard","model_1_dense")])

In [None]:
plot_loss_curves(model_1_history)

In [None]:
model_1.evaluate(val_sentences,val_labels)

In [None]:
model_1_pred_probs = model_1.predict(val_sentences)

In [None]:
model_1_pred_probs.shape,val_sentences[:5],model_1_pred_probs[:5]


In [None]:
# convert model prediction probablity to a label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

In [None]:
model_1_results = calculate_results(y_true=val_labels,y_preds=model_1_preds)

In [None]:
model_1_results

In [None]:
baseline_results

In [None]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

## Visualizing Learned Embeddings

In [None]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

In [None]:
# Model 1 summary
model_1.summary()

In [None]:
# Get the weight matrix of the embedding layer
# (the numerical representation of each token, learned during model_1's 5 training epochs).
# The layer object is read directly: model_1 was built with this same `embedding` instance,
# whereas looking it up by an auto-generated name such as "embedding_1" breaks whenever the
# layer-name counter differs (for example after a kernel restart).
embed_weights = embedding.get_weights()[0]
# .size counts every weight in the matrix; len() would only count its rows
print(f"total parameter of embedding weights: {embed_weights.size}")
print(embed_weights[:20])
print(f"shape of embedding: {embed_weights.shape} ")

Now that we've got the embedding matrix our model has learned to represent our tokens, let's see how we can visualize it.
To do so, TensorFlow has a handy tool called the Embedding Projector.
TensorFlow also has an incredible guide on word embeddings themselves.

In [None]:
words_in_vocab[:10]

In [None]:
# Create embedding files (adapted from the TensorFlow word embeddings docs):
# vectors.tsv holds one tab-separated embedding per line, metadata.tsv the matching word.
import io

# `with` guarantees both files are flushed and closed even if a write fails part-way through
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(words_in_vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = embed_weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")

In [None]:
import tensorboard
print(tensorboard.__version__)

In [None]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir /kaggle/working/TensorBoard/model_1_dense

## Recurrent Neural Networks (RNN's)
RNNs are useful for sequence data.

The promise of a recurrent neural network is to use the ***`representation`*** of a previous input to aid the representation of a later input.

### model 2: LSTM
LSTM = long short-term memory (one of the most popular RNN cells).
Our structure of an RNN typically looks like this:
```
Input(text) -> Tokenize -> Embedding -> Layers(RNNs/Dense) -> Output(label probability)
```

In [None]:
# Create an LSTM model
from tensorflow.keras import layers

# Give model 2 its own embedding layer. The shared `embedding` instance has already been
# trained inside model_1; reusing it would start this experiment from learned weights
# (and keep updating model_1's), so the results would not be comparable.
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     name="embedding_2")

inputs = layers.Input(shape=(1,),dtype="string")
x = text_vectorizer(inputs)      # text -> token ids
x = model_2_embedding(x)         # token ids -> embedding vectors
x = layers.LSTM(64)(x)           # sequence of embeddings -> one 64-unit vector
outputs = layers.Dense(1,activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs,outputs,name="model_2_LSTM")

In [None]:
model_2.summary()

In [None]:
model_2.compile(loss="binary_crossentropy",
               optimizer=tf.keras.optimizers.Adam(),
               metrics=["accuracy"])

In [None]:
model_2_history = model_2.fit(train_sentences,
                             train_labels,
                             epochs=5,
                             validation_data=(val_sentences,val_labels),
                             callbacks=[create_tensorboard_callback(SAVE_DIR,"model_2_LSTM")])

In [None]:
# make prediction with LSTM model
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]


In [None]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

In [None]:
val_labels[:10]

In [None]:
# Calculate model 2 results
model_2_results = calculate_results(y_true=val_labels,y_preds=model_2_preds)
model_2_results

In [None]:
baseline_results

In [None]:
from math import e
import math
print(e)
def tanh_func(z):
    """
    Hyperbolic tangent: (e**z - e**(-z)) / (e**z + e**(-z)).

    Args:
        z: a Python number, or a tensor when used as a layer activation.

    Returns:
        tanh(z) as a float for Python numbers, otherwise the result of tf.math.tanh.

    The textbook formula is not evaluated literally because e**z overflows for
    large |z| (OverflowError for Python floats, inf/inf = NaN for tensors).
    The library implementations are numerically stable, and the TensorFlow op
    keeps the computation differentiable inside the model graph.
    """
    if isinstance(z, (int, float)):
        return math.tanh(z)
    return tf.math.tanh(z)
tanh_func(1.)

### build GRU model 3
Another popular and effective RNN component is the GRU, or gated recurrent unit.
The GRU cell has similar features to an LSTM cell but has fewer parameters.

In [None]:
# Give model 3 its own embedding layer. The shared `embedding` instance has already been
# trained inside model_1; reusing it would start this experiment from learned weights,
# so the results would not be comparable.
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     name="embedding_3")

inputs = layers.Input(shape=(1,),dtype=tf.string,name="input_layer")
x = text_vectorizer(inputs)      # text -> token ids
x = model_3_embedding(x)         # token ids -> embedding vectors
# NOTE(review): passing a custom Python callable as the activation likely prevents Keras
# from selecting its fused cuDNN GRU kernel; activation="tanh" is the built-in equivalent - confirm.
x = layers.GRU(units=64,activation=tanh_func)(x)
outputs = layers.Dense(1,activation="sigmoid",name="output_layer")(x)
# Named like the other experiments (model_1_dense, model_2_LSTM) so summaries and logs line up
model_3 = tf.keras.Model(inputs,outputs,name="model_3_GRU")


In [None]:
model_3.summary()

In [None]:
# compile the model
model_3.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [None]:
model_3_history = model_3.fit(train_sentences,
                             train_labels,
                             epochs=5,
                             validation_data=(val_sentences,val_labels),
                             callbacks=[create_tensorboard_callback(SAVE_DIR,"model_3_GRU")])

In [None]:
plot_loss_curves(model_3_history)

In [None]:
model_3_pred_probs = model_3.predict(val_sentences)

In [None]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

In [None]:
model_3_results = calculate_results(y_true=val_labels,y_preds=model_3_preds)
model_3_results

### Model 4: Bidirectional RNN
Normal RNNs go from left to right (just like you'd read an English sentence); however, a bidirectional RNN goes from right to left as well as left to right.


In [None]:
# Build a bidirectional RNN in tensorflow
import tensorflow as tf
from tensorflow.keras import layers

# Give model 4 its own embedding layer. The shared `embedding` instance has already been
# trained inside model_1; reusing it would start this experiment from learned weights,
# so the results would not be comparable.
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     name="embedding_4")

inputs = layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)                       # text -> token ids
x = model_4_embedding(x)                          # token ids -> embedding vectors
x = layers.Bidirectional(layers.LSTM(64))(x)      # read the sequence in both directions
outputs = layers.Dense(1,activation="sigmoid")(x)
# Named after its TensorBoard experiment, consistent with model_1_dense / model_2_LSTM.
# Intermediate shapes are shown by model_4.summary(), so the debug prints were dropped.
model_4 = tf.keras.Model(inputs,outputs,name="model_4_bidirectional")


In [None]:
model_4.summary()

In [None]:
model_4.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [None]:
model_4_history = model_4.fit(train_sentences,
                             train_labels,
                             epochs=5,
                             validation_data=(val_sentences,val_labels),
                             callbacks=[create_tensorboard_callback(SAVE_DIR,"model_4_bidirectional")])

In [None]:
plot_loss_curves(model_4_history)

In [None]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

In [None]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

In [None]:
model_4_results = calculate_results(y_true=val_labels,y_preds=model_4_preds)
model_4_results