<a href="https://colab.research.google.com/github/nicolasvazquez95/Aprendiendo_DeepLearning/blob/main/10_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get data - Tokenization and Embedding

In [1]:
# Imports
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
import helper_functions as helper

--2022-05-14 20:09:04--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.2’


2022-05-14 20:09:04 (113 MB/s) - ‘helper_functions.py.2’ saved [10246/10246]



In [2]:
# Get the text dataset
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
helper.unzip_data('nlp_getting_started.zip')

--2022-05-14 20:09:04--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.16.128, 172.217.15.112, 142.251.45.112, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.16.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.2’


2022-05-14 20:09:04 (153 MB/s) - ‘nlp_getting_started.zip.2’ saved [607343/607343]



In [3]:
# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Shuffle
train_df_shuffled = train_df.sample(frac=1,random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split
X = train_df_shuffled['text'].copy()
y = train_df_shuffled['target'].copy()

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [6]:
## Tokenization
from tensorflow.keras.layers import TextVectorization

# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [7]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(X_train)

In [8]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[252,   3, 244,   4,  13, 727,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [9]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['noahanyname', 'noah', 'no2', 'nnw', 'nno']


In [10]:
## Embedding Layer
embedding = keras.layers.Embedding(input_dim=max_vocab_length,
                                   output_dim=128,
                                   input_length=max_length)

In [11]:
import random
random_sentence = random.choice(X_train)
print(f'Original text:\n {random_sentence}\
\n\nEmbedded version:\n')
#Embedded version
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
 Investigators have said a Virgin Galactic spaceship crash was caused by structural failure after the co-pilot ... http://t.co/PnhPLJHo8E

Embedded version:



<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 4.62494530e-02, -1.62763968e-02,  3.88250984e-02, ...,
          4.19098176e-02, -4.12091501e-02, -1.08600967e-02],
        [ 1.04488954e-02, -4.42673825e-02,  1.40103735e-02, ...,
         -3.80145796e-02, -2.25065947e-02,  1.10257491e-02],
        [ 3.80491056e-02,  2.79198997e-02, -1.27763152e-02, ...,
          6.66715950e-03,  2.51614787e-02,  8.92176479e-03],
        ...,
        [ 2.28354968e-02, -3.70135680e-02, -2.84588225e-02, ...,
         -2.48161200e-02,  2.87180431e-02,  1.14786252e-02],
        [-1.97212826e-02,  4.17185090e-02,  4.91651334e-02, ...,
          2.55341791e-02,  4.24774326e-02, -3.07930354e-02],
        [-4.41196337e-02, -4.81715202e-02,  1.10976398e-05, ...,
          1.91679262e-02,  3.25537585e-02, -3.65661979e-02]]],
      dtype=float32)>

# Model 0 : Naive Bayes (Scikit)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf',MultinomialNB())])
model_0.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [13]:
model_0.score(X_test,y_test)

0.8003502626970228

In [14]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [15]:
# Get baseline results
y_pred_0 = model_0.predict(X_test)

baseline_results = calculate_results(y_true=y_test,
                                     y_pred=y_pred_0)
baseline_results

{'accuracy': 80.03502626970229,
 'f1': 0.7937090801534213,
 'precision': 0.8170270320769228,
 'recall': 0.8003502626970228}

# Model 1 : Simple Dense model

In [16]:
from helper_functions import create_tensorboard_callback
SAVE_DIR = 'model_logs'

# Build model Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,),dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation='sigmoid')(x)

model_1 = keras.Model(inputs,outputs)

model_1.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

In [17]:
model_1.fit(X_train,y_train,
            epochs=5,
            validation_data=(X_test,y_test),
            callbacks=[create_tensorboard_callback(SAVE_DIR,'model_1_dense')]
            )

Saving TensorBoard log files to: model_logs/model_1_dense/20220514-200907
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fac0e6d8790>

In [18]:
# Get results
y_pred_1 = tf.squeeze(tf.round(model_1.predict(X_test)))
calculate_results(y_test,y_pred_1)

{'accuracy': 78.6339754816112,
 'f1': 0.7837160221411318,
 'precision': 0.788345540566855,
 'recall': 0.7863397548161121}

# Visualize learned embeddings

In [19]:
# Get the vocabulary from the text vectorization layer
words = text_vectorizer.get_vocabulary()

In [20]:
model_1.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-train

The weight matrix of the embedding layer contains the numerical representations of each token in our training data, which have been learned for 5 epochs.

In [21]:
embedding_weights = model_1.get_layer('embedding').get_weights()[0]

In [22]:
# Create embedding files for visualization
import io

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embedding_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

try:
  from google.colab import files
  files.download('vectors.tsv')
  files.download('metadata.tsv')
except Exception:
  pass


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Model 2 : LSTM

In [32]:
# Create a LSTM model
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,),dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64,return_sequences=True)(x) # stacking
x = layers.LSTM(64)(x)
#x = layers.Dense(64,activation='relu')(x)
outputs = layers.Dense(1,activation='sigmoid')(x)

model_2 = tf.keras.Model(inputs,outputs,name='model_2_LSTM')

In [33]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_4 (LSTM)               (None, 15, 64)            49408     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                      

In [34]:
model_2.compile(loss='binary_crossentropy',optimizer='Adam',
                metrics=['accuracy'])

In [35]:
# Fit model
model_2_history = model_2.fit(X_train,
                              y_train,
                              epochs=5,
                              validation_data=(X_test,y_test),
                              callbacks=create_tensorboard_callback(SAVE_DIR,
                                                                    'model_2_LSTM'))

Saving TensorBoard log files to: model_logs/model_2_LSTM/20220514-203302
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
model_2_preds = tf.squeeze(tf.round(model_2.predict(X_test)))

calculate_results(y_test,model_2_preds)

{'accuracy': 77.75831873905429,
 'f1': 0.7756058685854841,
 'precision': 0.7780506327551178,
 'recall': 0.7775831873905429}

# Model 3: GRU

In [45]:
inputs = layers.Input(shape=(1,),dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64,return_sequences=True)(x)
x = layers.GRU(64)(x)
x = layers.Dense(64,activation='relu')(x)
outputs = layers.Dense(1,activation='sigmoid')(x)

model_3 = tf.keras.Model(inputs,outputs,name='model_3_GRU')

In [41]:
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_4 (GRU)                 (None, 15, 64)            37248     
                                                                 
 gru_5 (GRU)                 (None, 64)                24960     
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                       

In [46]:
model_3.compile(loss='binary_crossentropy',optimizer='Adam',
                metrics=['accuracy'])

In [47]:
model_3_history = model_3.fit(X_train,y_train,
            epochs=5,
            validation_data=(X_test,y_test),
            callbacks=create_tensorboard_callback(SAVE_DIR,'model_3_GRU'))

Saving TensorBoard log files to: model_logs/model_3_GRU/20220514-213629
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
model_3_preds = tf.squeeze(tf.round(model_3.predict(X_test)))

calculate_results(y_test,model_3_preds)

{'accuracy': 76.18213660245185,
 'f1': 0.7615098672552248,
 'precision': 0.7613679324173941,
 'recall': 0.7618213660245184}

# Model 4:

In [50]:
inputs = layers.Input(shape=(1,),dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64,return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1,activation='sigmoid')(x)

model_4 = tf.keras.Model(inputs,outputs,name='model_4_bidirectional')

In [51]:
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 15, 128)          98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                             

In [52]:
model_4.compile(loss='binary_crossentropy',optimizer='Adam',
                metrics=['accuracy'])

In [54]:
model_4_history = model_4.fit(X_train,y_train,
                              epochs=5,
                              validation_data=(X_test,y_test),
                              callbacks=create_tensorboard_callback(SAVE_DIR,'model_4_bidirectional'))

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20220514-220811
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
model_4_preds = tf.squeeze(tf.round(model_4.predict(X_test)))

calculate_results(y_test,model_4_preds)

{'accuracy': 76.18213660245185,
 'f1': 0.7616724821568109,
 'precision': 0.7615659174014223,
 'recall': 0.7618213660245184}