<a href="https://colab.research.google.com/github/nicolasvazquez95/Aprendiendo_DeepLearning/blob/main/10_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get data - Tokenization and Embedding

In [None]:
# Imports
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Helper functions
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
import helper_functions as helper

--2022-02-05 12:58:02--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2022-02-05 12:58:02 (112 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [None]:
# Get the text dataset
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
helper.unzip_data('nlp_getting_started.zip')

--2022-02-05 12:58:03--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.2.112, 142.251.16.128, 172.217.0.48, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.2.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2022-02-05 12:58:03 (164 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
# Load data
# Read both CSV files from the working directory into DataFrames
# (presumably extracted from the archive by the previous cell - verify).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Shuffle
# Sampling 100% of the rows returns every row in a new order; the fixed seed
# makes that order reproducible. Rows keep their original index labels.
train_df_shuffled = train_df.sample(
    frac=1,
    random_state=42,
)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split

# Independent copies of the text column (features) and the target column (labels)
X = train_df_shuffled.loc[:, 'text'].copy()
y = train_df_shuffled.loc[:, 'target'].copy()

# Hold out 15% of the rows; the "*_test" names hold the validation split
# that later cells pass as validation_data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
## Tokenization
from tensorflow.keras.layers import TextVectorization

# Vectorizer settings
max_vocab_length = 10000  # cap on the number of tokens kept in the vocabulary
max_length = 15  # every text is output as exactly this many token ids

# Turns raw strings into fixed-length sequences of integer token ids
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [None]:
# Fit the text vectorizer to the training text
# Only the training split is passed in, so the held-out texts do not
# contribute to the vocabulary.
text_vectorizer.adapt(X_train)

In [None]:
# Create sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
# The vectorizer is called on a list holding the single sentence (a batch of one);
# as the last expression of the cell, the resulting tensor is displayed.
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[252,   3, 244,   4,  13, 727,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [None]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
# Head of the list = most common tokens (notice the [UNK] token for "unknown" words),
# tail of the list = least common tokens.
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]
# One print call, one line per fact
print(
    f"Number of words in vocab: {len(words_in_vocab)}",
    f"Top 5 most common words: {top_5_words}",
    f"Bottom 5 least common words: {bottom_5_words}",
    sep="\n",
)

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['noahanyname', 'noah', 'no2', 'nnw', 'nno']


In [None]:
## Embedding Layer
# input_dim reuses the vectorizer's vocabulary cap and input_length reuses its
# output sequence length; each token id is mapped to a 128-dimensional vector.
# NOTE(review): `input_length` is reportedly deprecated in newer Keras releases -
# confirm before upgrading TensorFlow/Keras.
embedding = keras.layers.Embedding(input_dim=max_vocab_length,
                                   output_dim=128,
                                   input_length=max_length)

In [None]:
import random
# random.choice() picks a position in 0..len-1 and indexes its argument with it.
# X_train is a pandas Series that still carries the original integer labels of the
# shuffled/split DataFrame, so integer indexing on it is a label lookup: it raises
# KeyError whenever the drawn number is a label that went to the other split, and
# it can never reach labels >= len(X_train). Drawing from a plain list of the
# values is purely positional and uniform over all training texts.
random_sentence = random.choice(X_train.tolist())
print(f'Original text:\n {random_sentence}\n\nEmbedded version:\n')
# Embedded version: tokenize the sentence (as a batch of one), then look up the
# embedding vector of every token id.
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
 @smallforestelf Umm because a gun stopped the gunman with who was carrying a bomb!

Embedded version:



<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01632888,  0.01623065, -0.02904266, ..., -0.04150053,
         -0.03810171,  0.01941884],
        [-0.02401965, -0.04933633, -0.02622827, ..., -0.03948786,
          0.04851295, -0.03957596],
        [ 0.03095228, -0.02944252, -0.02811295, ..., -0.03910526,
          0.02980175, -0.02342455],
        ...,
        [ 0.03530388,  0.04819497, -0.0383953 , ...,  0.01326437,
          0.0467864 , -0.04702656],
        [-0.04748287,  0.01862255,  0.02226884, ...,  0.04947635,
          0.00234459,  0.04424647],
        [-0.03722589,  0.02256114,  0.04932388, ...,  0.00825783,
         -0.014006  ,  0.03539244]]], dtype=float32)>

# Model 0 : Naive Bayes (scikit-learn)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier
model_0 = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)
# Fit on the training split; the fitted pipeline is displayed as the cell output
model_0.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [None]:
# Score the fitted baseline pipeline on the held-out split
model_0.score(X_test,y_test)

0.8003502626970228

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarise classification predictions as accuracy, precision, recall and f1.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Accuracy is scaled by 100 (a percentage); the other three values are returned
  exactly as computed with the "weighted" average.
  """
  # The fourth element of the returned tuple is not needed here
  precision, recall, f1, _ = precision_recall_fscore_support(
      y_true, y_pred, average="weighted"
  )
  return {
      "accuracy": accuracy_score(y_true, y_pred) * 100,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [None]:
# Get baseline results
# Predict labels for the held-out texts with the fitted baseline pipeline
y_pred_0 = model_0.predict(X_test)

# Compare the predictions against the held-out labels
baseline_results = calculate_results(y_true=y_test, y_pred=y_pred_0)
baseline_results

{'accuracy': 80.03502626970229,
 'f1': 0.7937090801534213,
 'precision': 0.8170270320769228,
 'recall': 0.8003502626970228}

# Model 1 : Simple Dense model

In [None]:
from helper_functions import create_tensorboard_callback
SAVE_DIR = 'model_logs'  # directory passed to the TensorBoard callback helper

# Build model Functional API
from tensorflow.keras import layers

# Pipeline: raw string -> token ids -> embeddings -> average over the sequence -> sigmoid
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)  # reuses the embedding layer instance created in an earlier cell
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_1 = keras.Model(inputs=inputs, outputs=outputs)

# Single sigmoid output, hence binary cross-entropy as the loss
model_1.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, validating on the held-out split after each epoch and
# attaching the TensorBoard callback built from SAVE_DIR and the experiment name.
model_1.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    callbacks=[create_tensorboard_callback(SAVE_DIR, 'model_1_dense')],
)

Saving TensorBoard log files to: model_logs/model_1_dense/20220205-132750
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6802d0b550>

# Visualize learned embeddings