<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/c3_w2_embeddings_hyperparameters_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embeddings hyperparameters tuning

In [1]:
# Download the sarcasm headlines dataset; wget saves it as `sarcasm.json`
# in the current working directory, where a later cell opens it.
!wget \
  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json

--2020-11-15 07:14:39--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.80, 172.217.7.144, 172.217.12.240, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2020-11-15 07:14:39 (107 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [13]:
import json
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# --- Tunable hyperparameters for the notebook ---

# Dataset split: index at which the headline list is cut into train / test.
training_size = 20000

# Tokenizer settings.
vocab_size = 10000   # passed as `num_words` to the Tokenizer and as the Embedding input size
oov_tok = '<oov>'    # passed as the Tokenizer's `oov_token`

# Sequence shaping (used by pad_sequences).
# NOTE: `max_lenght` is misspelled, but other cells reference this exact name,
# so it is kept as-is here.
max_lenght = 100        # `maxlen` for padding and `input_length` of the Embedding
padding_type = 'post'   # `padding` argument
trunc_type = 'post'     # `truncating` argument

# Model settings.
embedding_dim = 16   # output dimension of the Embedding layer

In [4]:
# Parse the downloaded dataset. The encoding is given explicitly so the file
# is decoded the same way regardless of the platform's locale default.
with open('sarcasm.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

# Each record supplies a headline (model input) and an `is_sarcastic` flag
# (target); the two lists stay index-aligned.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

# Displayed as the cell output: both lists should have the same length.
len(sentences), len(labels)

(26709, 26709)

In [5]:
# Sequential hold-out split: the first `training_size` examples are used for
# training and everything after that index for testing. No shuffling is done.
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

# Cell output: sizes of the four splits.
(
    len(training_sentences),
    len(testing_sentences),
    len(training_labels),
    len(testing_labels),
)

(20000, 6709, 20000, 6709)

In [9]:
# Fit the tokenizer on the training headlines only, so the test split does
# not contribute to the vocabulary.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding/truncation settings shared by the training and testing sequences.
pad_kwargs = dict(
    maxlen=max_lenght,
    padding=padding_type,
    truncating=trunc_type,
)

# Convert each split to integer sequences, then pad/truncate to `max_lenght`.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_kwargs)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

In [12]:
# Convert model inputs and labels to NumPy arrays.
training_padded = np.array(training_padded)
# Fix: the original assigned the converted labels to the misspelled name
# `trainin_labels`, leaving `training_labels` as a plain Python list while
# the other three variables became arrays.
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

# Backward-compatible alias in case a later cell refers to the misspelled name.
trainin_labels = training_labels

In [14]:
# Binary classifier over padded token-id sequences.
classifier_layers = [
  # Look up an `embedding_dim`-sized vector for each of the `max_lenght` token ids.
  layers.Embedding(vocab_size, embedding_dim, input_length=max_lenght),
  # Pool the per-token vectors into a single vector per headline.
  layers.GlobalAveragePooling1D(),
  layers.Dense(24, activation='relu'),
  # Single sigmoid unit to pair with the binary cross-entropy loss below.
  layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(classifier_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Print each layer's output shape and parameter count, plus the model totals.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________
