<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deeplearning.ai/tf/c3_w3_nlp_overfit_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget --no-check-certificate --quiet\
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv

!wget --no-check-certificate --quiet\
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt

In [None]:
import json
import csv
import random
import numpy as np
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

In [None]:
#@title Enable TPU
use_tpu = False #@param ["False", "True"] {type:"raw"}

# When a TPU is requested, resolve the TPU cluster, connect to it, initialise
# the TPU system and create `tpu_strategy`, which later cells use to build the
# model inside the strategy scope.
if use_tpu:
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tpu_spec = tpu.cluster_spec().as_dict()['worker']
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("Running on TPU:", tpu_spec)
  except ValueError as err:
    print("ERROR: Not connected to a TPU.")
    # Raise a regular exception (not a bare BaseException) so that generic
    # `except Exception` handlers and tooling see it, and chain the original
    # error so its traceback is not lost.
    raise RuntimeError("ERROR: Select use_tpu=False.") from err

In [None]:
# --- Text preprocessing -------------------------------------------------
max_length = 16                      # sequence length passed to pad_sequences
padding_type = trunc_type = "post"   # pad and truncate at the end of a sequence
oov_tok = "<OOV>"

# --- Embedding ----------------------------------------------------------
embedding_dim = 100                  # width of each row of the embedding matrix

# --- Dataset sizing -----------------------------------------------------
training_size = 160_000              # number of corpus rows sampled for the experiment
test_portion = 0.1                   # fraction used to compute the train/test split index

# Filled with [sentence, label] pairs by the loading cell.
corpus = []


In [None]:
# Parse the CSV into [sentence, label] pairs.
# `corpus` is rebuilt from scratch here so that re-running this cell cannot
# append the same rows a second time.
corpus = []

# The csv module expects files opened with newline=""; the encoding is made
# explicit so the result does not depend on the platform default.
with open("training_cleaned.csv", encoding="utf-8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    for row in reader:
        # Column 5 holds the text; column 0 holds the raw label, where "0"
        # maps to 0 and every other value maps to 1.
        corpus.append([row[5], 0 if row[0] == "0" else 1])

# One corpus entry is appended per CSV row.
num_sentences = len(corpus)

In [None]:
# Sanity-check the load: row count, corpus size and the second record,
# printed one per line.
print(num_sentences, len(corpus), corpus[1], sep="\n")

# Expected Output:
# 1600000
# 1600000
# ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]

1600000
1600000
["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]


In [None]:
# Draw a random sample of `training_size` labelled sentences from the corpus.
# The shuffle is in place and unseeded, so the sample differs between runs.
random.shuffle(corpus)

sentences = []
labels = []
for sentence, label in corpus[:training_size]:
    sentences.append(sentence)
    labels.append(label)

# Build the vocabulary from the sampled sentences and turn each sentence into
# a fixed-length sequence of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)

# `test_portion` is the held-out share of the sample: the first `split` rows
# form the test set and the (larger) remainder is used for training.
split = int(test_portion * len(sentences))

test_sequences = np.array(padded[:split])
test_labels = np.array(labels[:split])
training_sequences = np.array(padded[split:])
training_labels = np.array(labels[split:])


In [None]:
# Show the vocabulary size and the index assigned to the word "i".
# The exact vocabulary size depends on which rows the random shuffle selected,
# so it may not match the reference value below.
print(vocab_size, word_index['i'], sep="\n")
# Expected Output
# 138858
# 1

138274
1


In [None]:
# Load the pre-trained GloVe vectors. Each line is a word followed by its
# whitespace-separated coefficients. The file is read as UTF-8 explicitly so
# that non-ASCII tokens decode the same way on every platform.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf-8") as f:
    for line in f:
        word, *coefs = line.split()
        embeddings_index[word] = np.asarray(coefs, dtype="float32")

# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. Words without a GloVe entry keep an all-zero row. The "+ 1" leaves
# room for index 0, which the tokenizer does not assign to any word.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [None]:
# Number of rows in the embedding matrix (vocab_size + 1).
print(embeddings_matrix.shape[0])
# Expected Output
# 138859

138275


In [None]:
def create_model():
    """Build the (uncompiled) sentiment classifier.

    The network is an Embedding layer initialised from ``embeddings_matrix``
    and left trainable, followed by dropout, a 1-D convolution, max pooling,
    an LSTM and a single sigmoid output unit.

    Returns:
        The assembled ``Sequential`` model.
    """
    # Embedding table: one row per tokenizer index, seeded with the GloVe matrix.
    embedding = layers.Embedding(
        vocab_size + 1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=True,
    )
    stack = [
        embedding,
        layers.Dropout(0.2),
        layers.Conv1D(64, 5, activation="relu"),
        layers.MaxPooling1D(pool_size=4),
        layers.LSTM(64),
        layers.Dense(1, activation="sigmoid"),
    ]
    return Sequential(stack)

# Build the model inside the TPU strategy scope when a TPU was requested;
# otherwise build it directly.
if not use_tpu:
    model = create_model()
else:
    with tpu_strategy.scope():
        model = create_model()

# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 100)           13827500  
_________________________________________________________________
dropout (Dropout)            (None, 16, 100)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 12, 64)            32064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 3, 64)             0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 13,892,653
Trainable params: 13,892,653
Non-trainable params: 0
____________________________________________

In [None]:
 # Display the sizes of the test inputs and test labels; the two should match.
 len(test_sequences), len(test_labels) 

(144000, 144000)

In [None]:
# Train for `num_epochs` epochs, evaluating on the test arrays after each one.
num_epochs = 50
history = model.fit(
    x=training_sequences,
    y=training_labels,
    validation_data=(test_sequences, test_labels),
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/50


KeyboardInterrupt: ignored

In [None]:
import matplotlib.image  as mpimg  # not used in this cell; kept in case other cells rely on it
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve the per-epoch metrics recorded by model.fit for
# the training data and the validation data
#-----------------------------------------------------------
acc=history.history['accuracy']
val_acc=history.history['val_accuracy']
loss=history.history['loss']
val_loss=history.history['val_loss']

epochs=range(len(acc)) # One x-axis position per completed epoch

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.figure()
plt.plot(epochs, acc, 'r')
plt.plot(epochs, val_acc, 'b')
plt.title('Training and validation accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Accuracy", "Validation Accuracy"])

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.figure()
plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.title('Training and validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Loss", "Validation Loss"])

# Render both figures; no extra empty figure is opened after the last plot.
plt.show()

In [None]:
# Report the NVIDIA GPU visible to this runtime (driver/CUDA versions, memory use, utilisation).
!nvidia-smi

Thu Dec 17 18:17:29 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    38W / 300W |   1123MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces