# Lab | Sequence Modeling with LSTM

In [3]:
import json
import tensorflow as tf
import csv
import random
import numpy as np

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers


# --- Embedding / sequence geometry ---
embedding_dim = 100  # width of each word vector (second axis of the embeddings matrix)
max_length = 16      # every tokenized sentence is padded/truncated to this many tokens

# --- Padding behaviour handed to pad_sequences ---
trunc_type = padding_type = 'post'  # both truncate and pad at the END of a sequence

# --- Vocabulary ---
# Placeholder token for out-of-vocabulary words (OOV = out of vocabulary).
oov_tok = "<OOV>"

# --- Dataset sizing ---
# Number of sentences drawn from the corpus. Small values (e.g. 16000) are fine
# for experiments; train on at least 160000 to see the best effects.
training_size = 16000
test_portion = .1  # fraction of the drawn sentences held out for validation

# Filled by the CSV-loading cell with [text, label] pairs.
corpus = []


In [4]:
# Colab-only helper module; this cell is not expected to work outside Google Colab.
from google.colab import files

# Trigger the file upload dialog and keep whatever it returns in `uploaded`.
# NOTE(review): presumably a mapping of file name -> file contents — confirm
# against the Colab documentation before relying on its structure.
uploaded = files.upload()

Saving training.1600000.processed.noemoticon.csv to training.1600000.processed.noemoticon.csv


In [5]:
import os
# List /content so the cell output shows whether the uploaded CSV is present.
os.listdir('/content')

['.config', 'training.1600000.processed.noemoticon.csv', 'sample_data']

In [6]:
# The uploaded file name (it should match exactly as uploaded)
filename = "training.1600000.processed.noemoticon.csv"

# Start from an empty list so re-running this cell never appends duplicates.
corpus = []

# newline='' hands line-ending handling to the csv module, so a quoted field
# that contains a line break is still parsed as one field.
with open(filename, encoding='latin-1', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        if not row:
            # csv.reader yields an empty list for a blank line; there is
            # nothing to index into, so skip it instead of raising IndexError.
            continue
        # row[0] contains the label (0 or 4), row[5] contains the text.
        # '0' maps to class 0; any other value maps to class 1.
        label = 0 if row[0] == '0' else 1
        corpus.append([row[5], label])

# Derived from the list itself so the count can never drift from the data.
num_sentences = len(corpus)

print(f"Number of sentences: {num_sentences}")
print(f"First 5 items in the corpus: {corpus[:5]}")

Number of sentences: 1600000
First 5 items in the corpus: [["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", 0], ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0], ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds', 0], ['my whole body feels itchy and like its on fire ', 0], ["@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. ", 0]]


In [7]:
# !wget --no-check-certificate \
#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv \
#     -O /tmp/training_cleaned.csv

# num_sentences = 0

# #with open("/tmp/training_cleaned.csv") as csvfile:

# with open("../Downloads/training.1600000.processed.noemoticon.csv") as csvfile:
#     reader = csv.reader(csvfile, delimiter=',')
#     for row in reader:
#       # Your Code here. Create list items where the first item is the text, found in row[5], and the second is the label. Note that the label is a '0' or a '4' in the text. When it's the former, make
#       # your label to be 0, otherwise 1. Keep a count of the number of sentences in num_sentences
#         list_item=[]
#         # YOUR CODE HERE
#         num_sentences = num_sentences + 1
#         corpus.append(list_item)


In [8]:
# Sanity check of the load: the running count, the corpus length, and the
# second [text, label] pair, each printed on its own line.
for value in (num_sentences, len(corpus), corpus[1]):
    print(value)

# Expected Output:
# 1600000
# 1600000
# ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]

1600000
1600000
["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]


In [9]:
# Draw the working subset with a dedicated, seeded generator instead of
# shuffling `corpus` in place: the draw is reproducible, re-running this cell
# gives the same subset, and the loaded corpus is left untouched.
SAMPLE_SEED = 42
sample_size = min(training_size, len(corpus))  # never ask for more rows than exist
sampled_rows = random.Random(SAMPLE_SEED).sample(corpus, sample_size)

# Separate sentences and labels; each corpus row is a [text, label] pair.
sentences = [text for text, _ in sampled_rows]
labels = [label for _, label in sampled_rows]

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Split data into training and testing sets: the first `split` rows are held
# out for testing, the remainder is used for training.
split = int(test_portion * sample_size)

test_sequences = padded[:split]
training_sequences = padded[split:]
test_labels = labels[:split]
training_labels = labels[split:]


In [10]:
# Show the vocabulary size and the index assigned to the word 'i'.
for value in (vocab_size, word_index['i']):
    print(value)
# Expected Output
# 138858
# 1
# (reference values; the vocabulary size depends on which sentences were drawn,
# so it will be different for a smaller training_size)

26579
1


In [11]:
# # Note this is the 100 dimension version of GloVe from Stanford
# # I unzipped and hosted it on my site to make this notebook easier
# !wget --no-check-certificate \
#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
#     -O /tmp/glove.6B.100d.txt
# embeddings_index = {};
# with open('/tmp/glove.6B.100d.txt') as f:
#     for line in f:
#         values = line.split();
#         word = values[0];
#         coefs = np.asarray(values[1:], dtype='float32');
#         embeddings_index[word] = coefs;

# embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word);
#     if embedding_vector is not None:
#         embeddings_matrix[i] = embedding_vector;

In [None]:
embeddings_file = '/tmp/glove.6B.100d.txt'  # Replace with your actual path
embeddings_index = {}

# Each line of the GloVe text file is split on whitespace: the first field is
# the word, the remaining fields are parsed as its float32 vector.
with open(embeddings_file, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# One row per tokenizer index plus one extra row (vocab_size + 1 in total).
# Words with no pretrained vector keep an all-zero row.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for token, row_idx in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embeddings_matrix[row_idx] = vector


In [None]:
# Number of rows in the embedding matrix (it was allocated with vocab_size + 1 rows).
print(embeddings_matrix.shape[0])
# Expected Output
# 138859

In [None]:
# Convert all four splits to NumPy arrays before they are passed to model.fit.
# The label splits are plain Python list slices at this point.
training_sequences, training_labels = np.array(training_sequences), np.array(training_labels)
test_sequences, test_labels = np.array(test_sequences), np.array(test_labels)

In [None]:
# Frozen pretrained embeddings -> Conv1D feature extractor -> two stacked
# LSTMs -> dense sigmoid head for binary classification.
model = tf.keras.Sequential([
    # NOTE(review): `input_length` is deprecated in newer Keras releases —
    # confirm against the installed version.
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    # YOUR CODE HERE - experiment with combining different types, such as convolutions and LSTMs

    Conv1D(filters=64, kernel_size=5, activation='relu'),
    # Referenced through tf.keras.layers because MaxPooling1D is not among the
    # names imported at the top of the notebook.
    tf.keras.layers.MaxPooling1D(pool_size=4),
    Dropout(0.3),

    # return_sequences=True keeps the per-timestep outputs so the next LSTM
    # receives the 3-D (batch, steps, features) input it requires.
    LSTM(units=64, return_sequences=True),
    Dropout(0.2),

    LSTM(units=32),
    Dropout(0.2),

    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.MeanAbsoluteError()])
model.summary()

num_epochs = 50
# The held-out test split doubles as validation data, evaluated after every epoch.
history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=2)

print("Training Complete")


In [None]:
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve the per-epoch results on the training and test
# data sets recorded by model.fit
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # one x position per completed epoch, starting at 0

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# Each chart gets its own explicit figure/axes, so no figure is created
# implicitly and no empty trailing figure is left behind.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'r')
ax_acc.plot(epochs, val_acc, 'b')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel("Epochs")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend(["Accuracy", "Validation Accuracy"])

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'r')
ax_loss.plot(epochs, val_loss, 'b')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Loss")
ax_loss.legend(["Loss", "Validation Loss"])

plt.show()


# Expected Output
# A chart where the validation loss does not increase sharply!