In [None]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tqdm import tqdm

#use for naive_bayes
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,SnowballStemmer
from nltk.tokenize import TweetTokenizer
import pdb
from nltk.corpus import twitter_samples
import numpy as np
import pandas as pd
import nltk
from os import getcwd

from tensorflow.keras import layers
from tensorflow.keras import losses

nltk.download('stopwords')

#use for LSTM and WordEmbedding Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from  nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Download the IMDB review archive and unpack it next to the notebook
# (cache_dir='.' with an empty cache_subdir).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')
# Raises immediately if the archive did not produce the expected train folder.
os.listdir(train_dir)

# Drop the 'unsup' folder; presumably so directory-based loading only sees
# the labelled class folders. The guard keeps the cell re-runnable: a bare
# rmtree raises FileNotFoundError once the folder has already been removed.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

#NEW DATA

In [None]:
batch_size = 32

# Training portion: 80% of aclImdb/train (fixed seed so the split is reproducible).
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=42
)

# Validation portion: the complementary 20% of aclImdb/train. It must be cut
# from the same directory with the same seed and split as the training subset;
# taking it from aclImdb/test would make validation a subset of the test data.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=42
)

# Full held-out test directory.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
def custom_standardization(input_data):
  """Lowercase the text, replace '<br />' tags with a space, and strip punctuation.

  Args:
    input_data: string tensor (or value accepted by tf.strings ops) to clean.

  Returns:
    The cleaned string tensor.
  """
  # Character class matching every character in string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text

def _dataset_to_lists(dataset):
    """Flatten a batched (text, label) dataset into Python lists.

    Each text batch is cleaned with custom_standardization in a single call
    (instead of one eager op per sample) and decoded from UTF-8 bytes.

    Returns:
        (texts, labels): list of str and list of int, in iteration order.
    """
    texts, labels = [], []
    for text_batch, label_batch in dataset:
        cleaned = custom_standardization(text_batch).numpy()
        texts.extend(raw.decode("utf-8") for raw in cleaned)
        labels.extend(int(label) for label in label_batch.numpy())
    return texts, labels


# Pool the training subset and the test directory; the pooled data is
# re-split by position in a later cell.
x = []
y = []
for split_ds in (raw_train_ds, raw_test_ds):
    split_texts, split_labels = _dataset_to_lists(split_ds)
    x.extend(split_texts)
    y.extend(split_labels)

In [None]:
# Positional 80/20 split of the pooled samples.
size_train = int(0.8 * len(x))
# Number of samples left for the test portion (was previously a copy of the
# 80% training size).
size_test = len(x) - size_train

# dtype=object keeps references to the Python strings. Without it NumPy builds
# a fixed-width unicode array padded to the longest review, which is very
# memory-hungry for long texts.
train_x = np.array(x[:size_train], dtype=object)
train_y = np.array(y[:size_train])

test_x = np.array(x[size_train:], dtype=object)
test_y = np.array(y[size_train:])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)

# Map every review to a sequence of vocabulary indices ...
train_sequences = tokenizer.texts_to_sequences(train_x)
test_sequences = tokenizer.texts_to_sequences(test_x)

# ... and pad/truncate each sequence to exactly 250 indices.
x_train = pad_sequences(train_sequences, maxlen=250)
x_test = pad_sequences(test_sequences, maxlen=250)

In [None]:
# Build the label arrays: fit the encoder on the training labels, then
# transform both splits and reshape each into a single column.
encoder = LabelEncoder()
encoder.fit(train_y)

y_train = encoder.transform(train_y).reshape(-1, 1)
y_test = encoder.transform(test_y).reshape(-1, 1)

In [None]:
def read_glove_vecs(glove_file):
    """Load a GloVe-style text file of word vectors.

    Each non-empty line is expected to be ``<word> <float> <float> ...``
    separated by whitespace.

    Args:
        glove_file: path to the vector file (read as UTF-8).

    Returns:
        words_to_index: dict mapping word -> 1-based index, in sorted word order.
        index_to_words: dict with the inverse mapping.
        word_to_vec_map: dict mapping word -> np.float64 vector.
    """
    word_to_vec_map = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank (e.g. trailing) lines carry no vector; skip them
                # instead of failing on fields[0].
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Index 0 is left unused so it can serve as the padding index.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

# Load the 50-dimensional GloVe vectors from Google Drive.
# NOTE(review): this path lives under /content/drive, so Drive must already be
# mounted (drive.mount) before this line runs on a fresh kernel.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/content/drive/MyDrive/Big_Data/custom_model/glove/glove.6B.50d.txt')

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
# NOTE(review): the GloVe file is read from /content/drive, so this mount has
# to run before that read when executing the notebook top to bottom.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def sentences_to_indices(X,word_to_index,max_len):
    """Convert sentences into a zero-padded matrix of word indices.

    Args:
        X: sequence of sentences. Each sentence is either a string (split on
            whitespace into words) or an already-tokenized iterable of words.
        word_to_index: dict mapping word -> integer index.
        max_len: number of columns; longer sentences are truncated.

    Returns:
        Array of shape (len(X), max_len). Known words are written left to
        right, unknown words are skipped, and unused positions stay 0.
    """
    m = len(X) #number training_example

    X_indices = np.zeros((m,max_len))
    for i in tqdm(range(m)):
        sentence = X[i]
        # Iterating a raw string would yield characters, not words.
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        j = 0
        for w in tokens:
            if j >= max_len:
                # Row is full: truncate instead of indexing past the last column.
                break
            if w in word_to_index:
                X_indices[i,j] = word_to_index[w]
                j = j + 1
    return X_indices

def pretrained_embedding_layer(word_to_vec_map,word_to_index,max_len=250):
    """Build a frozen Keras Embedding layer initialised from pretrained vectors.

    Args:
        word_to_vec_map: dict mapping word -> 1-D vector (all the same length).
        word_to_index: dict mapping word -> row index in the embedding matrix.
            This must be the same vocabulary that produced the model's integer
            inputs. Indices are expected to be 1-based and contiguous, with
            row 0 reserved for padding.
        max_len: input sequence length passed to the layer (default 250).

    Returns:
        A non-trainable Embedding layer whose weights are the pretrained
        vectors. Words without a pretrained vector keep an all-zero row.
    """
    any_word = next(iter(word_to_vec_map))
    emb_dim = word_to_vec_map[any_word].shape[0]
    vocab_size = len(word_to_index) + 1

    #Initialize the embedding matrix
    emb_matrix = np.zeros((vocab_size, emb_dim))

    for word, idx in word_to_index.items():
        vector = word_to_vec_map.get(word)
        if vector is not None:
            # Words missing from the pretrained map stay as zero vectors
            # instead of raising KeyError.
            emb_matrix[idx, :] = vector

    embedding_layer = Embedding(vocab_size,emb_dim, input_length=max_len, trainable = False)
    embedding_layer.build((None,))

    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

# x_train / x_test hold indices produced by the Keras `tokenizer`, so the
# embedding rows must follow tokenizer.word_index. Indexing the matrix by the
# GloVe vocabulary order would look up vectors belonging to unrelated words.
embedding_layer = pretrained_embedding_layer(word_to_vec_map,tokenizer.word_index)

# Two stacked LSTM layers on top of the pretrained embedding, each followed by
# dropout, ending in a single sigmoid unit for binary sentiment.
lstm_stack = [
    tf.keras.Input((250,), dtype='int32'),
    embedding_layer,
    # First LSTM emits the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    # Second LSTM keeps only its final state.
    tf.keras.layers.LSTM(128, return_sequences=False),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model_LSTM = tf.keras.Sequential(lstm_stack)

model_LSTM.summary()
model_LSTM.compile(
    loss='binary_crossentropy',
    optimizer="adam",
    metrics=['accuracy'],
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 50)           20000050  
                                                                 
 lstm_2 (LSTM)               (None, 250, 128)          91648     
                                                                 
 dropout_2 (Dropout)         (None, 250, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 20,223,411
Trainable params: 223,361
Non

In [None]:
# Train for 30 epochs, holding out 10% of the training arrays for validation.
fit_options = dict(
    epochs=30,
    validation_split=0.1,
    verbose=1,
    shuffle=True,
)
history = model_LSTM.fit(x_train, y_train, **fit_options)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f9ad09e2c50>

In [None]:
# Score the trained model on the held-out arrays and report both metrics.
test_metrics = model_LSTM.evaluate(x_test, y_test)
loss, accuracy = test_metrics

for metric_label, metric_value in (("Loss: ", loss), ("Accuracy: ", accuracy)):
    print(metric_label, metric_value)

Loss:  1.1373573541641235
Accuracy:  0.7448889017105103


In [None]:
# `history` is the History object returned by model_LSTM.fit; its `.history`
# attribute is the dict of per-epoch metrics. (model_LSTM.history is a History
# object, not a dict, so subscripting it raises TypeError.)
history_dict = history.history

import matplotlib.pyplot as plt

# The model was compiled with metrics=['accuracy'], so the recorded keys are
# 'accuracy' / 'val_accuracy' rather than 'binary_accuracy'.
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

TypeError: ignored

In [None]:
# Show the run parameters (epochs, steps, verbose) stored on the model's
# current History object. The original line ended in a dangling attribute
# access, which is a syntax error.
model_LSTM.history.params

{'epochs': 1, 'steps': 282, 'verbose': 1}