<a href="https://colab.research.google.com/github/mrhamedani/Deep-learning-projects-Tensorflow/blob/main/5_imdb_RNN(LSTM)_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN(LSTM)
Recurrent Neural Network or RNN are models that are mostly used for nlp (while CNNs were for computer vision)

RNNs have two basic problems: Vanishing/Exploding Gradient, which makes W either too small or too large.

Usually, this model is not used, and instead RNN is used to improve LSTM or Long Short-Term Memory.

Note: sigmoid or tanh is used instead of Relu in RNNs

This dataset contains people's opinions about movies, and our goal is to do binary classification and distinguish positive and negative opinions.

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz   # download dataset from link
!tar -xvzf aclImdb_v1.tar.gz # extract dataset

In [30]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
import pandas as pd
import re
import string

ModuleNotFoundError: No module named 'tensorflow.metrics'

In [3]:
batch_size = 32
seed = 42

In [7]:
raw_train_ds = tf.keras.utils.text_dataset_from_directory('aclImdb/train',batch_size=batch_size,validation_split=0.2,subset='training',seed=seed)
# Delete the unsup directory in aclImdb/train
raw_val_ds = tf.keras.utils.text_dataset_from_directory('aclImdb/train',batch_size=batch_size,validation_split=0.2,subset='validation',seed=seed)
raw_test_ds = tf.keras.utils.text_dataset_from_directory('aclImdb/test',batch_size=batch_size)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [16]:
print(raw_train_ds.class_names)  # To see the names of the classes
print(list(raw_train_ds.take(1)))  # for To see data  or one batch (include 32 elements)  # 0 is negative and 1 is positive

['neg', 'pos']
[(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Belmondo is a tough cop. He goes after a big-time drug dealer (played by Henry Silva, normally a great villain - see "Sharky\'s Machine"; but here he is clearly dubbed, and because of that he lacks his usual charisma). He goes to the scuzziest places of Paris and Marseilles, asks for some names, beats up some people, gets the names, goes to more scuzzy places, asks for more names, beats up more people, etc. The whole movie is punch after punch after punch. It seems that the people who made it had no other ambition than to create the French equivalent of "Dirty Harry". Belmondo, who was 50 here, does perform some good stunts at the beginning; apart from those, "Le Marginal" is a violent, episodic, trite, shallow and forgettable cop movie. (*1/2)',
       b'Wow. The only people reviewing this positively are the Carpenter apologists. I know a lot of those. The guys that\'ll watch John Carpenter squat on celluloid and 

In [15]:
raw_train_ds.element_spec # This shows that we have two tensors. The first is comments and the second is labels (+-)


(TensorSpec(shape=(None,), dtype=tf.string, name=None),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

### also shape is none --> for batch size
This always indicates the presence of bath size.
Important note: Because we have to send two values ​​to use embedding(batch & lengh ). We add a dim to the tensor using the 'expand_dim' method

# standardization text by regex

In [17]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data) # convert to lowercase
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ') # replace <br /> tag with space
  return tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation),'') # remove punctuations

In [21]:
max_features = 10000
max_len = 250  # max number of words in a comment  # padding to max_len
vectorization_layer = layers.TextVectorization(standardize=custom_standardization,max_tokens=max_features,output_mode='int',output_sequence_length=max_len)
vectorization_layer.adapt(raw_train_ds.map(lambda x, y: x)) # for remove labels and return only comments

## adding dim

In [22]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorization_layer(text), label

In [23]:
# text_batch, label_batch = next(iter(raw_train_ds))
# first_rewiew, first_label = text_batch[0], label_batch[0]

# print("Review", first_rewiew)
# print("Label", raw_train_ds.class_names[first_label])
# print("Vectorized review", vectorize_text(first_rewiew, first_label))

Review tf.Tensor(b"The idea ia a very short film with a lot of information. Interesting, entertaining and leaves the viewer wanting more. The producer has produced a short film of excellent quality that cannot be compared to any other short film that I have seen. I have rated this film at the highest possible rating. I also recommend that it is shown to office managers and business people in any establishment. What comes out of it is the fact that people with ideas are never listened to, their voice is never heard. It is a lesson to be learned by any office that wants to go forward. I hope that the produced will produce a second part to this 'idea'. I look forward to viewing the sequence. Once again congrats to Halaqah media in producing a film of excellence and quality with a lesson in mind.", shape=(), dtype=string)
Label pos
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   2,  314,    1,    4,   52,  350,   19,   16,    4,  171,    5,
        1574,  217,

In [24]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [26]:
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, 16), # 16 is the embedding dimension # +1 for unknown words
    layers.Dropout(0.2), # dropout to avoid overfitting ,randomly drop 20% of the input units
    layers.GlobalAveragePooling1D(), # average pooling to reduce the dimensionality of the output
    layers.Dropout(0.2),
    layers.Dense(1)])

In [31]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),optimizer='adam',metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])
model.fit(train_ds,validation_data=val_ds,epochs=10)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - binary_accuracy: 0.5822 - loss: 0.6829 - val_binary_accuracy: 0.7344 - val_loss: 0.6147
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - binary_accuracy: 0.7538 - loss: 0.5825 - val_binary_accuracy: 0.8116 - val_loss: 0.4982
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - binary_accuracy: 0.8163 - loss: 0.4707 - val_binary_accuracy: 0.8382 - val_loss: 0.4238
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - binary_accuracy: 0.8477 - loss: 0.3992 - val_binary_accuracy: 0.8556 - val_loss: 0.3776
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - binary_accuracy: 0.8648 - loss: 0.3529 - val_binary_accuracy: 0.8596 - val_loss: 0.3520
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8786

<keras.src.callbacks.history.History at 0x7fe8d2cf0110>

In [33]:
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - binary_accuracy: 0.8691 - loss: 0.3195
Loss:  0.31719687581062317
Accuracy:  0.8702800273895264


In [34]:
# If we want to add a new sentence to the model for prediction, it must be tokenized, so we will create a new model that will do this.
export_model = tf.keras.Sequential([
  vectorization_layer,
  model,
  layers.Activation('sigmoid')])

In [39]:
export_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy'])
# not need to fit the model


In [41]:
examples = tf.constant(["The movie was great!", "The movie was ok.", "The movie was terrible..."])[:, tf.newaxis]
export_model.predict(examples)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


array([[0.5727068 ],
       [0.38789216],
       [0.3000899 ]], dtype=float32)