<a href="https://colab.research.google.com/github/mahdiSheykhiGithub/IMDB-Reviews-NLP/blob/main/IMDB_CNN_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Load data

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Reviews/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [3]:
def text_cleaner(text):
  # HTML to text format
  soup = BeautifulSoup(text)
  text = soup.get_text()
  return text

In [4]:
data['reveiw'] = data['review'].apply(text_cleaner)

  soup = BeautifulSoup(text)


In [5]:
enc = LabelEncoder()
y = enc.fit_transform(data['sentiment'])

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['reveiw'])

In [7]:
vocab_len = len(tokenizer.word_index) + 1
vocab_len

126507

In [8]:
encoded = tokenizer.texts_to_sequences(data['review'])

In [9]:
len(encoded[1])

164

In [10]:
list_len = []
for seq in encoded:
  list_len.append(len(seq))

max_len = max(list_len)

In [11]:
max_len

2493

In [12]:
X = pad_sequences(encoded, maxlen=max_len, padding='post')

In [13]:
X.shape

(50000, 2493)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modeling


In [51]:
reg = tf.keras.regularizers.l2(0.003)

model = Sequential()
model.add(Embedding(input_dim=vocab_len, output_dim=100, input_length=max_len, embeddings_regularizer=reg))
model.add(Dropout(0.5))
model.add(Conv1D(filters=16, kernel_size=4, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPool1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [52]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 2493, 100)         12650700  
                                                                 
 dropout_34 (Dropout)        (None, 2493, 100)         0         
                                                                 
 conv1d_10 (Conv1D)          (None, 2493, 16)          6416      
                                                                 
 dropout_35 (Dropout)        (None, 2493, 16)          0         
                                                                 
 max_pooling1d_10 (MaxPooli  (None, 1246, 16)          0         
 ng1D)                                                           
                                                                 
 flatten_9 (Flatten)         (None, 19936)             0         
                                                     

In [53]:
opt = tf.optimizers.Adam(learning_rate=0.001)
loss = tf.losses.BinaryCrossentropy()
metric = tf.metrics.BinaryAccuracy()
model.compile(optimizer=opt, loss=loss, metrics=metric)

In [55]:
model.fit(X_train, y_train, batch_size=50, epochs=10, validation_split=0.2, verbose=2)

Epoch 1/10
640/640 - 12s - loss: 0.6149 - binary_accuracy: 0.8933 - val_loss: 0.6307 - val_binary_accuracy: 0.8935 - 12s/epoch - 19ms/step
Epoch 2/10
640/640 - 12s - loss: 0.6070 - binary_accuracy: 0.8945 - val_loss: 0.6088 - val_binary_accuracy: 0.8965 - 12s/epoch - 18ms/step
Epoch 3/10
640/640 - 12s - loss: 0.6039 - binary_accuracy: 0.8952 - val_loss: 0.5923 - val_binary_accuracy: 0.8970 - 12s/epoch - 19ms/step
Epoch 4/10
640/640 - 12s - loss: 0.6036 - binary_accuracy: 0.8979 - val_loss: 0.5857 - val_binary_accuracy: 0.9004 - 12s/epoch - 19ms/step
Epoch 5/10
640/640 - 12s - loss: 0.5981 - binary_accuracy: 0.8963 - val_loss: 0.5919 - val_binary_accuracy: 0.9016 - 12s/epoch - 19ms/step
Epoch 6/10
640/640 - 12s - loss: 0.5951 - binary_accuracy: 0.8968 - val_loss: 0.6120 - val_binary_accuracy: 0.8972 - 12s/epoch - 18ms/step
Epoch 7/10
640/640 - 12s - loss: 0.5971 - binary_accuracy: 0.8978 - val_loss: 0.5851 - val_binary_accuracy: 0.9031 - 12s/epoch - 18ms/step
Epoch 8/10
640/640 - 12s - 

<keras.src.callbacks.History at 0x7a56c760f2e0>

In [56]:
model.evaluate(X_test, y_test)



[0.5877217054367065, 0.9043999910354614]