<a href="https://colab.research.google.com/github/mahdiSheykhiGithub/IMDB-Reviews-NLP/blob/main/IMDB_CNN_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Load data

In [33]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Reviews/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [5]:
def text_cleaner(text):
  # HTML to text format
  soup = BeautifulSoup(text)
  text = soup.get_text()
  return text

In [6]:
data['reveiw'] = data['review'].apply(text_cleaner)

  soup = BeautifulSoup(text)


In [7]:
enc = LabelEncoder()
y = enc.fit_transform(data['sentiment'])

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['reveiw'])

In [9]:
vocab_len = len(tokenizer.word_index) + 1
vocab_len

126507

In [10]:
encoded = tokenizer.texts_to_sequences(data['review'])

In [11]:
len(encoded[1])

164

In [12]:
list_len = []
for seq in encoded:
  list_len.append(len(seq))

max_len = max(list_len)

In [13]:
max_len

2493

In [14]:
X = pad_sequences(encoded, maxlen=max_len, padding='post')

In [15]:
X.shape

(50000, 2493)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modeling


In [17]:
reg = tf.keras.regularizers.l2(0.003)

model = Sequential()
model.add(Embedding(input_dim=vocab_len, output_dim=100, input_length=max_len, embeddings_regularizer=reg))
model.add(Dropout(0.5))
model.add(Conv1D(filters=16, kernel_size=4, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPool1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2493, 100)         12650700  
                                                                 
 dropout (Dropout)           (None, 2493, 100)         0         
                                                                 
 conv1d (Conv1D)             (None, 2493, 16)          6416      
                                                                 
 dropout_1 (Dropout)         (None, 2493, 16)          0         
                                                                 
 max_pooling1d (MaxPooling1  (None, 1246, 16)          0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 19936)             0         
                                                        

In [19]:
opt = tf.optimizers.Adam(learning_rate=0.001)
loss = tf.losses.BinaryCrossentropy()
metric = tf.metrics.BinaryAccuracy()
model.compile(optimizer=opt, loss=loss, metrics=metric)

In [20]:
model.fit(X_train, y_train, batch_size=50, epochs=25, validation_split=0.2, verbose=2)

Epoch 1/25
640/640 - 18s - loss: 1.4023 - binary_accuracy: 0.7012 - val_loss: 0.6298 - val_binary_accuracy: 0.8804 - 18s/epoch - 29ms/step
Epoch 2/25
640/640 - 12s - loss: 0.6751 - binary_accuracy: 0.8678 - val_loss: 0.6253 - val_binary_accuracy: 0.8882 - 12s/epoch - 18ms/step
Epoch 3/25
640/640 - 12s - loss: 0.6515 - binary_accuracy: 0.8742 - val_loss: 0.6253 - val_binary_accuracy: 0.8905 - 12s/epoch - 18ms/step
Epoch 4/25
640/640 - 12s - loss: 0.6358 - binary_accuracy: 0.8815 - val_loss: 0.6274 - val_binary_accuracy: 0.8855 - 12s/epoch - 18ms/step
Epoch 5/25
640/640 - 12s - loss: 0.6309 - binary_accuracy: 0.8828 - val_loss: 0.5972 - val_binary_accuracy: 0.8974 - 12s/epoch - 19ms/step
Epoch 6/25
640/640 - 12s - loss: 0.6209 - binary_accuracy: 0.8863 - val_loss: 0.6192 - val_binary_accuracy: 0.8984 - 12s/epoch - 19ms/step
Epoch 7/25
640/640 - 12s - loss: 0.6275 - binary_accuracy: 0.8874 - val_loss: 0.6043 - val_binary_accuracy: 0.8945 - 12s/epoch - 18ms/step
Epoch 8/25
640/640 - 12s - 

<keras.src.callbacks.History at 0x78a32369d6f0>

# evaluation

In [21]:
model.evaluate(X_test, y_test)



[0.5874577164649963, 0.8939999938011169]

In [24]:
y_pred = model.predict(X_test)



In [34]:
y_pred = np.round(y_pred)

In [35]:
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.87      0.92      0.90      4951
    positive       0.92      0.86      0.89      5049

    accuracy                           0.89     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.89      0.89     10000



In [37]:
print(confusion_matrix(y_test, y_pred))

[[4576  375]
 [ 685 4364]]
