<a href="https://colab.research.google.com/github/nicoleolivetto/MMD_Final_Project/blob/main/MMDS_VR481171.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LSTM

In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

# Load the data
imdb_data = pd.read_csv('IMDB Dataset.csv')

import nltk
nltk.download('stopwords')

# Data preprocessing
# The word tokenizer gets its own name: `tokenizer` is rebound to a Keras
# Tokenizer further down in this cell, and preprocess_text() looks its
# tokenizer up at call time, so sharing the name would break every call made
# after that rebinding.
toktok_tokenizer = ToktokTokenizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    """Return a lower-cased, stopword-free version of a raw review.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, tokenize, and drop English stopwords.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = toktok_tokenizer.tokenize(text.lower())
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords_list]
    return ' '.join(tokens)

imdb_data['clean_review'] = imdb_data['review'].apply(preprocess_text)

# Vectorisation settings
max_length = 100   # maxlen handed to pad_sequences
vocab_size = 8000  # num_words handed to the Keras Tokenizer

# NOTE(review): the word index is fitted on every review before the split
# below, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(imdb_data['clean_review'])

# Cleaned reviews -> integer id sequences -> fixed-length matrix
sequences = tokenizer.texts_to_sequences(imdb_data['clean_review'])
X = pad_sequences(sequences, maxlen=max_length)

# Binary labels: positive -> 1, negative -> 0
y = np.array(imdb_data['sentiment'].map({'positive': 1, 'negative': 0}))

# Hold out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# LSTM classifier: embedding -> LSTM -> dense head with a sigmoid output
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=200),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training: monitor val_loss with a patience of 2 epochs and restore the
# best weights when training stops
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Accuracy, per-class report and confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  text = BeautifulSoup(text, "html.parser").get_text()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Test Loss: 0.298311710357666, Test Accuracy: 0.8776666522026062
Accuracy: 0.8776666666666667
              precision    recall  f1-score   support

           0       0.87      0.88      0.88      7411
           1       0.88      0.88      0.88      7589

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000

[[6514  897]
 [ 938 6651]]


CNN

In [None]:
# CNN Model
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.layers import Reshape
from keras.layers import Conv1D, MaxPooling1D, Flatten

# Define the CNN model
# This cell reuses vocab_size, max_length, embedding_dim, the train/test
# arrays and the early_stopping callback created in the LSTM cell above.
# The embedding is sized from embedding_dim / max_length (both 100) instead
# of literal 100s so it stays in step with the padded input and with the
# other models if those settings are changed.
model_cnn = Sequential([
    # Embedding layer
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # First convolutional layer
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Second convolutional layer
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Flatten the output of the convolutional layers
    Flatten(),
    # Fully connected layer
    Dense(units=64, activation='relu'),
    # Dropout layer
    Dropout(0.5),
    # Output layer
    Dense(units=1, activation='sigmoid'),
])

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model_cnn.summary()


# Training CNN model
history_cnn = model_cnn.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluation CNN model
loss_cnn, accuracy_cnn = model_cnn.evaluate(X_test, y_test)
print(f'CNN Test Loss: {loss_cnn}, CNN Test Accuracy: {accuracy_cnn}')
y_pred_cnn_prob = model_cnn.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_cnn = (y_pred_cnn_prob > 0.5).astype(int)

# Evaluation metrics for CNN
print("CNN Model Accuracy:", accuracy_score(y_test, y_pred_cnn))
print("CNN Model Classification Report:\n", classification_report(y_test, y_pred_cnn))
print("CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cnn))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          800000    
                                                                 
 conv1d (Conv1D)             (None, 98, 128)           38528     
                                                                 
 max_pooling1d (MaxPooling1  (None, 49, 128)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 47, 64)            24640     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 23, 64)            0         
 g1D)                                                            
                                                                 
 flatten (Flatten)           (None, 1472)             

LSTM-CNN

In [None]:
from keras.layers import LSTM, Bidirectional

# Hybrid model: embedding -> bidirectional LSTM (full sequence output) ->
# two Conv1D/MaxPooling1D stages -> dense head with a sigmoid output.
# Relies on vocab_size, embedding_dim, max_length, the train/test arrays and
# early_stopping from the earlier cells.
model_lstm_cnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # return_sequences=True keeps one output per time step for the convolutions
    Bidirectional(LSTM(units=200, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

model_lstm_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Show the layer table
model_lstm_cnn.summary()

# Fit with the shared early-stopping callback
history_lstm_cnn = model_lstm_cnn.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score on the held-out test split
loss_lstm_cnn, accuracy_lstm_cnn = model_lstm_cnn.evaluate(X_test, y_test)
print(f'LSTM-CNN Test Loss: {loss_lstm_cnn}, LSTM-CNN Test Accuracy: {accuracy_lstm_cnn}')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_lstm_cnn_prob = model_lstm_cnn.predict(X_test)
y_pred_lstm_cnn = (y_pred_lstm_cnn_prob > 0.5).astype(int)

# Accuracy, per-class report and confusion matrix
print("LSTM-CNN Model Accuracy:", accuracy_score(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Classification Report:\n", classification_report(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm_cnn))


STRESS-TEST
1. NOISE

In [None]:
#Adding noise

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping

# Load the data
imdb_data = pd.read_csv('IMDB Dataset.csv')

import nltk
nltk.download('stopwords')

# Data preprocessing
# The word tokenizer gets its own name: `tokenizer` is rebound to a Keras
# Tokenizer further down in this cell, and preprocess_text() looks its
# tokenizer up at call time, so sharing the name would break every call made
# after that rebinding.
toktok_tokenizer = ToktokTokenizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    """Return a lower-cased, stopword-free version of a raw review.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, tokenize, and drop English stopwords.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = toktok_tokenizer.tokenize(text.lower())
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords_list]
    return ' '.join(tokens)

imdb_data['clean_review'] = imdb_data['review'].apply(preprocess_text)

# Add Gaussian noise to an array
def add_noise_to_embeddings(embeddings, noise_factor, rng=None):
    """Return a copy of `embeddings` with zero-mean Gaussian noise added.

    Args:
        embeddings: Array (or array-like) to perturb; it is not modified.
        noise_factor: Standard deviation of the noise; must be >= 0.
        rng: Optional ``np.random.Generator`` or ``np.random.RandomState``
            used to draw the noise, for reproducible runs. When omitted the
            noise comes from NumPy's global random state, as before.

    Returns:
        A new array with the same shape as `embeddings`. The noise is
        floating point, so integer input yields a float result.
    """
    values = np.asarray(embeddings)
    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0.0, scale=noise_factor, size=values.shape)
    return values + noise

# Vectorisation settings
max_length = 100   # maxlen handed to pad_sequences
vocab_size = 8000  # num_words handed to the Keras Tokenizer

# NOTE(review): the word index is fitted on every review before the split
# below, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(imdb_data['clean_review'])

# Cleaned reviews -> integer id sequences -> fixed-length matrix
sequences = tokenizer.texts_to_sequences(imdb_data['clean_review'])
X = pad_sequences(sequences, maxlen=max_length)

# Binary labels: positive -> 1, negative -> 0
y = np.array(imdb_data['sentiment'].map({'positive': 1, 'negative': 0}))

# Hold out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Add noise to the training inputs
# X_train holds integer token ids (the output of pad_sequences), not embedding
# vectors. Adding raw Gaussian noise turns it into a float array whose values
# can fall below 0 or reach vocab_size, which are not valid rows of the
# Embedding layer. Round the noisy values back to whole ids and clip them into
# [0, vocab_size - 1] so every entry is still a legal token id.
# NOTE(review): padding zeros are perturbed as well, as in the original code.
np.random.seed(42)  # make the injected noise reproducible across runs
noise_factor = 1  # Standard deviation of the noise, in token-id units
noisy_X_train = add_noise_to_embeddings(X_train, noise_factor)
noisy_X_train = np.clip(np.rint(noisy_X_train), 0, vocab_size - 1).astype(X_train.dtype)


# LSTM classifier trained on the noisy inputs:
# embedding -> LSTM -> dense head with a sigmoid output
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=200),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training: monitor val_loss with a patience of 2 epochs and restore the
# best weights when training stops
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    noisy_X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score the model on the unperturbed test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Accuracy, per-class report and confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# CNN Model
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.layers import Reshape
from keras.layers import Conv1D, MaxPooling1D, Flatten

# Define the CNN model
# The embedding is sized from embedding_dim / max_length (both 100, set
# earlier in this cell) instead of literal 100s so it stays in step with the
# padded input and with the other models if those settings are changed.
model_cnn = Sequential([
    # Embedding layer
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # First convolutional layer
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Second convolutional layer
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Flatten the output of the convolutional layers
    Flatten(),
    # Fully connected layer
    Dense(units=64, activation='relu'),
    # Dropout layer
    Dropout(0.5),
    # Output layer
    Dense(units=1, activation='sigmoid'),
])

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model_cnn.summary()


# Training CNN model on the noisy inputs
history_cnn = model_cnn.fit(
    noisy_X_train,
    y_train,
    epochs=8,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluation CNN model on the unperturbed test split
loss_cnn, accuracy_cnn = model_cnn.evaluate(X_test, y_test)
print(f'CNN Test Loss: {loss_cnn}, CNN Test Accuracy: {accuracy_cnn}')
y_pred_cnn_prob = model_cnn.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_cnn = (y_pred_cnn_prob > 0.5).astype(int)

# Evaluation metrics for CNN
print("CNN Model Accuracy:", accuracy_score(y_test, y_pred_cnn))
print("CNN Model Classification Report:\n", classification_report(y_test, y_pred_cnn))
print("CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cnn))

from keras.layers import LSTM, Bidirectional

# Hybrid model trained on the noisy inputs: embedding -> bidirectional LSTM
# (full sequence output) -> two Conv1D/MaxPooling1D stages -> dense head
# with a sigmoid output.
model_lstm_cnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # return_sequences=True keeps one output per time step for the convolutions
    Bidirectional(LSTM(units=200, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

model_lstm_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Show the layer table
model_lstm_cnn.summary()

# Fit on the perturbed training data with the shared early-stopping callback
history_lstm_cnn = model_lstm_cnn.fit(
    noisy_X_train,
    y_train,
    epochs=6,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score on the unperturbed test split
loss_lstm_cnn, accuracy_lstm_cnn = model_lstm_cnn.evaluate(X_test, y_test)
print(f'LSTM-CNN Test Loss: {loss_lstm_cnn}, LSTM-CNN Test Accuracy: {accuracy_lstm_cnn}')

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_lstm_cnn_prob = model_lstm_cnn.predict(X_test)
y_pred_lstm_cnn = (y_pred_lstm_cnn_prob > 0.5).astype(int)

# Accuracy, per-class report and confusion matrix
print("LSTM-CNN Model Accuracy:", accuracy_score(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Classification Report:\n", classification_report(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm_cnn))


2. IMBALANCED DATASET

In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Bidirectional
from keras.callbacks import EarlyStopping

# Load the data
imdb_data = pd.read_csv('IMDB Dataset.csv')

import nltk
nltk.download('stopwords')

# Data preprocessing
# The word tokenizer gets its own name: `tokenizer` is rebound to a Keras
# Tokenizer further down in this cell, and preprocess_text() looks its
# tokenizer up at call time, so sharing the name would break every call made
# after that rebinding.
toktok_tokenizer = ToktokTokenizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    """Return a lower-cased, stopword-free version of a raw review.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, tokenize, and drop English stopwords.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = toktok_tokenizer.tokenize(text.lower())
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords_list]
    return ' '.join(tokens)

imdb_data['clean_review'] = imdb_data['review'].apply(preprocess_text)

# Filter positive and negative reviews
# 15,000 positive vs 10,000 negative rows gives a 60/40 class imbalance.
positive_reviews = imdb_data[imdb_data['sentiment'] == 'positive'].sample(n=15000, random_state=42)
negative_reviews = imdb_data[imdb_data['sentiment'] == 'negative'].sample(n=10000, random_state=42)

# Concatenate positive and negative reviews to create imbalanced dataset
imbalanced_data = pd.concat([positive_reviews, negative_reviews], ignore_index=True)

# Tokenization and padding
max_length = 100
vocab_size = 8000

# NOTE(review): the word index is fitted on every sampled review before the
# split below, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(imbalanced_data['clean_review'])

sequences = tokenizer.texts_to_sequences(imbalanced_data['clean_review'])
X = pad_sequences(sequences, maxlen=max_length)
y = np.array(imbalanced_data['sentiment'].map({'positive': 1, 'negative': 0}))

# Train-test split
# Stratify on the labels so the train and test sets both keep the 60/40
# class ratio built above; an unstratified split lets the ratio drift
# between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Model definitions
# All three models share the same embedding settings (vocab_size,
# embedding_dim, max_length) and the same early-stopping callback.
embedding_dim = 100
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# LSTM Model
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=200),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# CNN Model
# output_dim uses embedding_dim (100) rather than a literal 100 so the CNN
# follows the same setting as the other two models.
cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# LSTM-CNN Model
lstm_cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # return_sequences=True keeps one output per time step for the convolutions
    Bidirectional(LSTM(units=200, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
lstm_cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the three models in turn on the imbalanced training split; each call
# holds out the last 20% of it for validation and uses the shared callback
lstm_history = lstm_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)
cnn_history = cnn_model.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)
lstm_cnn_history = lstm_cnn_model.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Loss and accuracy of each model on the test split
loss_lstm, accuracy_lstm = lstm_model.evaluate(X_test, y_test)
loss_cnn, accuracy_cnn = cnn_model.evaluate(X_test, y_test)
loss_lstm_cnn, accuracy_lstm_cnn = lstm_cnn_model.evaluate(X_test, y_test)

# Report the three results side by side
print(f'LSTM Test Loss: {loss_lstm}, LSTM Test Accuracy: {accuracy_lstm}')
print(f'CNN Test Loss: {loss_cnn}, CNN Test Accuracy: {accuracy_cnn}')
print(f'LSTM-CNN Test Loss: {loss_lstm_cnn}, LSTM-CNN Test Accuracy: {accuracy_lstm_cnn}')


In [None]:
# Each section thresholds the model's sigmoid outputs at 0.5 and prints
# accuracy, the per-class report and the confusion matrix on the test split.

# --- LSTM ---
y_pred_lstm = (lstm_model.predict(X_test) > 0.5).astype(int)
print("LSTM Model Accuracy:", accuracy_score(y_test, y_pred_lstm))
print(
    "LSTM Model Classification Report:\n",
    classification_report(y_test, y_pred_lstm),
)
print(
    "LSTM Model Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_lstm),
)

# --- CNN ---
y_pred_cnn = (cnn_model.predict(X_test) > 0.5).astype(int)
print("CNN Model Accuracy:", accuracy_score(y_test, y_pred_cnn))
print(
    "CNN Model Classification Report:\n",
    classification_report(y_test, y_pred_cnn),
)
print(
    "CNN Model Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_cnn),
)

# --- LSTM-CNN ---
y_pred_lstm_cnn = (lstm_cnn_model.predict(X_test) > 0.5).astype(int)
print("LSTM-CNN Model Accuracy:", accuracy_score(y_test, y_pred_lstm_cnn))
print(
    "LSTM-CNN Model Classification Report:\n",
    classification_report(y_test, y_pred_lstm_cnn),
)
print(
    "LSTM-CNN Model Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_lstm_cnn),
)

3. BATCH SIZE

In [None]:
# 8,24,512 batch size

#batch size

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Flatten, Bidirectional

# Load the data
imdb_data = pd.read_csv('IMDB Dataset.csv')

import nltk
nltk.download('stopwords')

# Data preprocessing
# The word tokenizer gets its own name: `tokenizer` is rebound to a Keras
# Tokenizer further down in this cell, and preprocess_text() looks its
# tokenizer up at call time, so sharing the name would break every call made
# after that rebinding.
toktok_tokenizer = ToktokTokenizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    """Return a lower-cased, stopword-free version of a raw review.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, tokenize, and drop English stopwords.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = toktok_tokenizer.tokenize(text.lower())
    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords_list]
    return ' '.join(tokens)

imdb_data['clean_review'] = imdb_data['review'].apply(preprocess_text)

# Vectorisation settings
max_length = 100   # maxlen handed to pad_sequences
vocab_size = 8000  # num_words handed to the Keras Tokenizer

# NOTE(review): the word index is fitted on every review before the split
# below, so the vocabulary also reflects test-set text.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(imdb_data['clean_review'])

# Cleaned reviews -> integer id sequences -> fixed-length matrix
sequences = tokenizer.texts_to_sequences(imdb_data['clean_review'])
X = pad_sequences(sequences, maxlen=max_length)

# Binary labels: positive -> 1, negative -> 0
y = np.array(imdb_data['sentiment'].map({'positive': 1, 'negative': 0}))

# Hold out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Batch size under test. The comment at the top of this cell lists 8, 24 and
# 512; change this single value to re-run all three models with another
# batch size instead of editing each fit() call.
batch_size = 512

# Model definition - LSTM
embedding_dim = 100
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=200),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model training - LSTM
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history_lstm = model_lstm.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Model evaluation - LSTM
loss_lstm, accuracy_lstm = model_lstm.evaluate(X_test, y_test)
print(f'LSTM Test Loss: {loss_lstm}, LSTM Test Accuracy: {accuracy_lstm}')
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred_lstm = (model_lstm.predict(X_test) > 0.5).astype(int)
print("LSTM Model Accuracy:", accuracy_score(y_test, y_pred_lstm))
print("LSTM Model Classification Report:\n", classification_report(y_test, y_pred_lstm))
print("LSTM Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm))

# Model definition - CNN
# The embedding is sized from embedding_dim / max_length (both 100) instead
# of literal 100s so it stays in step with the padded input and with the
# other two models.
model_cnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model training - CNN
history_cnn = model_cnn.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Model evaluation - CNN
loss_cnn, accuracy_cnn = model_cnn.evaluate(X_test, y_test)
print(f'CNN Test Loss: {loss_cnn}, CNN Test Accuracy: {accuracy_cnn}')
y_pred_cnn = (model_cnn.predict(X_test) > 0.5).astype(int)
print("CNN Model Accuracy:", accuracy_score(y_test, y_pred_cnn))
print("CNN Model Classification Report:\n", classification_report(y_test, y_pred_cnn))
print("CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cnn))

# Model definition - LSTM-CNN
model_lstm_cnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # return_sequences=True keeps one output per time step for the convolutions
    Bidirectional(LSTM(units=200, return_sequences=True)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
model_lstm_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model training - LSTM-CNN
history_lstm_cnn = model_lstm_cnn.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Model evaluation - LSTM-CNN
loss_lstm_cnn, accuracy_lstm_cnn = model_lstm_cnn.evaluate(X_test, y_test)
print(f'LSTM-CNN Test Loss: {loss_lstm_cnn}, LSTM-CNN Test Accuracy: {accuracy_lstm_cnn}')
y_pred_lstm_cnn = (model_lstm_cnn.predict(X_test) > 0.5).astype(int)
print("LSTM-CNN Model Accuracy:", accuracy_score(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Classification Report:\n", classification_report(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm_cnn))

