In [3]:
import os
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [5]:
def load_imdb_dataset(base_path):
    """
    Loads IMDB reviews from the given directory structure:
    base_path/train/pos, train/neg, test/pos, test/neg
    Returns: train_df, test_df
    """

    def load_split(split):
        texts = []
        labels = []

        for label in ["pos", "neg"]:
            folder = os.path.join(base_path, split, label)

            for filename in os.listdir(folder):
                file_path = os.path.join(folder, filename)
                with open(file_path, "r", encoding="utf-8") as f:
                    review = f.read()

                texts.append(review)
                labels.append(1 if label == "pos" else 0)

        return pd.DataFrame({"text": texts, "label": labels})

    train_df = load_split("train")
    test_df = load_split("test")

    return train_df, test_df

In [6]:
dataset_path = "/Users/mohandsabry/PycharmProjects/PythonProject9/aclImdb"

train_df, test_df = load_imdb_dataset(dataset_path)

In [7]:
print(train_df.head())
print(train_df.label.value_counts())
print(test_df.label.value_counts())

print("Train size:", len(train_df))
print("Test size:", len(test_df))


                                                text  label
0  For a movie that gets no respect there sure ar...      1
1  Bizarre horror movie filled with famous faces ...      1
2  A solid, if unremarkable film. Matthau, as Ein...      1
3  It's a strange feeling to sit alone in a theat...      1
4  You probably all already know this by now, but...      1
label
1    12500
0    12500
Name: count, dtype: int64
label
1    12500
0    12500
Name: count, dtype: int64
Train size: 25000
Test size: 25000


In [8]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [32]:
train_df["clean"] = train_df["text"].apply(clean_text)
test_df["clean"] = test_df["text"].apply(clean_text)

In [30]:
MAX_WORDS = 20000
MAX_LEN = 200
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df["clean"])

X_train = tokenizer.texts_to_sequences(train_df["clean"])
X_test = tokenizer.texts_to_sequences(test_df["clean"])

In [31]:
X_train = pad_sequences(X_train, maxlen=MAX_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_LEN)

y_train = train_df["label"].values
y_test = test_df["label"].values

In [33]:
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    GRU(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

In [34]:
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"]
)

In [35]:
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 42ms/step - accuracy: 0.7728 - loss: 0.4740 - val_accuracy: 0.7024 - val_loss: 0.7922
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 38ms/step - accuracy: 0.8993 - loss: 0.2650 - val_accuracy: 0.8540 - val_loss: 0.3356
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.9408 - loss: 0.1579 - val_accuracy: 0.8154 - val_loss: 0.5868
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 41ms/step - accuracy: 0.9699 - loss: 0.0888 - val_accuracy: 0.8564 - val_loss: 0.5270
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.9846 - loss: 0.0450 - val_accuracy: 0.8130 - val_loss: 0.7449
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 43ms/step - accuracy: 0.9917 - loss: 0.0260 - val_accuracy: 0.8264 - val_loss: 0.8576
Epoch 7/10
[1m6

In [36]:
loss, acc = model.evaluate(X_test, y_test)
val_acc = history.history['val_accuracy'][-1]
print(f"\nTest Accuracy: {acc * 100:.2f}%")
print(f"Validation Accuracy: {val_acc * 100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.8400 - loss: 1.2882

Test Accuracy: 84.00%
Validation Accuracy: 84.14%
