In [None]:
# import libraries
!pip install sumy parsel pycountry tensorflow_hub nltk transformers tensorflow
!pip install -q nltk pycountry sentence-transformers
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import re
import random
import nltk
import os
import tensorflow as tf
import pycountry


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from keras.models import Model
from sklearn.utils import shuffle

nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bassam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bassam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def load_data(data_dir, subset='train'):
    """
    Load IMDB dataset from given directory.
    
    Args:
        data_dir (str): Path to 'data' folder containing 'train' and 'test'.
        subset (str): 'train' or 'test' (default 'train')
    
    Returns:
        pd.DataFrame: DataFrame with columns ['review', 'sentiment']
    """
    reviews = []
    labels = []

    for label in ['pos', 'neg']:
        folder_path = os.path.join(data_dir, subset, label)  # include subset folder
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder not found: {folder_path}")
        for filename in os.listdir(folder_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    reviews.append(f.read())
                    labels.append(1 if label == 'pos' else 0)  # 1 for positive, 0 for negative

    return pd.DataFrame({'review': reviews, 'sentiment': labels})

# Example usage
data_dir = r"C:\Users\bassam\OneDrive\Desktop\task_2\data"
train_df = load_data(data_dir, subset='train')
test_df = load_data(data_dir, subset='test')


In [37]:
test_df.drop_duplicates(inplace=True)
train_df.drop_duplicates(inplace=True)

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24904 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     24904 non-null  object
 1   sentiment  24904 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 583.7+ KB


In [41]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24801 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     24801 non-null  object
 1   sentiment  24801 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 581.3+ KB


In [43]:
test_ready=test_df.copy()
train_ready=train_df.copy()

In [45]:
train_ready = shuffle(train_ready, random_state=42).reset_index(drop=True)

# -----------------------------------------------------
# Helper functions for text augmentation
# -----------------------------------------------------
#wordnet
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lem in syn.lemmas():
            if lem.name().lower() != word.lower():
                synonyms.add(lem.name().replace("_", " "))
    return list(synonyms)

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    if len(words) < 2:
        return sentence

    candidates = [w for w in words if get_synonyms(w)]
    if not candidates:
        return sentence

    for _ in range(n):
        word = random.choice(candidates)
        synonym = random.choice(get_synonyms(word))
        words = [synonym if w == word else w for w in words]

    return " ".join(words)

def random_deletion(sentence, p=0.1):
    words = sentence.split()
    if len(words) == 1:
        return sentence
    new_words = [w for w in words if random.random() > p]
    return " ".join(new_words) if new_words else random.choice(words)

def random_swap(sentence, n=1):
    words = sentence.split()
    if len(words) < 2:
        return sentence

    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]

    return " ".join(words)

def augment_text(text):
    choice = random.choice(["synonym", "swap", "delete"])

    if choice == "synonym":
        return synonym_replacement(text, n=1)
    elif choice == "swap":
        return random_swap(text, n=1)
    elif choice == "delete":
        return random_deletion(text, p=0.15)

    return text

# -----------------------------------------------------
# STEP 2: Oversample minority classes with augmentation
# -----------------------------------------------------

class_counts = train_ready["sentiment"].value_counts()
max_count = class_counts.max()
augmented_rows = []
for i in range(len(train_ready)):
    row = train_ready.iloc[i]
    new_desc = augment_text(row["review"])
    augmented_rows.append({"review": new_desc, "sentiment": row["sentiment"]})

aug_df = pd.DataFrame(augmented_rows)
train_ready = shuffle(pd.concat([train_ready, aug_df], ignore_index=True), random_state=42)


In [46]:
train_ready.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49808 entries, 262 to 15795
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49808 non-null  object
 1   sentiment  49808 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [47]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24904 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     24904 non-null  object
 1   sentiment  24904 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 583.7+ KB


In [48]:
# Combine all preprocessing steps


# Initialize stop_words and lemmatizer globally for efficiency
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # 1. Convert text to lowercase
    def to_lower(text):
        return text.lower()

    # 2. Remove punctuation and numbers
    def remove_punctuation_numbers(text):
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'\b(\d+)(st|nd|rd|th)\b', '', text)  # Remove Ordered Numbering
        text = re.sub(r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b', '', text)
        return text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    # 3. Tokenize text into words
    def tokenize(text):
        return nltk.word_tokenize(text)

    # 4. Modified remove_stopwords function: now operates on a list of tokens
    def remove_stopwords(tokens):
        return [word for word in tokens if word not in stop_words]

    # 5. Apply lemmatization to reduce words to their base (dictionary) form
    def apply_lemmatization(tokens):
        return [lemmatizer.lemmatize(word) for word in tokens]

    text = to_lower(text)
    text = remove_punctuation_numbers(text)

    # Original country removal logic, now applied to the string before tokenization
    for c in pycountry.countries:
        # Using re.escape for robustness against special characters in country names
        text = re.sub(r'\b' + re.escape(c.name.lower()) + r'\b', '', text)

    tokens = tokenize(text) # Tokenize the cleaned string
    tokens = remove_stopwords(tokens) # Call the modified remove_stopwords with tokens
    tokens = apply_lemmatization(tokens) # Apply lemmatization to the tokens
    return " ".join(tokens)


# 6- Apply Preprocessing
train_ready["review"] = train_ready["review"].apply(preprocess_text)
test_ready["review"] = test_ready["review"].apply(preprocess_text)

In [50]:
test_df['review'][0]

"I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."

In [49]:
test_ready['review'][0]

'went saw movie last night coaxed friend mine ill admit reluctant see knew ashton kutcher able comedy wrong kutcher played character jake fischer well kevin costner played ben randall professionalism sign good movie toy emotion one exactly entire theater sold overcome laughter first half movie moved tear second half exiting theater saw many woman tear many full grown men well trying desperately let anyone see cry movie great suggest go see judge'

In [2]:
df=pd.read_csv('data/CSV/balanced_preprocessed_data.csv')

In [66]:
X_train = train_ready["review"].values
X_test  = test_ready["review"].values

y_train = train_ready["sentiment"].values
y_test  = test_ready["sentiment"].values


In [67]:
#safety Check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

print("\nTraining labels distribution:")
print(pd.Series(y_train).value_counts())

print("\nTest labels distribution:")
print(pd.Series(y_test).value_counts())

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
print("Data is correctly aligned!")


X_train shape: (49808,)
y_train shape: (49808,)
X_test shape: (24801,)
y_test shape: (24801,)

Training labels distribution:
1    24944
0    24864
Name: count, dtype: int64

Test labels distribution:
1    12440
0    12361
Name: count, dtype: int64
Data is correctly aligned!


In [68]:
# Create tokenizer
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)


X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

In [70]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)

X_train_pad, y_train = smote.fit_resample(
    X_train_pad,
    y_train
)

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# ==========================
# Model Definition
# ==========================
model = Sequential([
    
    Embedding(20000, 128, input_length=max_len, mask_zero=True),

    Conv1D(64, 5, activation="relu", kernel_regularizer=l2(0.002)),
    Dropout(0.5),

    LSTM(32, return_sequences=True, kernel_regularizer=l2(0.002)),
    Dropout(0.5),

    GlobalMaxPooling1D(),

    
    Dense(16, activation="relu", kernel_regularizer=l2(0.002)),
    Dropout(0.5),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)


callbacks = [
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1),
    ModelCheckpoint("best_cnn_lstm_model.h5", save_best_only=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1)
]


history = model.fit(
    X_train_pad,
    y_train,
    epochs=10,           
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)


Epoch 1/10


  else:


[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 44ms/step - accuracy: 0.5212 - loss: 0.9281



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 47ms/step - accuracy: 0.5213 - loss: 0.9279 - val_accuracy: 0.8225 - val_loss: 0.6066 - learning_rate: 1.0000e-04
Epoch 2/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8565 - loss: 0.4738



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.8565 - loss: 0.4737 - val_accuracy: 0.9045 - val_loss: 0.3810 - learning_rate: 1.0000e-04
Epoch 3/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 42ms/step - accuracy: 0.9339 - loss: 0.2994



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9339 - loss: 0.2994 - val_accuracy: 0.9234 - val_loss: 0.3302 - learning_rate: 1.0000e-04
Epoch 4/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - accuracy: 0.9588 - loss: 0.2360



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9588 - loss: 0.2360 - val_accuracy: 0.9385 - val_loss: 0.2925 - learning_rate: 1.0000e-04
Epoch 5/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - accuracy: 0.9725 - loss: 0.2025



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9725 - loss: 0.2025 - val_accuracy: 0.9421 - val_loss: 0.2455 - learning_rate: 1.0000e-04
Epoch 6/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 42ms/step - accuracy: 0.9797 - loss: 0.1475



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9797 - loss: 0.1475 - val_accuracy: 0.9439 - val_loss: 0.2240 - learning_rate: 1.0000e-04
Epoch 7/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.9833 - loss: 0.1320



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9833 - loss: 0.1320 - val_accuracy: 0.9479 - val_loss: 0.2127 - learning_rate: 1.0000e-04
Epoch 8/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 48ms/step - accuracy: 0.9875 - loss: 0.1175



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 51ms/step - accuracy: 0.9875 - loss: 0.1175 - val_accuracy: 0.9506 - val_loss: 0.1927 - learning_rate: 1.0000e-04
Epoch 9/10
[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9904 - loss: 0.1022 - val_accuracy: 0.9509 - val_loss: 0.1954 - learning_rate: 1.0000e-04
Epoch 10/10
[1m1247/1248[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - accuracy: 0.9916 - loss: 0.0967



[1m1248/1248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.9916 - loss: 0.0967 - val_accuracy: 0.9524 - val_loss: 0.1857 - learning_rate: 1.0000e-04
Restoring model weights from the end of the best epoch: 10.


In [74]:
test_loss, test_acc = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", test_acc)


[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.8452 - loss: 0.4905
Test Accuracy: 0.8325470685958862


In [76]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype("int64")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step
Confusion Matrix:
 [[10085  2276]
 [ 1877 10563]]

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.82      0.83     12361
           1       0.82      0.85      0.84     12440

    accuracy                           0.83     24801
   macro avg       0.83      0.83      0.83     24801
weighted avg       0.83      0.83      0.83     24801



In [79]:
sample = ["This movie was amazing"]
sample_seq = tokenizer.texts_to_sequences(sample)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding="post")

prediction = model.predict(sample_pad)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


array([[0.94440687]], dtype=float32)