In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it
# inside preprocess_and_split_data.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kurtfischer/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Loading and Preprocessing

In [4]:
# Read the labelled review CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='fake_reviews_dataset.csv')
df.head(n=5)

Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen,5.0,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen,1.0,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen,5.0,Very nice set. Good quality. We have had the s...,1


In [6]:
def preprocess_and_split_data(df, max_words=10000, max_len=100, test_size=0.2, random_state=42):
    """Split the reviews into train/test sets and encode the text as padded index sequences.

    The split is done before any text processing and the tokenizer is fitted on
    the training texts only, so the vocabulary is built without looking at the
    test split.

    Args:
        df: DataFrame with a 'text' column (review text) and a 'label' column.
        max_words: ``num_words`` handed to the Keras ``Tokenizer``.
        max_len: ``maxlen`` handed to ``pad_sequences`` for both splits.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test shuffle.

    Returns:
        Tuple ``(X_train_pad, X_test_pad, y_train, y_test, tokenizer)``: the two
        padded sequence arrays, the matching label Series, and the fitted
        tokenizer.

    Side effects:
        Adds an 'original_text' column (a copy of 'text') to the caller's
        ``df``. This is kept for backward compatibility with cells that may
        read it.
    """
    # Load stop words and initialize stemmer
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def preprocess_text(text):
        """Drop English stop words and stem the remaining whitespace-separated tokens."""
        # pandas represents an empty CSV cell as float NaN, which has no
        # .split(); treat missing text as empty and coerce other non-strings.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        return ' '.join(
            stemmer.stem(word)
            for word in text.split()
            if word.lower() not in stop_words
        )

    # Save a copy of the original text (mutates the caller's frame; see docstring)
    df['original_text'] = df['text']

    # Split first so the tokenizer below never sees test-set text
    X = df['text']   # Features (text data)
    y = df['label']  # Labels
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Apply preprocessing to the training and testing sets separately
    X_train = X_train.apply(preprocess_text)
    X_test = X_test.apply(preprocess_text)

    # Fit the vocabulary on the training texts only
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad at the end of each sequence so both splits share the same width
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

    # Return the processed datasets and tokenizer
    return X_train_pad, X_test_pad, y_train, y_test, tokenizer

In [8]:
# Build the padded train/test matrices, then report their dimensions.
X_train, X_test, y_train, y_test, tokenizer = preprocess_and_split_data(df)

for shape_label, padded in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(shape_label, padded.shape)

Train shape: (32420, 100)
Test shape: (8106, 100)


# Base CNN Model

In [13]:
# Seed Python, NumPy and the backend RNGs before any weights are created so the
# baseline can be reproduced on a re-run (GPU kernels may still introduce small
# run-to-run differences).
set_random_seed(42)

# Baseline text CNN: embedding -> one Conv1D -> global max pool -> small dense head.
model = Sequential([
    # The embedding table is sized from the tokenizer's vocabulary cap so the
    # two cannot drift apart.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single sigmoid unit for the binary label
])

In [15]:
# Single sigmoid output -> binary cross-entropy; accuracy is tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [17]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# 20% of the training data is held out by Keras as the validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    callbacks=[early_stopping],
    validation_split=0.2,
)

Epoch 1/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7575 - loss: 0.4824 - val_accuracy: 0.8984 - val_loss: 0.2502
Epoch 2/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9320 - loss: 0.1898 - val_accuracy: 0.9042 - val_loss: 0.2433
Epoch 3/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9681 - loss: 0.0972 - val_accuracy: 0.9050 - val_loss: 0.2736
Epoch 4/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9845 - loss: 0.0522 - val_accuracy: 0.8968 - val_loss: 0.3489
Epoch 5/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9922 - loss: 0.0286 - val_accuracy: 0.8970 - val_loss: 0.4232


In [19]:
# Score the trained baseline on the held-out test split.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print("CNN Test Accuracy:", accuracy)

[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8967 - loss: 0.2570
CNN Test Accuracy: 0.8994572162628174


# Varying number of filters and kernel sizes

In [24]:
# Filter and kernel experiment configurations: a 3x3 grid over Conv1D width and
# kernel size; every other layer matches the baseline CNN.
conv_experiments = [
    {"name": f"filters_{f}_kernel_{k}", "filters": f, "kernel_size": k}
    for f in [64, 128, 256]
    for k in [3, 5, 7]
]

# Size the embedding table from the tokenizer's vocabulary cap.
vocab_size = tokenizer.num_words

# Store results as (name, test accuracy) pairs
conv_results = []

# NOTE(review): configurations are compared on test accuracy, so the best score
# in this table is an optimistic estimate; history.history['val_accuracy'] is
# available if a validation-based comparison is preferred.
for config in conv_experiments:
    print(f"\nTraining: {config['name']}")

    # Release the previous model's global Keras state, then reseed so every
    # configuration starts from the same RNG state. Without this, differences
    # between runs are mixed with random-initialisation noise.
    clear_session()
    set_random_seed(42)

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=64),
        Conv1D(filters=config["filters"], kernel_size=config["kernel_size"], activation='relu'),
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0
    )

    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    conv_results.append((config["name"], accuracy))

results_df = pd.DataFrame(conv_results, columns=["Model", "Test Accuracy"])
print(results_df)


Training: filters_64_kernel_3

Training: filters_64_kernel_5

Training: filters_64_kernel_7

Training: filters_128_kernel_3

Training: filters_128_kernel_5

Training: filters_128_kernel_7

Training: filters_256_kernel_3

Training: filters_256_kernel_5

Training: filters_256_kernel_7
                  Model  Test Accuracy
0   filters_64_kernel_3       0.899334
1   filters_64_kernel_5       0.898594
2   filters_64_kernel_7       0.902665
3  filters_128_kernel_3       0.890945
4  filters_128_kernel_5       0.893782
5  filters_128_kernel_7       0.902418
6  filters_256_kernel_3       0.896496
7  filters_256_kernel_5       0.901308
8  filters_256_kernel_7       0.904515


# Varying pooling strategy

In [29]:
from tensorflow.keras.layers import MaxPooling1D, AveragePooling1D

# Pooling layer experiment configurations: global max pooling alone, or preceded
# by a local max / average pooling layer with pool_size=2.
pooling_experiments = [
    {"name": "global_max", "pooling": "global"},
    {"name": "max_then_global", "pooling": "max_then_global"},
    {"name": "avg_then_global", "pooling": "avg_then_global"},
]

# Optional local pooling layer inserted before the global max pool; strategies
# missing from this table (i.e. "global") add nothing.
pre_pooling_layers = {
    "max_then_global": MaxPooling1D,
    "avg_then_global": AveragePooling1D,
}

# Size the embedding table from the tokenizer's vocabulary cap.
vocab_size = tokenizer.num_words

pooling_results = []

# NOTE(review): configurations are compared on test accuracy, so the best score
# in this table is an optimistic estimate of generalisation.
for config in pooling_experiments:
    print(f"\nTraining: {config['name']}")

    # Release the previous model's global Keras state, then reseed so every
    # configuration starts from the same RNG state.
    clear_session()
    set_random_seed(42)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

    # Apply pooling strategy
    pre_pooling = pre_pooling_layers.get(config["pooling"])
    if pre_pooling is not None:
        model.add(pre_pooling(pool_size=2))

    model.add(GlobalMaxPooling1D())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0
    )

    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    pooling_results.append((config["name"], accuracy))

pooling_results_df = pd.DataFrame(pooling_results, columns=["Model", "Test Accuracy"])
print(pooling_results_df)


Training: global_max

Training: max_then_global

Training: avg_then_global
             Model  Test Accuracy
0       global_max       0.892549
1  max_then_global       0.900938
2  avg_then_global       0.895386


# Dense layer experimentation

In [34]:
# Dense layer experiment configurations: vary the width, depth, activation and
# dropout rate of the head that follows the global max pool.
dense_experiments = [
    {"name": "dense_32", "layers": [("dense", 32)], "dropout": 0.5, "activation": "relu"},
    {"name": "dense_64", "layers": [("dense", 64)], "dropout": 0.5, "activation": "relu"},
    {"name": "dense_128", "layers": [("dense", 128)], "dropout": 0.5, "activation": "relu"},
    {"name": "dense_64_32", "layers": [("dense", 64), ("dense", 32)], "dropout": 0.5, "activation": "relu"},
    {"name": "dense_64_tanh", "layers": [("dense", 64)], "dropout": 0.5, "activation": "tanh"},
    {"name": "dense_64_dropout_low", "layers": [("dense", 64)], "dropout": 0.2, "activation": "relu"},
    {"name": "dense_64_dropout_none", "layers": [("dense", 64)], "dropout": 0.0, "activation": "relu"},
]

# Size the embedding table from the tokenizer's vocabulary cap.
vocab_size = tokenizer.num_words

dense_results = []

# NOTE(review): configurations are compared on test accuracy, so the best score
# in this table is an optimistic estimate of generalisation.
for config in dense_experiments:
    print(f"\nTraining: {config['name']}")

    # Release the previous model's global Keras state, then reseed so every
    # configuration starts from the same RNG state.
    clear_session()
    set_random_seed(42)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())

    for layer_type, units in config["layers"]:
        # Fail loudly on a mistyped layer type instead of silently training a
        # model that is missing the layer.
        if layer_type != "dense":
            raise ValueError(f"Unsupported layer type in {config['name']}: {layer_type!r}")
        model.add(Dense(units, activation=config["activation"]))

    # A single dropout layer follows the last dense layer; a rate of 0 adds none.
    if config["dropout"] > 0:
        model.add(Dropout(config["dropout"]))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0
    )

    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    dense_results.append((config["name"], accuracy))

dense_results_df = pd.DataFrame(dense_results, columns=["Model", "Test Accuracy"])
print(dense_results_df)


Training: dense_32

Training: dense_64

Training: dense_128

Training: dense_64_32

Training: dense_64_tanh

Training: dense_64_dropout_low

Training: dense_64_dropout_none
                   Model  Test Accuracy
0               dense_32       0.903775
1               dense_64       0.899457
2              dense_128       0.902788
3            dense_64_32       0.901554
4          dense_64_tanh       0.899827
5   dense_64_dropout_low       0.895263
6  dense_64_dropout_none       0.905502


# Regularization experimentation

In [37]:
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import regularizers

# Regularization experiment configurations: dropout rate, optional L2 penalty on
# the conv and/or dense kernels, and optional batch normalization
# ("dense" = after the dense layer only, "conv_dense" = after both).
regularization_experiments = [
    {"name": "dropout_0.0", "dropout": 0.0, "l2_conv": None, "l2_dense": None, "batchnorm": False},
    {"name": "dropout_0.2", "dropout": 0.2, "l2_conv": None, "l2_dense": None, "batchnorm": False},
    {"name": "dropout_0.5", "dropout": 0.5, "l2_conv": None, "l2_dense": None, "batchnorm": False},
    {"name": "l2_reg_dense", "dropout": 0.5, "l2_conv": None, "l2_dense": 0.01, "batchnorm": False},
    {"name": "l2_reg_conv", "dropout": 0.5, "l2_conv": 0.01, "l2_dense": None, "batchnorm": False},
    {"name": "l2_reg_both", "dropout": 0.5, "l2_conv": 0.01, "l2_dense": 0.01, "batchnorm": False},
    {"name": "batchnorm_dense", "dropout": 0.5, "l2_conv": None, "l2_dense": None, "batchnorm": "dense"},
    {"name": "batchnorm_conv_dense", "dropout": 0.5, "l2_conv": None, "l2_dense": None, "batchnorm": "conv_dense"},
]

# Size the embedding table from the tokenizer's vocabulary cap.
vocab_size = tokenizer.num_words

regularization_results = []

# NOTE(review): configurations are compared on test accuracy, so the best score
# in this table is an optimistic estimate of generalisation.
for config in regularization_experiments:
    print(f"\nTraining: {config['name']}")

    # Release the previous model's global Keras state, then reseed so every
    # configuration starts from the same RNG state.
    clear_session()
    set_random_seed(42)

    # A falsy strength (None) means "no penalty"; kernel_regularizer=None is the
    # layer default, so one constructor call covers both cases.
    conv_l2 = regularizers.l2(config["l2_conv"]) if config["l2_conv"] else None
    dense_l2 = regularizers.l2(config["l2_dense"]) if config["l2_dense"] else None

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64))

    # Conv1D with optional L2
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu',
                     kernel_regularizer=conv_l2))

    if config["batchnorm"] == "conv_dense":
        model.add(BatchNormalization())

    model.add(GlobalMaxPooling1D())

    # Dense with optional L2
    model.add(Dense(64, activation='relu', kernel_regularizer=dense_l2))

    if config["batchnorm"] in ["dense", "conv_dense"]:
        model.add(BatchNormalization())

    # Optional Dropout
    if config["dropout"] > 0:
        model.add(Dropout(config["dropout"]))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0
    )

    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    regularization_results.append((config["name"], accuracy))

regularization_results_df = pd.DataFrame(regularization_results, columns=["Model", "Test Accuracy"])
print(regularization_results_df)


Training: dropout_0.0

Training: dropout_0.2

Training: dropout_0.5

Training: l2_reg_dense

Training: l2_reg_conv

Training: l2_reg_both

Training: batchnorm_dense

Training: batchnorm_conv_dense
                  Model  Test Accuracy
0           dropout_0.0       0.903282
1           dropout_0.2       0.899951
2           dropout_0.5       0.894399
3          l2_reg_dense       0.889835
4           l2_reg_conv       0.892055
5           l2_reg_both       0.893166
6       batchnorm_dense       0.895016
7  batchnorm_conv_dense       0.896003
