In [13]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Read the labelled review corpus; the path is resolved relative to the
# kernel's working directory.
df = pd.read_csv(filepath_or_buffer='fake_reviews_dataset.csv')
# Preview the first five rows as a quick check of the columns that were loaded.
df.head(n=5)

Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen,5.0,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen,1.0,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen,5.0,Very nice set. Good quality. We have had the s...,1


In [16]:
def preprocess_and_split_data(df, max_words=10000, max_len=100, test_size=0.2, random_state=42):
    """Split the reviews into train/test sets and encode the text as padded index sequences.

    The text is split first and cleaned afterwards, and the tokenizer is fitted
    on the training portion only, so no vocabulary information from the test
    set reaches the training pipeline.

    Args:
        df: DataFrame with a 'text' column (review text) and a 'label' column.
        max_words: Vocabulary cap passed to the Keras Tokenizer as ``num_words``.
        max_len: Length every sequence is padded/truncated to.
        test_size: Fraction of rows held out for the test set.
        random_state: Seed for the train/test shuffle.

    Returns:
        Tuple ``(X_train_pad, X_test_pad, y_train, y_test, tokenizer)`` where the
        padded arrays have ``max_len`` columns, the labels are the matching
        slices of ``df['label']``, and ``tokenizer`` is the fitted Tokenizer
        (needed later to encode new reviews the same way).
    """
    # Load stop words and initialize the stemmer once; both are reused for every review.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def preprocess_text(text):
        """Drop English stop words (case-insensitive) and Porter-stem the remaining tokens."""
        return ' '.join(
            stemmer.stem(word)
            for word in text.split()
            if word.lower() not in stop_words
        )

    # Empty CSV cells are loaded as NaN (a float), which has no .split();
    # normalise every review to a string so one missing value cannot abort the run.
    X = df['text'].fillna('').astype(str)  # Features (text data)
    y = df['label']  # Labels

    # Split before cleaning so train and test are processed independently.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    X_train = X_train.apply(preprocess_text)
    X_test = X_test.apply(preprocess_text)

    # Build the vocabulary from the training text only.
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Post-padding: short reviews are filled at the end up to max_len.
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
    X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

    return X_train_pad, X_test_pad, y_train, y_test, tokenizer

In [15]:
# Run the whole preprocessing pipeline once; the fitted tokenizer is kept so
# that new reviews can be encoded with the same vocabulary later on.
(X_train, X_test,
 y_train, y_test,
 tokenizer) = preprocess_and_split_data(df)

# Sanity check: each split must have as many feature rows as labels.
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (32420, 100) (32420,)
Test shape: (8106, 100) (8106,)


In [18]:
# Example of how to use the preprocessed data for a simple LSTM model

# Seed Python, NumPy and the TensorFlow backend before any weights are created,
# so that re-running the notebook gives comparable training curves.
set_random_seed(42)

# Simple LSTM model for testing.
# - input_dim comes from the tokenizer so the embedding table always matches the
#   vocabulary cap used during preprocessing instead of repeating a magic number.
# - input_length is not passed: it is deprecated in Keras 3 and the sequence
#   length is inferred from the data given to fit().
# NOTE(review): the inputs are post-padded and index 0 is not masked, so the LSTM
# also reads the padding steps; consider Embedding(..., mask_zero=True) — left
# unchanged here because it alters the trained model.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=32),  # Smaller embedding dimension for simplicity
    LSTM(32),  # Fewer units in the LSTM layer
    Dense(1, activation='sigmoid'),  # Binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Stop training after 3 epochs with no improvement
    restore_best_weights=True  # Restore the weights of the best epoch
)

# Train the model with early stopping
history = model.fit(
    X_train,
    y_train,
    epochs=20,  # Upper bound only; early stopping will terminate earlier if needed
    batch_size=32,
    validation_split=0.2,  # Validation data is carved out of the training set, not the test set
    callbacks=[early_stopping]  # Add the early stopping callback
)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Predict if a new review is fake or real
def predict_review(review, tokenizer, max_len=100, classifier=None):
    """Classify one raw review string as "Fake Review" or "Real Review".

    The review goes through the same steps as the training text: stop-word
    removal, Porter stemming, tokenisation with the fitted ``tokenizer`` and
    post-padding to ``max_len``.

    Args:
        review: Raw review text.
        tokenizer: Tokenizer fitted on the training text.
        max_len: Sequence length the model was trained with.
        classifier: Object with a Keras-style ``predict`` method. Defaults to
            the notebook-level ``model`` so existing calls keep working.

    Returns:
        "Fake Review" when the predicted score is above 0.5, otherwise
        "Real Review".
        NOTE(review): this assumes label 1 marks fake reviews in the training
        data — confirm against the dataset's label definition.
    """
    if classifier is None:
        # Backward-compatible fallback to the model trained earlier in the notebook.
        classifier = model

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    kept_words = [
        stemmer.stem(word)
        for word in review.split()
        if word.lower() not in stop_words
    ]
    preprocessed_review = ' '.join(kept_words)

    review_seq = tokenizer.texts_to_sequences([preprocessed_review])  # Tokenize the review
    review_pad = pad_sequences(review_seq, maxlen=max_len, padding='post')  # Pad the sequence

    prediction = classifier.predict(review_pad)
    # Only one review was passed in, so take the single score as a plain float
    # rather than relying on the truthiness of a one-element array.
    score = float(np.ravel(prediction)[0])
    return "Fake Review" if score > 0.5 else "Real Review"

# Smoke-test the trained classifier on one hand-written review.
example_review = "This product is amazing and works perfectly!"
print(
    "Prediction:",
    predict_review(review=example_review, tokenizer=tokenizer),
)

Epoch 1/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.5604 - loss: 0.6747 - val_accuracy: 0.6504 - val_loss: 0.6293
Epoch 2/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.6714 - loss: 0.6215 - val_accuracy: 0.5629 - val_loss: 0.6670
Epoch 3/20
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.7270 - loss: 0.5366 - val_accuracy: 0.8794 - val_loss: 0.2870
Epoch 4/20
[1m  1/811[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 21ms/step - accuracy: 0.9375 - loss: 0.1912

KeyboardInterrupt: 