In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

In [2]:
reviews_df = pd.read_csv('fake_reviews_dataset.csv')

# Building a Neural Network

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Preprocessing the text data
texts = reviews_df['text'].values
labels = reviews_df['label'].values

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Tokenize the text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

#Split into validation and train
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10




[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 42ms/step - accuracy: 0.5631 - loss: 0.6624 - val_accuracy: 0.5474 - val_loss: 0.6781
Epoch 2/10
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 44ms/step - accuracy: 0.5493 - loss: 0.6769 - val_accuracy: 0.5557 - val_loss: 0.6776
Epoch 3/10
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 59ms/step - accuracy: 0.5794 - loss: 0.6481 - val_accuracy: 0.8571 - val_loss: 0.3430
Epoch 4/10
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 55ms/step - accuracy: 0.8726 - loss: 0.3165 - val_accuracy: 0.9205 - val_loss: 0.2064
Epoch 5/10
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 56ms/step - accuracy: 0.9369 - loss: 0.1677 - val_accuracy: 0.9247 - val_loss: 0.1892
Epoch 6/10
[1m760/760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 55ms/step - accuracy: 0.9518 - loss: 0.1327 - val_accuracy: 0.9335 - val_loss: 0.1942
Epoch 7/10
[1m760/760[0m 

In [8]:
#make predictions on test set
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step


In [10]:
from sklearn.metrics import classification_report, confusion_matrix
# Print classification report
print(classification_report(y_test, y_pred))
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93      4031
           1       0.94      0.93      0.93      4075

    accuracy                           0.93      8106
   macro avg       0.93      0.93      0.93      8106
weighted avg       0.93      0.93      0.93      8106

[[3775  256]
 [ 293 3782]]


# Trying a pre-built Tokenizer

In [27]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors='tf')

X_train, X_test, y_train, y_test = train_test_split(encodings['input_ids'], labels, test_size=0.2, random_state=42)
X_train_attention, X_test_attention = train_test_split(encodings['attention_mask'], test_size=0.2, random_state=42)

ImportError: Unable to convert output to TensorFlow tensors format, TensorFlow is not installed.

In [13]:
texts

array(['Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty',
       "love it, a great upgrade from the original.  I've had mine for a couple of years",
       'This pillow saved my back. I love the look and feel of this pillow.',
       ...,
       'The running shoes provide excellent support. An automated system highly recommends them for runners.',
       'The pasta cooks perfectly every time. According to an AI review, it is a great choice for quick meals.',
       "The car freshener has a strong scent. As per a virtual assistant, it's a bit overpowering."],
      dtype=object)