In [None]:
# 1. Data Preprocessing:
# Collect a dataset of consumer reviews labeled as fake or truthful.
# Preprocess the text data by removing stop words, special symbols, and lowercasing the text.
# Extract emotion features from the reviews using lexicon-based methods.
# Tokenize the text into unigrams, bigrams, and trigrams.
# Calculate tf.idf weights for the n-grams.
# Pre-train word embeddings using the Skip-Gram model on a large corpus of text data (e.g., Amazon reviews).

In [28]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode


In [29]:
# The reviews are labelled as fake or real (in the dataset they’re mapped fake (label1) or real (label2)).
# https://medium.com/@lievgarcia/deception-on-amazon-c1e30d977cfd

df = pd.read_csv("../Fake-Amazon-Review-Detection/amazon_reviews.txt", sep = "\t")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   DOC_ID             21000 non-null  int64 
 1   LABEL              21000 non-null  object
 2   RATING             21000 non-null  int64 
 3   VERIFIED_PURCHASE  21000 non-null  object
 4   PRODUCT_CATEGORY   21000 non-null  object
 5   PRODUCT_ID         21000 non-null  object
 6   PRODUCT_TITLE      21000 non-null  object
 7   REVIEW_TITLE       21000 non-null  object
 8   REVIEW_TEXT        21000 non-null  object
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


In [30]:
#mapping binary output label to numeric values 0 (fake review) and 1 (real review)
df['target'] = pd.factorize(df['LABEL'])[0]

df.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,target
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",0
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,0
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,0
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,0
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,0


In [31]:
num_fake = len(df[df['target'] == 0])
num_real = len(df[df['target'] == 1])

print(num_real, num_fake)

10500 10500


As seen above, the dataset is evenly balanced across both classes.

# Review Text Preprocessing

In [32]:
tokenizer = RegexpTokenizer(r'\w+')

# converting to lowercase and tokenizing
review_tokens = [tokenizer.tokenize(review.lower()) for review in df['REVIEW_TEXT']]

#removing special characters
review_tokens = [[unidecode(token) for token in review if token.isalnum()] for review in review_tokens]
review_tokens[2]

['i',
 'purchased',
 'this',
 'swing',
 'for',
 'my',
 'baby',
 'she',
 'is',
 '6',
 'months',
 'now',
 'and',
 'has',
 'pretty',
 'much',
 'out',
 'grown',
 'it',
 'it',
 'is',
 'very',
 'loud',
 'and',
 'doesn',
 't',
 'swing',
 'very',
 'well',
 'it',
 'is',
 'beautiful',
 'though',
 'i',
 'love',
 'the',
 'colors',
 'and',
 'it',
 'has',
 'a',
 'lot',
 'of',
 'settings',
 'but',
 'i',
 'don',
 't',
 'think',
 'it',
 'was',
 'worth',
 'the',
 'money']

# Emotion Representation

### Polarity-Based Emotion Representation using OpinionFinder 2.0

In [None]:
# DO NOT RUN THIS

# mounting reviews into individual files for OpinionFinder, creating 10 batches

# parent_dir = "database/docs/amazon_reviews/"
# f_count = 1
# count = 0
# doclist = "amazon_reviews_" + str(f_count) + ".doclist"
# f2 = open(doclist, "a")

# for i in range(len(review_tokens)):
#     fname = parent_dir + "rev_id_" + str(i + 1)
#     fp = open(fname, 'w')
#     review_text = ' '.join(review_tokens[i])
#     fp.write(review_text)
#     fp.close()
    
#     if count == 2100:
#         f2.close()
#         count = 0
#         f_count += 1
        
#         doclist = "amazon_reviews_" + str(f_count) + ".doclist"
#         f2 = open(doclist, "a")
        
#     f2.write(fname+"\n")         
#     count += 1
    
# f2.close()





# commands to execute OpinionFinder2.0

# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_1.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_2.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_3.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_4.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_5.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_6.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_7.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_8.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_9.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_10.doclist -d





# extracting polarity labels from output file (exp_polarity.txt) and adding to dataset

# opinion_finder_pos_count = []
# opinion_finder_neg_count = []

# parent_dir = "database/docs/amazon_reviews/rev_id_"
# suffix = "_auto_anns/exp_polarity.txt"

# for i in range(len(review_tokens)):
#     fpath = parent_dir + str(i + 1) + suffix
#     f = open(fpath, "r")
#     content = f.read()
#     f.close()
    
#     opinion_finder_pos_count.append(content.count("positive"))
#     opinion_finder_neg_count.append(content.count("negative"))
    

# df['OPI_FIN_POS'] = opinion_finder_pos_count
# df['OPI_FIN_NEG'] = opinion_finder_neg_count
# df.to_csv("amazon_reviews_with_polarity.txt", sep = "\t", index = False)

In [None]:
# reference for sentiment analysis with SentiWordNet
# don't  run

# import nltk
# nltk.download('sentiwordnet')
# nltk.download('wordnet')
# from nltk.corpus import wordnet as wn
# from nltk.corpus import sentiwordnet as swn
# list(swn.senti_synsets('slow'))

# sentence='It was a really good day'
# from nltk.tag import pos_tag
# token = nltk.word_tokenize(sentence)
# after_tagging = nltk.pos_tag(token)
# print (token)
# print (after_tagging)
# def penn_to_wn(tag):
#     """
#     Convert between the PennTreebank tags to simple Wordnet tags
#     """
#     if tag.startswith('J'):
#         return wn.ADJ
#     elif tag.startswith('N'):
#         return wn.NOUN
#     elif tag.startswith('R'):
#         return wn.ADV
#     elif tag.startswith('V'):
#         return wn.VERB
#     return None
# sentiment = 0.0
# tokens_count = 0
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# for word, tag in after_tagging:
#             wn_tag = penn_to_wn(tag)
#             if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
#                 continue

#             lemma = lemmatizer.lemmatize(word, pos=wn_tag)
#             if not lemma:
#                 continue

#             synsets = wn.synsets(lemma, pos=wn_tag)
#             if not synsets:
#                 continue

#             # Take the first sense, the most common
#             synset = synsets[0]
#             swn_synset = swn.senti_synset(synset.name())
#             print(swn_synset)

#             sentiment += swn_synset.pos_score() - swn_synset.neg_score()
#             tokens_count += 1

# print (sentiment)

# Stop Word Removal

In [33]:
# removing stop words
# import nltk
# nltk.download('stopwords')

stop_words = set(stopwords.words("english"))
content_review_tokens = [[token for token in review if token not in stop_words and token.isalnum()] for review in review_tokens]

print("Before stop word removal: ", review_tokens[6914])
print()
print("After stop word removal: ", content_review_tokens[6914])

Before stop word removal:  ['love', 'the', 'bottle', 'very', 'much', 'br', 'br', 'iVm', 'a', 'tea', 'lover', 'when', 'i', 'saw', 'this', 'bottle', 'i', 'knew', 'that', 'it', 'was', 'what', 'i', 'wanted', 'the', 'shape', 'is', 'fantastic', 'feels', 'nice', 'in', 'your', 'hand', 'perfect', 'size', 'to', 'have', 'in', 'my', 'car', 'i', 'took', 'it', 'all', 'around', 'so', 'i', 'can', 'enjoy', 'my', 'tea', 'everywhere', 'love', 'it', 'very', 'much', 'the', 'one', 'with', 'infuser', 'also', 'looks', 'good']

After stop word removal:  ['love', 'bottle', 'much', 'br', 'br', 'iVm', 'tea', 'lover', 'saw', 'bottle', 'knew', 'wanted', 'shape', 'fantastic', 'feels', 'nice', 'hand', 'perfect', 'size', 'car', 'took', 'around', 'enjoy', 'tea', 'everywhere', 'love', 'much', 'one', 'infuser', 'also', 'looks', 'good']


In [None]:
# from nltk.stem import SnowballStemmer     #porter 2 algorithm
# snowball = SnowballStemmer(language = "english")

# content_review_tokens = [[snowball.stem(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

In [None]:
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# content_review_tokens = [[lemmatizer.lemmatize(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

# N-Gram Modelling

In [None]:
from nltk import ngrams

review_text_unigrams = [list(ngrams(tokens, 1)) for tokens in content_review_tokens]
review_text_bigrams = [list(ngrams(tokens, 2)) for tokens in content_review_tokens]
review_text_trigrams = [list(ngrams(tokens, 3)) for tokens in content_review_tokens]

print(review_text_unigrams[374])
print(review_text_bigrams[374])
print(review_text_trigrams[374])

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Train the Skip-Gram model
vector_size = 100  # Dimensionality of word embeddings
window_size = 5    # Maximum distance between the current and predicted word within a sentence
min_count = 1      # Minimum frequency count of words to consider when training the model
workers = 4        # Number of threads to use while training

# Train the Skip-Gram model
skipgram_model = Word2Vec(sentences=content_review_tokens,
                          vector_size=vector_size,
                          window=window_size,
                          min_count=min_count,
                          workers=workers,
                          sg=1)  # sg=1 specifies Skip-Gram model

# Save the trained word embeddings
skipgram_model.save('skipgram_word_embeddings.model')

In [None]:
# 2. Model Architecture Design:

# DFFNN Model:
# Design a multilayer perceptron neural network with two hidden layers.
# Determine the input layer size based on the features extracted in data preprocessing (e.g., 2000 n-grams, 30 emotion features, and word embeddings).
# Define the number of neurons in each hidden layer based on a grid search procedure.
# Choose rectified linear units as the activation function for the hidden layers.
# Implement dropout regularization to prevent overfitting.
# Utilize softmax activation in the output layer for binary classification (fake/truthful).


# DFFNN Model

In [None]:
#DFFNN Model:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming you have the features extracted in data preprocessing stored in X
# and the binary labels in y (0 for fake, 1 for truthful)

# Input layer size based on features
input_layer_size = len(df.columns)  # Adjust based on the actual number of features

# Hyperparameters
hidden_layer1_neurons = 128
hidden_layer2_neurons = 64
dropout_rate = 0.5  # Adjust as needed

# Define the DFFNN model
model = Sequential()

# Input layer
model.add(Dense(hidden_layer1_neurons, input_dim=input_layer_size, activation='relu'))
model.add(Dropout(dropout_rate))

# First hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Second hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Output layer (binary classification with softmax activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


# CNN Model

In [None]:

# CNN Model:
# Design a convolutional neural network architecture.
# Convert each sentence into a k-dimensional word representation using pre-trained word embeddings.
# Concatenate word representations to obtain fixed-size input.
# Define the number of filters in the convolutional layer and the size of the filter.
# Utilize rectified linear units as the activation function for the convolutional layer.
# Implement max pooling to downsample the feature maps.
# Use softmax activation in the output layer for binary classification.

In [None]:
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from collections import Counter
import pandas as pd

# Hyperparameters
embedding_dim = 100  # Dimensionality of word embeddings
max_len = 100  # Maximum sequence length (number of words in a review)
num_filters = 128  # Number of filters in the convolutional layer
filter_size = 5  # Size of the filter window


# Define the CNN model
model = Sequential()
vocab_size=1000

# Embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_shape=(max_len,)))

# Convolutional layer
model.add(Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu'))

# Max pooling layer
model.add(MaxPooling1D(pool_size=2))

# Global max pooling layer
model.add(GlobalMaxPooling1D())

# Dense layer
model.add(Dense(units=64, activation='relu'))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# 3. Model Training:
# Split the dataset into training, validation, and testing sets.
# Use mini-batch gradient descent for training the DFFNN model.
# Apply stochastic gradient descent for training the CNN model.
# Tune hyperparameters such as learning rate, dropout rate, and number of iterations using validation set performance.
# Monitor training progress and adjust hyperparameters as needed to prevent overfitting.

In [None]:
#I'm trying idk

In [None]:
# 4. Evaluation:
# Evaluate the trained models on the test set to measure their performance.
# Compute metrics such as accuracy, precision, recall, and F1-score to assess the models' effectiveness in detecting fake reviews.
# Compare the performance of the DFFNN and CNN models with baseline methods and state-of-the-art approaches mentioned in the paper.


In [None]:
# 5. Optimization and Fine-tuning:
# Experiment with different model architectures, hyperparameters, and training strategies to improve performance.
# Consider techniques such as ensemble learning or transfer learning to further enhance model accuracy.
# Fine-tune the models based on insights gained from initial evaluations and analyses.
# By following these steps, you can implement the proposed DFFNN and CNN models for fake review detection based on the ideas presented in the paper. Remember to document your process thoroughly and validate your results to ensure the reliability and reproducibility of your findings.