In [None]:
# 1. Data Preprocessing:
# Collect a dataset of consumer reviews labeled as fake or truthful.
# Preprocess the text data by removing stop words, special symbols, and lowercasing the text.
# Extract emotion features from the reviews using lexicon-based methods.
# Tokenize the text into unigrams, bigrams, and trigrams.
# Calculate tf.idf weights for the n-grams.
# Pre-train word embeddings using the Skip-Gram model on a large corpus of text data (e.g., Amazon reviews).

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode
from nltk import ngrams
import numpy as np
import torch

In [2]:
# The reviews are labelled as fake or real (in the dataset they’re mapped fake (label1) or real (label2)).
# https://medium.com/@lievgarcia/deception-on-amazon-c1e30d977cfd

df = pd.read_csv("amazon_reviews_features.txt", sep = "\t")   
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   DOC_ID                21000 non-null  int64  
 1   RATING                21000 non-null  int64  
 2   VERIFIED_PURCHASE     21000 non-null  int64  
 3   PRODUCT_CATEGORY      21000 non-null  object 
 4   PRODUCT_ID            21000 non-null  object 
 5   PRODUCT_TITLE         21000 non-null  object 
 6   REVIEW_TITLE          21000 non-null  object 
 7   REVIEW_TEXT           21000 non-null  object 
 8   OPI_FIN_POS           21000 non-null  int64  
 9   OPI_FIN_NEG           21000 non-null  int64  
 10  BL_POS                21000 non-null  int64  
 11  BL_NEG                21000 non-null  int64  
 12  AFINN_POS             21000 non-null  float64
 13  AFINN_NEG             21000 non-null  float64
 14  S140_POS              21000 non-null  float64
 15  S140_NEG           

In [3]:
#mapping binary output label to numeric values 0 (fake review) and 1 (real review)

# df['TARGET'] = pd.factorize(df['LABEL'])[0]
# df['VERIFIED_PURCHASE'] = pd.factorize(df['VERIFIED_PURCHASE'])[0]   #Y -> 1, N -> 0

# df.drop(["target", "LABEL"], inplace = True, axis = 1)
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

df.head(30)

Unnamed: 0,DOC_ID,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,OPI_FIN_POS,OPI_FIN_NEG,...,NRC_TRUST,NRC_EXP_ANGER,NRC_EXP_ANTICIPATION,NRC_EXP_DISGUST,NRC_EXP_FEAR,NRC_EXP_JOY,NRC_EXP_SADNESS,NRC_EXP_SURPRISE,NRC_EXP_TRUST,TARGET
0,1,4,0,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",1,0,...,1,0.278005,0.751573,0.747909,1.744297,0.865995,0.314947,0.64172,0.895334,0
1,2,4,1,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,0,0,...,0,1.352953,1.342698,2.456664,4.800844,1.938679,1.385059,0.863584,3.041116,0
2,3,3,0,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,3,9,...,3,0.549796,0.950917,1.655262,3.158048,1.485825,0.834764,0.803053,1.717241,0
3,4,4,0,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,0,0,...,1,0.425796,0.865427,1.2655,2.410741,0.934545,0.731801,0.654469,1.207135,0
4,5,4,0,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,1,0,...,0,0.803461,1.77763,2.727954,4.902292,2.320584,1.053444,1.218491,2.612396,0
5,6,3,0,Health & Personal Care,B00686HNUK,Tobacco Pipe Stand - Fold-away Portable - Ligh...,not sure,I'm not sure what this is supposed to be but I...,1,1,...,1,0.454425,0.956854,1.486316,3.011503,1.378531,0.767059,0.676561,1.370321,0
6,7,4,0,Toys,B00NUG865W,ESPN 2-Piece Table Tennis,PING PONG TABLE GREAT FOR YOUTHS AND FAMILY,Pleased with ping pong table. 11 year old and ...,1,0,...,2,0.534051,0.973214,1.283938,2.636138,2.005626,0.822033,0.778795,1.413699,0
7,8,4,1,Beauty,B00QUL8VX6,Abundant Health 25% Vitamin C Serum with Vitam...,Great vitamin C serum,Great vitamin C serum... I really like the oil...,1,3,...,2,0.620149,0.844568,1.557884,2.430153,1.475274,0.690926,0.615871,1.065018,0
8,9,4,0,Health & Personal Care,B004YHKVCM,PODS Spring Meadow HE Turbo Laundry Detergent ...,wonderful detergent.,I've used tide pods laundry detergent for many...,1,0,...,0,0.18843,0.760328,0.602009,0.995714,1.831817,0.328094,0.450988,1.043719,0
9,10,1,0,Health & Personal Care,B00H4IBD0M,"Sheer TEST, Best Testosterone Booster Suppleme...",WARNING: do not waste your money on this,Everybody wants to fall for their promises. Bu...,2,1,...,4,0.712392,1.183724,1.634331,2.929736,1.864249,0.834775,0.854013,2.268477,0


In [4]:
num_fake = len(df[df['TARGET'] == 0])
num_real = len(df[df['TARGET'] == 1])

print(num_real, num_fake)

10500 10500


As seen above, the dataset is evenly balanced across both classes.

# Review Text Preprocessing

In [7]:
tokenizer = RegexpTokenizer(r'\w+')

# converting to lowercase and tokenizing
review_tokens = [tokenizer.tokenize(review.lower()) for review in df['REVIEW_TEXT']]

#removing special characters
review_tokens = [[unidecode(token) for token in review if token.isalnum()] for review in review_tokens]
" ".join(review_tokens[0])

'when least you think so this product will save the day just keep it around just in case you need it for something'

# Emotion Representation

### Polarity: OpinionFinder 2.0

In [9]:
# DO NOT RUN THIS

# OpinionFinder2.0: Tags words with polarity (pos/neg)
# Used t o develop two features: OPI_FIN_POS and OPI_FIN_NEG
# defined as the number of words that corresponding to each polarity respectively, per review

# parent_dir = "database/docs/amazon_reviews/"
# f_count = 1
# count = 0
# doclist = "amazon_reviews_" + str(f_count) + ".doclist"
# f2 = open(doclist, "a")

# for i in range(len(review_tokens)):
#     fname = parent_dir + "rev_id_" + str(i + 1)
#     fp = open(fname, 'w')
#     review_text = ' '.join(review_tokens[i])
#     fp.write(review_text)
#     fp.close()
    
#     if count == 2100:
#         f2.close()
#         count = 0
#         f_count += 1
        
#         doclist = "amazon_reviews_" + str(f_count) + ".doclist"
#         f2 = open(doclist, "a")
        
#     f2.write(fname+"\n")         
#     count += 1
    
# f2.close()





# commands to execute OpinionFinder2.0

# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_1.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_2.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_3.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_4.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_5.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_6.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_7.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_8.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_9.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_10.doclist -d





# extracting polarity labels from output file (exp_polarity.txt) and adding to dataset

# opinion_finder_pos_count = []
# opinion_finder_neg_count = []

# parent_dir = "database/docs/amazon_reviews/rev_id_"
# suffix = "_auto_anns/exp_polarity.txt"

# for i in range(len(review_tokens)):
#     fpath = parent_dir + str(i + 1) + suffix
#     f = open(fpath, "r")
#     content = f.read()
#     f.close()
    
#     opinion_finder_pos_count.append(content.count("positive"))
#     opinion_finder_neg_count.append(content.count("negative"))
    

# df['OPI_FIN_POS'] = opinion_finder_pos_count
# df['OPI_FIN_NEG'] = opinion_finder_neg_count
# df.to_csv("amazon_reviews_with_polarity.txt", sep = "\t", index = False)

### Polarity: Bing Liu's Lexicon

In [10]:
# DO NOT RUN THIS

# Bing Liu et al: Opinion Lexicon for positive and negative polarity tagging of words
# Used to develop 2 features: BL_POS and BL_NEG
# defined as the number of words that corresponding to each polarity respectively, per review

# dir_name = "../bing-liu-opinion-lexicon-English/"
# pos_file = dir_name + "positive-words.txt"
# neg_file = dir_name + "negative-words.txt"

# f1 = open(pos_file, "r")
# f2 = open(neg_file, "r")

# pos_lexicon = f1.read()
# neg_lexicon = f2.read()

# f1.close()
# f2.close()

# bl_pos = []
# bl_neg = []

# for review in review_tokens:
#     count_pos = 0
#     count_neg = 0
    
#     for token in review:
#         if token in pos_lexicon:
#             count_pos += 1
#         if token in neg_lexicon:
#             count_neg += 1
            
#     bl_pos.append(count_pos)
#     bl_neg.append(count_neg)
    
# print(bl_pos[:15])
# print(bl_neg[:15])

# df['BL_POS'] = bl_pos
# df['BL_NEG'] = bl_neg

# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: AFINN

In [11]:
# #!pip install afinn

# # AFINN: Sentiment lexicon for measuring the positive and negative score of a review
# # Used to develop 2 features: BL_POS and BL_NEG

# from afinn import Afinn
# afn = Afinn()

# afinn_pos = []
# afinn_neg = []

# for review in review_tokens:
#     review = " ".join(review)
#     s = afn.score(review)
    
#     if s > 0:
#         afinn_pos.append(s)
#         afinn_neg.append(0.0)
        
#     else:
#         afinn_pos.append(0.0)
#         afinn_neg.append(-1 * s)
        
# print(afinn_pos[:15])
# print(afinn_neg[:15])

In [12]:
# df['AFINN_POS'] = afinn_pos
# df['AFINN_NEG'] = afinn_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: Sentiment140

In [13]:
# Sentiment140: Lexicon for measuring the positive and negative score of a unigram/bigram
# Used to develop 2 features: S140_POS and S140_NEG

# import csv

# dir_name = "../Sentiment140-Lexicon/"
# f1 = "unigrams-pmilexicon.txt"
# f2 = "bigrams-pmilexicon.txt"


# uni_lex = pd.read_csv(dir_name + f1, sep = "\t")
# bi_lex = pd.read_csv(dir_name + f2, sep = "\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# uni_dict = dict(zip(uni_lex["term"], uni_lex["score"]))
# bi_dict = dict(zip(bi_lex["term"], bi_lex["score"]))

# s140_pos = []
# s140_neg = []

# for review in review_tokens:
#     score = 0
#     uni_score = 0
#     uni_c = 0
        
#     for unigram in review:
#         if unigram in uni_dict:
#             uni_score += uni_dict[unigram]
#             uni_c += 1
    
#     if uni_c > 0:
#         uni_score /= uni_c
    
    
#     bi_score = 0
#     bi_c = 0
    
#     bigrams = list(ngrams(review, 2))
#     for bigram in bigrams:
#         text = " ".join(bigram)
#         if text in bi_dict:
#             bi_score += bi_dict[text]
#             bi_c += 1
    
#     if bi_c > 0:
#         bi_score /= bi_c
    
    
#     score = (bi_score + uni_score) / (int(uni_c > 0) + int(bi_c > 0))
    
#     if score > 0:
#         s140_pos.append(round(score, 5))
#         s140_neg.append(0.0)
        
#     else:
#         s140_pos.append(0.0)
#         s140_neg.append(round(-1 * score, 5))
    
    
# print(s140_pos[:15])
# print(s140_neg[:15])

# print(s140_pos.count(0.0), s140_neg.count(0.0), len(s140_pos))

In [14]:
# df['S140_POS'] = s140_pos
# df['S140_NEG'] = s140_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: SentiWordNet3.0

In [15]:
# SentiWordNet3.0: Lexicon for measuring the positive and negative score of synsets in WordNet 3.0
# Used to develop 2 features: SWN_POS and SWN_NEG
#(for this one, try to make the scores exclusive like the others)


# # import nltk
# # nltk.download('sentiwordnet')

# from nltk.corpus import sentiwordnet as swn

# swn_pos = []
# swn_neg = []

# for review in review_tokens:
#     pos_score = 0
#     neg_score = 0
#     count = 0
    
#     for term in review:
#         res = swn.senti_synsets(term)
        
#         try:
#             res0 = list(res)[0]
#             pos_score += res0.pos_score()
#             neg_score += res0.neg_score()
#             count += 1
            
#         except:
#             pass
    
#     if count > 0:
#         pos_score = pos_score / count
#         neg_score = neg_score / count
        
#     swn_pos.append(pos_score)
#     swn_neg.append(neg_score)

# print(swn_pos[:15])
# print(swn_neg[:15])

In [16]:
# df['SWN_POS'] = swn_pos
# df['SWN_NEG'] = swn_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: NRC Hashtag

In [17]:
# NRC Hashtag: Lexicon for measuring the positive and negative score of a unigram/bigram based on twitter hashtags
# Used to develop 2 features: NRC_HASH_POS and NRC_HASH_NEG


# import csv

# dir_name = "../NRC-Hashtag-Sentiment-Lexicon-v0.1/"
# f1 = "unigrams-pmilexicon.txt"
# f2 = "bigrams-pmilexicon.txt"

# uni_lex = pd.read_csv(dir_name + f1, sep = "\t")
# bi_lex = pd.read_csv(dir_name + f2, sep = "\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# uni_dict = dict(zip(uni_lex["term"], uni_lex["score"]))
# bi_dict = dict(zip(bi_lex["term"], bi_lex["score"]))

# nrc_hash_pos = []
# nrc_hash_neg = []

# for review in review_tokens:
#     score = 0
#     uni_score = 0
#     uni_c = 0
        
#     for unigram in review:
#         if unigram in uni_dict:
#             uni_score += uni_dict[unigram]
#             uni_c += 1
    
#     if uni_c > 0:
#         uni_score /= uni_c
    
    
#     bi_score = 0
#     bi_c = 0
    
#     bigrams = list(ngrams(review, 2))
#     for bigram in bigrams:
#         text = " ".join(bigram)
#         if text in bi_dict:
#             bi_score += bi_dict[text]
#             bi_c += 1
    
#     if bi_c > 0:
#         bi_score /= bi_c
    
    
#     score = (bi_score + uni_score) / (int(uni_c > 0) + int(bi_c > 0))
    
#     if score > 0:
#         nrc_hash_pos.append(round(score, 5))
#         nrc_hash_neg.append(0.0)
        
#     else:
#         nrc_hash_pos.append(0.0)
#         nrc_hash_neg.append(round(-1 * score, 5))
    
    
# print(nrc_hash_pos[:15])
# print(nrc_hash_neg[:15])

In [18]:
# df['NRC_HASH_POS'] = nrc_hash_pos
# df['NRC_HASH_NEG'] = nrc_hash_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: Emoticon - Based Lexicon

In [19]:
# # Emoticon Based Lexicon: Lexicon for measuring the positive and negative score of a word based on co-oocurence with emoticons
# # Used to develop 2 features: NRC_HASH_POS and NRC_HASH_NEG


# import csv

# dir_name = "../references_and_lexicons/ijcai-kbs_emoticon/"
# f1 = "STS_OR.csv"

# lex = pd.read_csv(dir_name + f1, sep = "\t")

# lex['word'] = lex['word'].str.split('-').str[1]

# my_dict = dict([(i,(a,b)) for i, a, b in zip(lex['word'], lex['positive'], lex['negative'])])

# emoticon_pos = []
# emoticon_neg = []

# for review in review_tokens:
#     pos_score = 0
#     neg_score = 0
#     count = 0
        
#     for word in review:
#         if word in my_dict:
#             pos_score += my_dict[word][0]
#             neg_score += my_dict[word][1]
#             count += 1    
      
#     if count > 0:
#         pos_score = round(pos_score / count, 5)
#         neg_score = round(neg_score / count, 5)
        
#     emoticon_pos.append(pos_score)
#     emoticon_neg.append(neg_score)

# print(emoticon_pos[:15])
# print(emoticon_neg[:15])

In [20]:
# df['EMOTICON_POS'] = emoticon_pos
# df['EMOTICON_NEG'] = emoticon_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### NRCLex: Emotion - Based Lexicon

In [36]:
# # !pip install NRCLex
# from nrclex import NRCLex

# my_dict = {"NRC_ANGER": [], 
#            "NRC_ANTICIPATION": [],
#            "NRC_DISGUST": [], 
#            "NRC_FEAR": [], 
#            "NRC_JOY": [], 
#            "NRC_SADNESS": [], 
#            "NRC_SURPRISE": [],
#            "NRC_TRUST": []}



# for review in review_tokens:
#     text = " ".join(review)
#     res = NRCLex(text)
#     emotion_scores = res.raw_emotion_scores
        
#     for key in my_dict:
#         emotion =  key[4:].lower()
#         if emotion in emotion_scores:
#             my_dict[key].append(emotion_scores[emotion])
#         else:
#             my_dict[key].append(0)
            
# df2 = pd.DataFrame.from_dict(my_dict)
# df = pd.concat([df, df2], axis=1)


# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### NRCLex Expanded: Emotion - Based Lexicon

In [46]:


# # ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

# my_dict = {"NRC_EXP_ANGER": [], 
#            "NRC_EXP_ANTICIPATION": [],
#            "NRC_EXP_DISGUST": [], 
#            "NRC_EXP_FEAR": [], 
#            "NRC_EXP_JOY": [], 
#            "NRC_EXP_SADNESS": [], 
#            "NRC_EXP_SURPRISE": [],
#            "NRC_EXP_TRUST": []}


# dir_name = "../references_and_lexicons/emo_lex_expanded/"
# f1 = "w2v-dp-CC-Lex.csv"

# lex = pd.read_csv(dir_name + f1, sep = "\t")

# lex = dict([(i,(a,b,c,d,e,f,g,h)) for i, a, b, c, d, e, f, g, h in zip(lex['word'], lex['anger'], lex['anticipation'], lex['disgust'], lex['fear'], lex['joy'], lex['sadness'], lex['surprise'], lex['trust'])])


# for review in review_tokens:
    
#     for key in my_dict:
#         my_dict[key].append(0)
    
#     for word in review:
        
#         if word in lex:
#             i = 0
#             for key in my_dict:
#                 my_dict[key][-1] += lex[word][i]
#                 i += 1
        
        
# for key in my_dict:
#     print(my_dict[key][:3])


[0.2780046027161847, 1.3529529085431935, 0.5497961440348219]
[0.7515731018557517, 1.3426978241954444, 0.9509167526417303]
[0.7479090741959875, 2.4566635439257225, 1.6552617575405246]
[1.7442972519267799, 4.800843659469453, 3.1580480013120606]
[0.8659946371977526, 1.9386790415546618, 1.4858246032393256]
[0.3149466767086976, 1.385059438063274, 0.8347636579443728]
[0.6417195280917016, 0.8635843619883152, 0.8030526443970994]
[0.8953342091683308, 3.041116462595752, 1.7172406314417379]


In [47]:
# df2 = pd.DataFrame.from_dict(my_dict)
# df = pd.concat([df, df2], axis=1)
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

# Stop Word Removal

In [8]:
# removing stop words
# import nltk
# nltk.download('stopwords')

stop_words = set(stopwords.words("english"))
content_review_tokens = [[token for token in review if token not in stop_words and token.isalnum()] for review in review_tokens]

print("Before stop word removal: ", review_tokens[6914])
print()
print("After stop word removal: ", content_review_tokens[6914])

Before stop word removal:  ['love', 'the', 'bottle', 'very', 'much', 'br', 'br', 'iVm', 'a', 'tea', 'lover', 'when', 'i', 'saw', 'this', 'bottle', 'i', 'knew', 'that', 'it', 'was', 'what', 'i', 'wanted', 'the', 'shape', 'is', 'fantastic', 'feels', 'nice', 'in', 'your', 'hand', 'perfect', 'size', 'to', 'have', 'in', 'my', 'car', 'i', 'took', 'it', 'all', 'around', 'so', 'i', 'can', 'enjoy', 'my', 'tea', 'everywhere', 'love', 'it', 'very', 'much', 'the', 'one', 'with', 'infuser', 'also', 'looks', 'good']

After stop word removal:  ['love', 'bottle', 'much', 'br', 'br', 'iVm', 'tea', 'lover', 'saw', 'bottle', 'knew', 'wanted', 'shape', 'fantastic', 'feels', 'nice', 'hand', 'perfect', 'size', 'car', 'took', 'around', 'enjoy', 'tea', 'everywhere', 'love', 'much', 'one', 'infuser', 'also', 'looks', 'good']


In [None]:
# STEMMING AND LEMMATIZATION IS NOT REQUIRED I THINK, BECAUSE THE PAPER DOES NOT MENTION IT

# from nltk.stem import SnowballStemmer     #porter 2 algorithm
# snowball = SnowballStemmer(language = "english")
# content_review_tokens = [[snowball.stem(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# content_review_tokens = [[lemmatizer.lemmatize(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

# N-Gram Modelling

In [9]:
review_text_unigrams = [list(ngrams(tokens, 1)) for tokens in content_review_tokens]
review_text_bigrams = [list(ngrams(tokens, 2)) for tokens in content_review_tokens]
review_text_trigrams = [list(ngrams(tokens, 3)) for tokens in content_review_tokens]

print(review_text_unigrams[374])
print(review_text_bigrams[374])
print(review_text_trigrams[374])

[('brushes',), ('soft',), ('soon',), ('first',), ('usage',), ('see',), ('bristles',), ('coming',), ('worth',), ('purchase',), ('falling',), ('generic',), ('product',)]
[('brushes', 'soft'), ('soft', 'soon'), ('soon', 'first'), ('first', 'usage'), ('usage', 'see'), ('see', 'bristles'), ('bristles', 'coming'), ('coming', 'worth'), ('worth', 'purchase'), ('purchase', 'falling'), ('falling', 'generic'), ('generic', 'product')]
[('brushes', 'soft', 'soon'), ('soft', 'soon', 'first'), ('soon', 'first', 'usage'), ('first', 'usage', 'see'), ('usage', 'see', 'bristles'), ('see', 'bristles', 'coming'), ('bristles', 'coming', 'worth'), ('coming', 'worth', 'purchase'), ('worth', 'purchase', 'falling'), ('purchase', 'falling', 'generic'), ('falling', 'generic', 'product')]


# Feature 1 of the dffnn model, top 2000 n grams according to their tfidf weights


In [118]:
#feature 1 of the dffnn model, top 2000 n grams according to their tfidf weights

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert tokenized reviews back to strings
cleaned_reviews = [" ".join(review) for review in content_review_tokens]

# TF-IDF Vectorization
# Step 1: Tokenize into unigrams, bigrams, and trigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_reviews)


# Step 2: Calculate tf-idf weights for each n-gram
tfidf_weights = tfidf_matrix.toarray()  # Convert the sparse matrix to dense array
feature_names = tfidf_vectorizer.get_feature_names_out()  # Get the feature names (n-grams)

# Now you can access the TF-IDF weights for each n-gram
# for i, review in enumerate(cleaned_reviews):
#     print(f"TF-IDF weights for review {i+1}:")
#     for j, feature in enumerate(feature_names):
#         weight = tfidf_weights[i][j]
#         if weight > 0:  # To print only non-zero weights
#             print(f"{feature}: {weight}")
#     print("\n")


# Step 3: Select top 2000 n-grams based on their tf-idf scores
total_tfidf_scores = np.sum(tfidf_weights, axis=0)  # Sum TF-IDF scores across all documents
top_indices = total_tfidf_scores.argsort()[-2000:][::-1]  # Get indices of top 2000 scores in descending order

# Get the top 2000 n-grams and their corresponding TF-IDF scores
top_ngrams = [feature_names[i] for i in top_indices]
top_tfidf_scores = [total_tfidf_scores[i] for i in top_indices]

# Print the top 2000 n-grams and their TF-IDF scores
for i, (ngram, score) in enumerate(zip(top_ngrams, top_tfidf_scores), start=1):
    print(f"{i}. {ngram}: {score}")


1. br: 683.5375098455908
2. great: 485.67385779350764
3. one: 405.3027978587271
4. good: 404.02605479916946
5. like: 379.3174868725596
6. product: 339.8697354279675
7. love: 328.7302710917243
8. really: 328.21438431499655
9. use: 319.57330483986317
10. well: 316.95741739486925
11. would: 311.9556270334749
12. quality: 282.7200529018454
13. br br: 264.85096957259157
14. get: 263.62894814916916
15. time: 258.1832341503777
16. price: 256.3732679193606
17. easy: 240.8427188899182
18. bought: 239.4415912565312
19. nice: 230.6021592528332
20. much: 226.78623763812826
21. little: 220.82492781588758
22. 34: 213.82089672639142
23. also: 212.64020052500717
24. works: 207.00762473970457
25. recommend: 204.99685522715518
26. got: 204.78086642991764
27. work: 200.53023332237706
28. buy: 198.64007161997776
29. perfect: 189.91220569507732
30. used: 186.71283986113454
31. even: 183.07758287625373
32. made: 181.12588697644114
33. watch: 173.43014255462052
34. looks: 170.94993808906568
35. better: 168.5

In [119]:
# Matrix creation for 21000 reviews for top 2000 ngrams

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def review_to_vector(review, top_ngrams, top_tfidf_scores):
    review_vector = np.zeros(len(top_ngrams))  # Initialize a vector for the review

    for ngram, weight in zip(top_ngrams, top_tfidf_scores):
        if ngram in review:  # Check if the n-gram is present in the review
            index = top_ngrams.index(ngram)  # Get the index of the n-gram in the top_ngrams list
            review_vector[index] = weight  # Assign the TF-IDF weight to the corresponding index in the review vector

    return review_vector

# Example: Convert each review to a vector representation
review_vectors = []
for review in cleaned_reviews:
    review_vector = review_to_vector(review, top_ngrams, top_tfidf_scores)
    review_vectors.append(review_vector)

# Convert the list of review vectors to a numpy array
X = np.array(review_vectors)

In [15]:
# Reducing Dimensionality from 2000 to 200 per review

#df['target'] = pd.factorize(df['LABEL'])[0]
y = df['TARGET']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import TruncatedSVD

# Step 1: Correlation-based Feature Selection (CBFS)
def correlation_based_feature_selection(X, y, k):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X, y)
    return X_selected

# Step 2: Latent Semantic Analysis (LSA)
def latent_semantic_analysis(X, n_components):
    lsa = TruncatedSVD(n_components=n_components, random_state=42)
    X_lsa = lsa.fit_transform(X)
    return X_lsa

# Feature selection and dimensionality reduction for ngrams
X_ngrams_selected = correlation_based_feature_selection(X, y, 500)
X_ngrams_lsa = latent_semantic_analysis(X_ngrams_selected, 200)



In [120]:
X_ngrams_lsa.shape
#print(type(X_ngrams_lsa))

(21000, 200)

In [121]:
print(X_ngrams_lsa[0])

[ 4.73868151e+01  1.02768647e+02  7.26610315e+01  3.03809716e+02
 -8.27915142e+00  4.66456015e+01  2.66001320e+01 -4.86280742e+01
  7.44254124e+00 -1.97831231e+01  1.55038016e+01  2.38028925e+01
  2.64646972e+00 -2.47231922e+01  1.52000920e+01  5.19867970e+01
 -4.51044493e+01 -1.09908445e+02  4.22393243e+01  1.38180121e+01
 -1.48242113e+01  3.87889349e+00 -1.54361337e+00 -2.17958403e+00
  8.38948383e+00 -2.32563276e+00 -1.37217838e+01  2.30804660e+00
 -2.08863990e+00 -5.34636382e+00  3.80086828e+00  3.32903230e-01
  2.12670472e+00 -7.02054175e-01 -1.23659241e+01  5.00477979e+00
  1.28588303e+00  5.14642744e+00 -1.62744446e+01  3.97170481e+01
 -1.18965627e+00  3.78013943e+01  9.05320202e+01  3.77076532e+01
 -1.16015049e+01  1.87516440e+00  4.68109218e+01  7.55332573e+01
  4.51990124e+00  2.00977422e+01 -3.00485867e+01 -9.00303398e+00
 -2.15325590e+01 -4.50268879e+00 -3.01781108e+00 -1.65283608e+01
 -1.35584343e+00 -9.03690852e+00  2.74770153e+00  3.69696636e+00
 -3.87429242e+00 -1.51951

In [14]:
# import pandas as pd
# from gensim.models import Word2Vec
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords

# # Train the Skip-Gram model
# vector_size = 100  # Dimensionality of word embeddings
# window_size = 5    # Maximum distance between the current and predicted word within a sentence
# min_count = 1      # Minimum frequency count of words to consider when training the model
# workers = 4        # Number of threads to use while training

# # Train the Skip-Gram model
# skipgram_model = Word2Vec(sentences=content_review_tokens,
#                           vector_size=vector_size,
#                           window=window_size,
#                           min_count=min_count,
#                           workers=workers,
#                           sg=1)  # sg=1 specifies Skip-Gram model

# # Save the trained word embeddings
# # skipgram_model.save('skipgram_word_embeddings.model')

# #skipgram_model.train([["hello", "world"]], total_examples=1, epochs=1)





In [123]:
# Assuming the Skip-Gram model is trained and saved as 'skipgram_word_embeddings.model'

from gensim.models import Word2Vec

# Assuming your preprocessed tokens are in 'content_review_tokens'
skipgram_model = Word2Vec(sentences=content_review_tokens,
                          vector_size=100,
                          window=5,
                          min_count=1,
                          sg=1,
                          hs=1)  # Set hs=1 for hierarchical softmax (optional)
#skipgram_model.save('skipgram_word_embeddings.model')

In [124]:
# Function to calculate average word embedding for a review
def average_word_embedding(review, skipgram_model):
  num_words = 0  # Count of valid words in the review

  review_vec = np.zeros(skipgram_model.vector_size)  # Zero vector to accumulate word embeddings

  for token in review:
    try:
      # Get the word embedding vector from the model
      word_vec = skipgram_model.wv[token]
      review_vec += word_vec  # Add the word vector to the accumulator
      num_words += 1
    except KeyError:
      # Skip words not found in the vocabulary (might be rare or out-of-vocabulary words)
      pass

# If no valid words were found, return zero vector
  if num_words == 0:
    return review_vec
  else:
    # Calculate the average word embedding
    return review_vec / num_words

# Example usage: Calculate average embedding for a sample review
sample_review = content_review_tokens[0]  # Assuming content_review_tokens contains tokenized reviews
average_embedding = average_word_embedding(sample_review, skipgram_model)
print(average_embedding)
print(len(average_embedding))

[-0.1791616   0.0524897  -0.05231478 -0.10162009 -0.04017087 -0.56161845
  0.15999307  0.2609156  -0.35660228 -0.13327965 -0.03976102 -0.32671579
  0.16690384  0.01003078  0.00340737 -0.05915942  0.01489905 -0.05257911
 -0.13975781 -0.35406293  0.25526112  0.10253918  0.18216395 -0.14401178
 -0.14487525  0.21098809  0.05421619  0.03644725  0.19357607 -0.05955293
  0.18420431 -0.00749496 -0.10461058  0.20404501 -0.12647992  0.32785753
  0.09219551 -0.27940511  0.01703775 -0.08704697  0.06907846 -0.09821813
 -0.01793839 -0.20797611  0.33020155  0.08121419 -0.05311894  0.01601348
  0.05914304 -0.12217449  0.21691705 -0.10393532 -0.15969188 -0.05581789
 -0.32458127  0.02629361  0.19610698  0.03211286  0.06557599 -0.07393019
  0.0892282  -0.14572393 -0.17350443  0.00917811 -0.30878576  0.23749294
  0.15808294  0.2100823  -0.32689123  0.07468973 -0.16534622 -0.19946373
  0.26375073 -0.21035649 -0.01392641  0.1810166   0.18161253  0.12565595
 -0.28371659 -0.0355084   0.01597047 -0.06202562 -0

In [125]:
avg_embedding_reviews = []
for i in range(0,len(content_review_tokens)):
  sample_review = content_review_tokens[i]
  average_embedding = average_word_embedding(sample_review, skipgram_model)
  avg_embedding_reviews.append(average_embedding)
print(len(avg_embedding_reviews))

21000


In [126]:
#df.columns

emotion_df = df[['VERIFIED_PURCHASE', 'OPI_FIN_POS', 'OPI_FIN_NEG', 'BL_POS', 'BL_NEG', 'AFINN_POS',
       'AFINN_NEG', 'S140_POS', 'S140_NEG', 'SWN_POS', 'SWN_NEG',
       'NRC_HASH_POS', 'NRC_HASH_NEG', 'EMOTICON_POS', 'EMOTICON_NEG',
       'NRC_ANGER', 'NRC_ANTICIPATION', 'NRC_DISGUST', 'NRC_FEAR', 'NRC_JOY',
       'NRC_SADNESS', 'NRC_SURPRISE', 'NRC_TRUST', 'NRC_EXP_ANGER',
       'NRC_EXP_ANTICIPATION', 'NRC_EXP_DISGUST', 'NRC_EXP_FEAR',
       'NRC_EXP_JOY', 'NRC_EXP_SADNESS', 'NRC_EXP_SURPRISE', 'NRC_EXP_TRUST']]

emotion_X = emotion_df.values.tolist()

In [127]:
X = []

for i in range(21000):
    X.append(list(X_ngrams_lsa[i]) + avg_embedding_reviews[i].tolist() + emotion_X[i])
    
print(len(X))
print(len(X[0]))

21000
331


In [128]:
X = np.array(X)
print(type(X))
print(X.shape)

<class 'numpy.ndarray'>
(21000, 331)


In [156]:
X_train = np.concatenate((X[:8400], X[10500:18900]))
X_test = np.concatenate((X[8400:10500], X[18900:]))
y_train = np.concatenate((df['TARGET'].values[:8400], df['TARGET'].values[10500:18900]))
y_test = np.concatenate((df['TARGET'].values[8400:10500], df['TARGET'].values[18900:]))

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(y_test[0])

(16800, 331) (4200, 331) (16800,) (4200,)
0


In [117]:
# unique_values, counts = np.unique(y_test, return_counts=True)
# print(unique_values, counts)

[0 1] [2100 2100]


In [129]:
# X_train = X[:16800]
# X_test = X[16800:]
# y_train = df['TARGET'].values[:16800]
# y_test = df['TARGET'].values[16800:]

# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# print(y_train[0])

(16800, 331) (4200, 331) (16800,) (4200,)
0


In [None]:
# 2. Model Architecture Design:

# DFFNN Model:
# Design a multilayer perceptron neural network with two hidden layers.
# Determine the input layer size based on the features extracted in data preprocessing (e.g., 2000 n-grams, 30 emotion features, and word embeddings).
# Define the number of neurons in each hidden layer based on a grid search procedure.
# Choose rectified linear units as the activation function for the hidden layers.
# Implement dropout regularization to prevent overfitting.
# Utilize softmax activation in the output layer for binary classification (fake/truthful).


# DFFNN Model

In [114]:
#DFFNN Model:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming you have the features extracted in data preprocessing stored in X
# and the binary labels in y (0 for fake, 1 for truthful)

# Input layer size based on features
input_layer_size = len(X[0])  # Adjust based on the actual number of features

# Hyperparameters
hidden_layer1_neurons = 100
hidden_layer2_neurons = 50
dropout_rate = 0.5  # Adjust as needed

# Define the DFFNN model
model = Sequential()

# Input layer
model.add(Dense(hidden_layer1_neurons, input_dim=input_layer_size, activation='relu'))
model.add(Dropout(0.2))

# First hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Second hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Output layer (binary classification with softmax activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.1), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 100)               33200     
                                                                 
 dropout_18 (Dropout)        (None, 100)               0         
                                                                 
 dense_25 (Dense)            (None, 50)                5050      
                                                                 
 dropout_19 (Dropout)        (None, 50)                0         
                                                                 
 dense_26 (Dense)            (None, 50)                2550      
                                                                 
 dropout_20 (Dropout)        (None, 50)                0         
                                                                 
 dense_27 (Dense)            (None, 1)                

In [157]:
model.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1c2850badc0>

In [158]:
y_pred = model.predict(X_test)
#y_pred = (int(y_pred>0.25)

for i in range(len(y_pred)):
    if y_pred[i] > 0.4:
        y_pred[i] = 1
    else:
        y_pred[i] = 0
        
        
print(y_pred)

[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]


In [159]:
# y_pred.shape

from sklearn.metrics import accuracy_score

print(y_test.tolist().count(1))

accuracy_score(y_test, y_pred)
# unique_values, counts = np.unique(y_pred, return_counts=True)

# print(X_est[:30])

2100


0.5

In [49]:
# X = torch.tensor(X)

#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# from tensorflow.python.client import device_lib 
# print(device_lib.list_local_devices())
cuda_id = torch.cuda.current_device()
print("Name of the current CUDA Device: ", torch.cuda.get_device_name(cuda_id))
print(cuda_id)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Name of the current CUDA Device:  NVIDIA GeForce MX450
0


In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NeuralNetwork().to(device)
print(model)

# CNN Model

In [None]:

# CNN Model:
# Design a convolutional neural network architecture.
# Convert each sentence into a k-dimensional word representation using pre-trained word embeddings.
# Concatenate word representations to obtain fixed-size input.
# Define the number of filters in the convolutional layer and the size of the filter.
# Utilize rectified linear units as the activation function for the convolutional layer.
# Implement max pooling to downsample the feature maps.
# Use softmax activation in the output layer for binary classification.

In [None]:
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from collections import Counter
import pandas as pd

# Hyperparameters
embedding_dim = 100  # Dimensionality of word embeddings
max_len = 100  # Maximum sequence length (number of words in a review)
num_filters = 128  # Number of filters in the convolutional layer
filter_size = 5  # Size of the filter window


# Define the CNN model
model = Sequential()
vocab_size=1000

# Embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_shape=(max_len,)))

# Convolutional layer
model.add(Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu'))

# Max pooling layer
model.add(MaxPooling1D(pool_size=2))

# Global max pooling layer
model.add(GlobalMaxPooling1D())

# Dense layer
model.add(Dense(units=64, activation='relu'))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# 3. Model Training:
# Split the dataset into training, validation, and testing sets.
# Use mini-batch gradient descent for training the DFFNN model.
# Apply stochastic gradient descent for training the CNN model.
# Tune hyperparameters such as learning rate, dropout rate, and number of iterations using validation set performance.
# Monitor training progress and adjust hyperparameters as needed to prevent overfitting.

In [None]:
#I'm trying idk

In [None]:
# 4. Evaluation:
# Evaluate the trained models on the test set to measure their performance.
# Compute metrics such as accuracy, precision, recall, and F1-score to assess the models' effectiveness in detecting fake reviews.
# Compare the performance of the DFFNN and CNN models with baseline methods and state-of-the-art approaches mentioned in the paper.


In [None]:
# 5. Optimization and Fine-tuning:
# Experiment with different model architectures, hyperparameters, and training strategies to improve performance.
# Consider techniques such as ensemble learning or transfer learning to further enhance model accuracy.
# Fine-tune the models based on insights gained from initial evaluations and analyses.
# By following these steps, you can implement the proposed DFFNN and CNN models for fake review detection based on the ideas presented in the paper. Remember to document your process thoroughly and validate your results to ensure the reliability and reproducibility of your findings.