In [None]:
# 1. Data Preprocessing:
# Collect a dataset of consumer reviews labeled as fake or truthful.
# Preprocess the text data by removing stop words, special symbols, and lowercasing the text.
# Extract emotion features from the reviews using lexicon-based methods.
# Tokenize the text into unigrams, bigrams, and trigrams.
# Calculate tf.idf weights for the n-grams.
# Pre-train word embeddings using the Skip-Gram model on a large corpus of text data (e.g., Amazon reviews).

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode
from nltk import ngrams


In [2]:
# The reviews are labelled as fake or real (in the dataset they’re mapped fake (label1) or real (label2)).
# https://medium.com/@lievgarcia/deception-on-amazon-c1e30d977cfd

df = pd.read_csv("amazon_reviews_features.txt", sep = "\t")   
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DOC_ID             21000 non-null  int64  
 1   LABEL              21000 non-null  object 
 2   RATING             21000 non-null  int64  
 3   VERIFIED_PURCHASE  21000 non-null  object 
 4   PRODUCT_CATEGORY   21000 non-null  object 
 5   PRODUCT_ID         21000 non-null  object 
 6   PRODUCT_TITLE      21000 non-null  object 
 7   REVIEW_TITLE       21000 non-null  object 
 8   REVIEW_TEXT        21000 non-null  object 
 9   target             21000 non-null  int64  
 10  OPI_FIN_POS        21000 non-null  int64  
 11  OPI_FIN_NEG        21000 non-null  int64  
 12  BL_POS             21000 non-null  int64  
 13  BL_NEG             21000 non-null  int64  
 14  AFINN_POS          21000 non-null  float64
 15  AFINN_NEG          21000 non-null  float64
 16  S140_POS           210

In [3]:
#mapping binary output label to numeric values 0 (fake review) and 1 (real review)
df['target'] = pd.factorize(df['LABEL'])[0]

df.head(30)

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,target,...,AFINN_POS,AFINN_NEG,S140_POS,S140_NEG,SWN_POS,SWN_NEG,NRC_HASH_POS,NRC_HASH_NEG,NRC_EMOTICON_POS,NRC_EMOTICON_NEG
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",0,...,2.0,0.0,0.21302,0.0,0.09375,0.007812,0.0,0.01281,0.21302,0.0
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,0,...,4.0,0.0,0.0,0.14293,0.036017,0.021186,0.05619,0.0,0.0,0.14293
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,0,...,9.0,0.0,0.0,0.06157,0.107558,0.017442,0.10604,0.0,0.0,0.06157
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,0,...,1.0,0.0,0.0,0.20638,0.03125,0.052083,0.0,0.06995,0.0,0.20638
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,0,...,5.0,0.0,0.0,0.20035,0.014205,0.014205,0.02408,0.0,0.0,0.20035
5,6,__label1__,3,N,Health & Personal Care,B00686HNUK,Tobacco Pipe Stand - Fold-away Portable - Ligh...,not sure,I'm not sure what this is supposed to be but I...,0,...,4.0,0.0,0.23168,0.0,0.027778,0.060185,0.0,0.1215,0.23168,0.0
6,7,__label1__,4,N,Toys,B00NUG865W,ESPN 2-Piece Table Tennis,PING PONG TABLE GREAT FOR YOUTHS AND FAMILY,Pleased with ping pong table. 11 year old and ...,0,...,5.0,0.0,0.0,0.11041,0.095238,0.059524,0.01396,0.0,0.0,0.11041
7,8,__label1__,4,Y,Beauty,B00QUL8VX6,Abundant Health 25% Vitamin C Serum with Vitam...,Great vitamin C serum,Great vitamin C serum... I really like the oil...,0,...,9.0,0.0,0.0,0.38425,0.0625,0.040179,0.14375,0.0,0.0,0.38425
8,9,__label1__,4,N,Health & Personal Care,B004YHKVCM,PODS Spring Meadow HE Turbo Laundry Detergent ...,wonderful detergent.,I've used tide pods laundry detergent for many...,0,...,7.0,0.0,0.32881,0.0,0.051136,0.045455,0.22687,0.0,0.32881,0.0
9,10,__label1__,1,N,Health & Personal Care,B00H4IBD0M,"Sheer TEST, Best Testosterone Booster Suppleme...",WARNING: do not waste your money on this,Everybody wants to fall for their promises. Bu...,0,...,0.0,-0.0,0.10118,0.0,0.026515,0.037879,0.0,0.30493,0.10118,0.0


In [4]:
num_fake = len(df[df['target'] == 0])
num_real = len(df[df['target'] == 1])

print(num_real, num_fake)

10500 10500


As seen above, the dataset is evenly balanced across both classes.

# Review Text Preprocessing

In [4]:
tokenizer = RegexpTokenizer(r'\w+')

# converting to lowercase and tokenizing
review_tokens = [tokenizer.tokenize(review.lower()) for review in df['REVIEW_TEXT']]

#removing special characters
review_tokens = [[unidecode(token) for token in review if token.isalnum()] for review in review_tokens]
" ".join(review_tokens[0])

'when least you think so this product will save the day just keep it around just in case you need it for something'

# Emotion Representation

### Polarity: OpinionFinder 2.0

In [20]:
# DO NOT RUN THIS

# OpinionFinder2.0: Tags words with polarity (pos/neg)
# Used t o develop two features: OPI_FIN_POS and OPI_FIN_NEG
# defined as the number of words that corresponding to each polarity respectively, per review

# parent_dir = "database/docs/amazon_reviews/"
# f_count = 1
# count = 0
# doclist = "amazon_reviews_" + str(f_count) + ".doclist"
# f2 = open(doclist, "a")

# for i in range(len(review_tokens)):
#     fname = parent_dir + "rev_id_" + str(i + 1)
#     fp = open(fname, 'w')
#     review_text = ' '.join(review_tokens[i])
#     fp.write(review_text)
#     fp.close()
    
#     if count == 2100:
#         f2.close()
#         count = 0
#         f_count += 1
        
#         doclist = "amazon_reviews_" + str(f_count) + ".doclist"
#         f2 = open(doclist, "a")
        
#     f2.write(fname+"\n")         
#     count += 1
    
# f2.close()





# commands to execute OpinionFinder2.0

# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_1.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_2.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_3.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_4.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_5.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_6.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_7.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_8.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_9.doclist -d
# !java -Xmx1g -classpath lib\weka.jar;lib\stanford-postagger.jar;opinionfinder.jar opin.main.RunOpinionFinder amazon_reviews_10.doclist -d





# extracting polarity labels from output file (exp_polarity.txt) and adding to dataset

# opinion_finder_pos_count = []
# opinion_finder_neg_count = []

# parent_dir = "database/docs/amazon_reviews/rev_id_"
# suffix = "_auto_anns/exp_polarity.txt"

# for i in range(len(review_tokens)):
#     fpath = parent_dir + str(i + 1) + suffix
#     f = open(fpath, "r")
#     content = f.read()
#     f.close()
    
#     opinion_finder_pos_count.append(content.count("positive"))
#     opinion_finder_neg_count.append(content.count("negative"))
    

# df['OPI_FIN_POS'] = opinion_finder_pos_count
# df['OPI_FIN_NEG'] = opinion_finder_neg_count
# df.to_csv("amazon_reviews_with_polarity.txt", sep = "\t", index = False)

### Polarity: Bing Liu's Lexicon

In [None]:
# DO NOT RUN THIS

# Bing Liu et al: Opinion Lexicon for positive and negative polarity tagging of words
# Used to develop 2 features: BL_POS and BL_NEG
# defined as the number of words that corresponding to each polarity respectively, per review

# dir_name = "../bing-liu-opinion-lexicon-English/"
# pos_file = dir_name + "positive-words.txt"
# neg_file = dir_name + "negative-words.txt"

# f1 = open(pos_file, "r")
# f2 = open(neg_file, "r")

# pos_lexicon = f1.read()
# neg_lexicon = f2.read()

# f1.close()
# f2.close()

# bl_pos = []
# bl_neg = []

# for review in review_tokens:
#     count_pos = 0
#     count_neg = 0
    
#     for token in review:
#         if token in pos_lexicon:
#             count_pos += 1
#         if token in neg_lexicon:
#             count_neg += 1
            
#     bl_pos.append(count_pos)
#     bl_neg.append(count_neg)
    
# print(bl_pos[:15])
# print(bl_neg[:15])

# df['BL_POS'] = bl_pos
# df['BL_NEG'] = bl_neg

# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: AFINN

In [None]:
# #!pip install afinn

# # AFINN: Sentiment lexicon for measuring the positive and negative score of a review
# # Used to develop 2 features: BL_POS and BL_NEG

# from afinn import Afinn
# afn = Afinn()

# afinn_pos = []
# afinn_neg = []

# for review in review_tokens:
#     review = " ".join(review)
#     s = afn.score(review)
    
#     if s > 0:
#         afinn_pos.append(s)
#         afinn_neg.append(0.0)
        
#     else:
#         afinn_pos.append(0.0)
#         afinn_neg.append(-1 * s)
        
# print(afinn_pos[:15])
# print(afinn_neg[:15])

In [None]:
# df['AFINN_POS'] = afinn_pos
# df['AFINN_NEG'] = afinn_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: Sentiment140

In [21]:
# Sentiment140: Lexicon for measuring the positive and negative score of a review
# Used to develop 2 features: S140_POS and S140_NEG

# import csv

# dir_name = "../Sentiment140-Lexicon/"
# f1 = "unigrams-pmilexicon.txt"
# f2 = "bigrams-pmilexicon.txt"


# uni_lex = pd.read_csv(dir_name + f1, sep = "\t")
# bi_lex = pd.read_csv(dir_name + f2, sep = "\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# uni_dict = dict(zip(uni_lex["term"], uni_lex["score"]))
# bi_dict = dict(zip(bi_lex["term"], bi_lex["score"]))

# s140_pos = []
# s140_neg = []

# for review in review_tokens:
#     score = 0
#     uni_score = 0
#     uni_c = 0
        
#     for unigram in review:
#         if unigram in uni_dict:
#             uni_score += uni_dict[unigram]
#             uni_c += 1
    
#     if uni_c > 0:
#         uni_score /= uni_c
    
    
#     bi_score = 0
#     bi_c = 0
    
#     bigrams = list(ngrams(review, 2))
#     for bigram in bigrams:
#         text = " ".join(bigram)
#         if text in bi_dict:
#             bi_score += bi_dict[text]
#             bi_c += 1
    
#     if bi_c > 0:
#         bi_score /= bi_c
    
    
#     score = (bi_score + uni_score) / (int(uni_c > 0) + int(bi_c > 0))
    
#     if score > 0:
#         s140_pos.append(round(score, 5))
#         s140_neg.append(0.0)
        
#     else:
#         s140_pos.append(0.0)
#         s140_neg.append(round(-1 * score, 5))
    
    
# print(s140_pos[:15])
# print(s140_neg[:15])

# print(s140_pos.count(0.0), s140_neg.count(0.0), len(s140_pos))

In [None]:
# df['S140_POS'] = s140_pos
# df['S140_NEG'] = s140_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: SentiWordNet3.0

In [None]:
# SentiWordNet3.0: Lexicon for measuring the positive and negative score of synsets in WordNet 3.0
# Used to develop 2 features: SWN_POS and SWN_NEG
#(for this one, try to make the scores exclusive like the others)


# # import nltk
# # nltk.download('sentiwordnet')

# from nltk.corpus import sentiwordnet as swn

# swn_pos = []
# swn_neg = []

# for review in review_tokens:
#     pos_score = 0
#     neg_score = 0
#     count = 0
    
#     for term in review:
#         res = swn.senti_synsets(term)
        
#         try:
#             res0 = list(res)[0]
#             pos_score += res0.pos_score()
#             neg_score += res0.neg_score()
#             count += 1
            
#         except:
#             pass
    
#     if count > 0:
#         pos_score = pos_score / count
#         neg_score = neg_score / count
        
#     swn_pos.append(pos_score)
#     swn_neg.append(neg_score)

# print(swn_pos[:15])
# print(swn_neg[:15])

In [None]:
# df['SWN_POS'] = swn_pos
# df['SWN_NEG'] = swn_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: NRC Hashtag

In [None]:
# NRC Hashtag: Lexicon for measuring the positive and negative score of a review based on twitter hashtags
# Used to develop 2 features: NRC_HASH_POS and NRC_HASH_NEG


# import csv

# dir_name = "../NRC-Hashtag-Sentiment-Lexicon-v0.1/"
# f1 = "unigrams-pmilexicon.txt"
# f2 = "bigrams-pmilexicon.txt"

# uni_lex = pd.read_csv(dir_name + f1, sep = "\t")
# bi_lex = pd.read_csv(dir_name + f2, sep = "\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# uni_dict = dict(zip(uni_lex["term"], uni_lex["score"]))
# bi_dict = dict(zip(bi_lex["term"], bi_lex["score"]))

# nrc_hash_pos = []
# nrc_hash_neg = []

# for review in review_tokens:
#     score = 0
#     uni_score = 0
#     uni_c = 0
        
#     for unigram in review:
#         if unigram in uni_dict:
#             uni_score += uni_dict[unigram]
#             uni_c += 1
    
#     if uni_c > 0:
#         uni_score /= uni_c
    
    
#     bi_score = 0
#     bi_c = 0
    
#     bigrams = list(ngrams(review, 2))
#     for bigram in bigrams:
#         text = " ".join(bigram)
#         if text in bi_dict:
#             bi_score += bi_dict[text]
#             bi_c += 1
    
#     if bi_c > 0:
#         bi_score /= bi_c
    
    
#     score = (bi_score + uni_score) / (int(uni_c > 0) + int(bi_c > 0))
    
#     if score > 0:
#         nrc_hash_pos.append(round(score, 5))
#         nrc_hash_neg.append(0.0)
        
#     else:
#         nrc_hash_pos.append(0.0)
#         nrc_hash_neg.append(round(-1 * score, 5))
    
    
# print(nrc_hash_pos[:15])
# print(nrc_hash_neg[:15])

In [10]:
# df['NRC_HASH_POS'] = nrc_hash_pos
# df['NRC_HASH_NEG'] = nrc_hash_neg
# df.to_csv("amazon_reviews_features.txt", sep = "\t", index = False)

### Strength Score: NRC Emoticon (USELESS TBH)

In [13]:
# # NRC Emoticon: Lexicon for measuring the positive and negative score of a review based on emojis
# # Used to develop 2 features: NRC_EMOTICON_POS and NRC_EMOTICON_NEG


# import csv

# dir_name = "../NRC-Emoticon-Lexicon-v1.0/"
# f1 = "Emoticon-unigrams.txt"
# f2 = "Emoticon-bigrams.txt"

# uni_lex = pd.read_csv(dir_name + f1, sep = "\t")
# bi_lex = pd.read_csv(dir_name + f2, sep = "\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

# uni_dict = dict(zip(uni_lex["term"], uni_lex["score"]))
# bi_dict = dict(zip(bi_lex["term"], bi_lex["score"]))

# nrc_emoticon_pos = []
# nrc_emoticon_neg = []

# for review in review_tokens:
#     score = 0
#     uni_score = 0
#     uni_c = 0
        
#     for unigram in review:
#         if unigram in uni_dict:
#             uni_score += uni_dict[unigram]
#             uni_c += 1
    
#     if uni_c > 0:
#         uni_score /= uni_c
    
    
#     bi_score = 0
#     bi_c = 0
    
#     bigrams = list(ngrams(review, 2))
#     for bigram in bigrams:
#         text = " ".join(bigram)
#         if text in bi_dict:
#             bi_score += bi_dict[text]
#             bi_c += 1
    
#     if bi_c > 0:
#         bi_score /= bi_c
    
    
#     score = (bi_score + uni_score) / (int(uni_c > 0) + int(bi_c > 0))
    
#     if score > 0:
#         nrc_emoticon_pos.append(round(score, 5))
#         nrc_emoticon_neg.append(0.0)
        
#     else:
#         nrc_emoticon_pos.append(0.0)
#         nrc_emoticon_neg.append(round(-1 * score, 5))
    
    
# print(nrc_emoticon_pos[:15])
# print(nrc_emoticon_neg[:15])

[0.21302, 0.0, 0.0, 0.0, 0.0, 0.23168, 0.0, 0.0, 0.32881, 0.10118, 0.0, 0.30214, 0.0, 0.0, 0.0]
[0.0, 0.14293, 0.06157, 0.20638, 0.20035, 0.0, 0.11041, 0.38425, 0.0, 0.0, 0.72399, 0.0, 0.02778, 0.25092, 0.03157]


In [None]:
# DO NOT RUN THIS

# reference for sentiment analysis with SentiWordNet

# import nltk
# nltk.download('sentiwordnet')
# nltk.download('wordnet')
# from nltk.corpus import wordnet as wn
# from nltk.corpus import sentiwordnet as swn
# list(swn.senti_synsets('slow'))

# sentence='It was a really good day'
# from nltk.tag import pos_tag
# token = nltk.word_tokenize(sentence)
# after_tagging = nltk.pos_tag(token)
# print (token)
# print (after_tagging)
# def penn_to_wn(tag):
#     """
#     Convert between the PennTreebank tags to simple Wordnet tags
#     """
#     if tag.startswith('J'):
#         return wn.ADJ
#     elif tag.startswith('N'):
#         return wn.NOUN
#     elif tag.startswith('R'):
#         return wn.ADV
#     elif tag.startswith('V'):
#         return wn.VERB
#     return None
# sentiment = 0.0
# tokens_count = 0
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# for word, tag in after_tagging:
#             wn_tag = penn_to_wn(tag)
#             if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
#                 continue

#             lemma = lemmatizer.lemmatize(word, pos=wn_tag)
#             if not lemma:
#                 continue

#             synsets = wn.synsets(lemma, pos=wn_tag)
#             if not synsets:
#                 continue

#             # Take the first sense, the most common
#             synset = synsets[0]
#             swn_synset = swn.senti_synset(synset.name())
#             print(swn_synset)

#             sentiment += swn_synset.pos_score() - swn_synset.neg_score()
#             tokens_count += 1

# print (sentiment)

# Stop Word Removal

In [5]:
# removing stop words
# import nltk
# nltk.download('stopwords')

stop_words = set(stopwords.words("english"))
content_review_tokens = [[token for token in review if token not in stop_words and token.isalnum()] for review in review_tokens]

print("Before stop word removal: ", review_tokens[6914])
print()
print("After stop word removal: ", content_review_tokens[6914])

Before stop word removal:  ['love', 'the', 'bottle', 'very', 'much', 'br', 'br', 'iVm', 'a', 'tea', 'lover', 'when', 'i', 'saw', 'this', 'bottle', 'i', 'knew', 'that', 'it', 'was', 'what', 'i', 'wanted', 'the', 'shape', 'is', 'fantastic', 'feels', 'nice', 'in', 'your', 'hand', 'perfect', 'size', 'to', 'have', 'in', 'my', 'car', 'i', 'took', 'it', 'all', 'around', 'so', 'i', 'can', 'enjoy', 'my', 'tea', 'everywhere', 'love', 'it', 'very', 'much', 'the', 'one', 'with', 'infuser', 'also', 'looks', 'good']

After stop word removal:  ['love', 'bottle', 'much', 'br', 'br', 'iVm', 'tea', 'lover', 'saw', 'bottle', 'knew', 'wanted', 'shape', 'fantastic', 'feels', 'nice', 'hand', 'perfect', 'size', 'car', 'took', 'around', 'enjoy', 'tea', 'everywhere', 'love', 'much', 'one', 'infuser', 'also', 'looks', 'good']


In [None]:
# STEMMING AND LEMMATIZATION IS NOT REQUIRED I THINK, BECAUSE THE PAPER DOES NOT MENTION IT

# from nltk.stem import SnowballStemmer     #porter 2 algorithm
# snowball = SnowballStemmer(language = "english")
# content_review_tokens = [[snowball.stem(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# content_review_tokens = [[lemmatizer.lemmatize(token) for token in review] for review in content_review_tokens]
# print(content_review_tokens[374])

# N-Gram Modelling

In [6]:
review_text_unigrams = [list(ngrams(tokens, 1)) for tokens in content_review_tokens]
review_text_bigrams = [list(ngrams(tokens, 2)) for tokens in content_review_tokens]
review_text_trigrams = [list(ngrams(tokens, 3)) for tokens in content_review_tokens]

print(review_text_unigrams[374])
print(review_text_bigrams[374])
print(review_text_trigrams[374])

[('brushes',), ('soft',), ('soon',), ('first',), ('usage',), ('see',), ('bristles',), ('coming',), ('worth',), ('purchase',), ('falling',), ('generic',), ('product',)]
[('brushes', 'soft'), ('soft', 'soon'), ('soon', 'first'), ('first', 'usage'), ('usage', 'see'), ('see', 'bristles'), ('bristles', 'coming'), ('coming', 'worth'), ('worth', 'purchase'), ('purchase', 'falling'), ('falling', 'generic'), ('generic', 'product')]
[('brushes', 'soft', 'soon'), ('soft', 'soon', 'first'), ('soon', 'first', 'usage'), ('first', 'usage', 'see'), ('usage', 'see', 'bristles'), ('see', 'bristles', 'coming'), ('bristles', 'coming', 'worth'), ('coming', 'worth', 'purchase'), ('worth', 'purchase', 'falling'), ('purchase', 'falling', 'generic'), ('falling', 'generic', 'product')]


In [14]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Train the Skip-Gram model
vector_size = 100  # Dimensionality of word embeddings
window_size = 5    # Maximum distance between the current and predicted word within a sentence
min_count = 1      # Minimum frequency count of words to consider when training the model
workers = 4        # Number of threads to use while training

# Train the Skip-Gram model
skipgram_model = Word2Vec(sentences=content_review_tokens,
                          vector_size=vector_size,
                          window=window_size,
                          min_count=min_count,
                          workers=workers,
                          sg=1)  # sg=1 specifies Skip-Gram model

# Save the trained word embeddings
# skipgram_model.save('skipgram_word_embeddings.model')

#skipgram_model.train([["hello", "world"]], total_examples=1, epochs=1)



In [15]:
print(skipgram_model.wv['computer'])
print(skipgram_model.wv['laptop'])

[-0.02099521  0.09561954  0.0548805  -0.04881383 -0.07118601 -0.6665906
  0.64163786  0.70358986 -0.30299774  0.066332    0.2826829  -0.53231275
 -0.2839308  -0.21144377  0.12996304  0.25996113 -0.20663016 -0.3951933
 -0.07755546 -0.22268693 -0.01645495  0.45674834  0.1858662  -0.25952634
  0.06009199  0.15379114 -0.29662326 -0.0759635  -0.359151   -0.10081615
  0.20416364 -0.17115799  0.03937411 -0.43139616 -0.3961922  -0.03462957
  0.33540177 -0.26132286  0.08713812 -0.2932426   0.16218199 -0.01687521
 -0.20076196 -0.19930044  0.37899625  0.13617526 -0.244787    0.32846168
  0.42483485  0.11272307 -0.06090441 -0.56031215 -0.10054269  0.06896328
  0.14094675  0.09178472  0.3212381  -0.13634907 -0.11138833 -0.11977422
 -0.03117456 -0.02779451 -0.46357426 -0.04491124  0.03956963  0.53614825
  0.00466609  0.52460355 -0.3753548   0.35372722 -0.20420942  0.54145205
  0.73458654 -0.2379491   0.3407574   0.3601199  -0.14823644 -0.1386738
 -0.08234764 -0.07445995 -0.32587016 -0.39289358 -0.36

In [None]:
# 2. Model Architecture Design:

# DFFNN Model:
# Design a multilayer perceptron neural network with two hidden layers.
# Determine the input layer size based on the features extracted in data preprocessing (e.g., 2000 n-grams, 30 emotion features, and word embeddings).
# Define the number of neurons in each hidden layer based on a grid search procedure.
# Choose rectified linear units as the activation function for the hidden layers.
# Implement dropout regularization to prevent overfitting.
# Utilize softmax activation in the output layer for binary classification (fake/truthful).


# DFFNN Model

In [None]:
#DFFNN Model:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming you have the features extracted in data preprocessing stored in X
# and the binary labels in y (0 for fake, 1 for truthful)

# Input layer size based on features
input_layer_size = len(df.columns)  # Adjust based on the actual number of features

# Hyperparameters
hidden_layer1_neurons = 128
hidden_layer2_neurons = 64
dropout_rate = 0.5  # Adjust as needed

# Define the DFFNN model
model = Sequential()

# Input layer
model.add(Dense(hidden_layer1_neurons, input_dim=input_layer_size, activation='relu'))
model.add(Dropout(dropout_rate))

# First hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Second hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Output layer (binary classification with softmax activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


# CNN Model

In [None]:

# CNN Model:
# Design a convolutional neural network architecture.
# Convert each sentence into a k-dimensional word representation using pre-trained word embeddings.
# Concatenate word representations to obtain fixed-size input.
# Define the number of filters in the convolutional layer and the size of the filter.
# Utilize rectified linear units as the activation function for the convolutional layer.
# Implement max pooling to downsample the feature maps.
# Use softmax activation in the output layer for binary classification.

In [None]:
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from collections import Counter
import pandas as pd

# Hyperparameters
embedding_dim = 100  # Dimensionality of word embeddings
max_len = 100  # Maximum sequence length (number of words in a review)
num_filters = 128  # Number of filters in the convolutional layer
filter_size = 5  # Size of the filter window


# Define the CNN model
model = Sequential()
vocab_size=1000

# Embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_shape=(max_len,)))

# Convolutional layer
model.add(Conv1D(filters=num_filters, kernel_size=filter_size, activation='relu'))

# Max pooling layer
model.add(MaxPooling1D(pool_size=2))

# Global max pooling layer
model.add(GlobalMaxPooling1D())

# Dense layer
model.add(Dense(units=64, activation='relu'))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# 3. Model Training:
# Split the dataset into training, validation, and testing sets.
# Use mini-batch gradient descent for training the DFFNN model.
# Apply stochastic gradient descent for training the CNN model.
# Tune hyperparameters such as learning rate, dropout rate, and number of iterations using validation set performance.
# Monitor training progress and adjust hyperparameters as needed to prevent overfitting.

In [None]:
#I'm trying idk

In [None]:
# 4. Evaluation:
# Evaluate the trained models on the test set to measure their performance.
# Compute metrics such as accuracy, precision, recall, and F1-score to assess the models' effectiveness in detecting fake reviews.
# Compare the performance of the DFFNN and CNN models with baseline methods and state-of-the-art approaches mentioned in the paper.


In [None]:
# 5. Optimization and Fine-tuning:
# Experiment with different model architectures, hyperparameters, and training strategies to improve performance.
# Consider techniques such as ensemble learning or transfer learning to further enhance model accuracy.
# Fine-tune the models based on insights gained from initial evaluations and analyses.
# By following these steps, you can implement the proposed DFFNN and CNN models for fake review detection based on the ideas presented in the paper. Remember to document your process thoroughly and validate your results to ensure the reliability and reproducibility of your findings.