# IMPORTS

In [16]:
import os
import random as r
import re
from string import punctuation 

import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
import pymysql
from nltk import punkt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# DATA

In [18]:
import csv

# Load the labelled reviews. The context manager guarantees the file handle is
# closed; newline='' is what the csv module asks for so quoted fields that
# contain line breaks are parsed correctly.
with open("ReviewData.csv", encoding='cp850', newline='') as review_file:
    data = []
    for row in csv.DictReader(review_file):
        # Drop the column whose header is empty (presumably an exported row
        # index -- confirm against the CSV). pop() tolerates files without it.
        row.pop('', None)
        data.append(dict(row))

# 80/20 train/test split. Positions are sampled instead of rows so the held-out
# set can be built in one pass, without a quadratic list.remove() loop.
x = round(0.8*(len(data)))
train_positions = r.sample(range(len(data)), x)
train_data = [data[pos] for pos in train_positions]

train_position_set = set(train_positions)
test_data = [row for pos, row in enumerate(data) if pos not in train_position_set]

# print("TRAIN DATA")
# print(train_data,len(train_data))
# print("TEST DATA")
# print(test_data,len(test_data))

# MODEL

In [21]:
class PreProcessReviews:
    """Normalise raw review text into lists of tokens for the classifier.

    The stop list combines NLTK's English stopwords, every ASCII punctuation
    character and the two placeholder tokens ('AT_USER', 'URL') that
    `_processReview` substitutes for user mentions and links.
    """

    # Raw-string literals: the patterns are unchanged, but '\.' and '\s' are no
    # longer invalid escape sequences in an ordinary string. Compiled once here
    # rather than on every review.
    _URL_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _USER_PATTERN = re.compile(r'@[^\s]+')
    _HASHTAG_PATTERN = re.compile(r'#([^\s]+)')

    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])

    def processReviews(self, list_of_reviews):
        """Tokenise a collection of review records.

        Args:
            list_of_reviews: iterable of mappings that each provide a
                "Review" (text) and a "Sentiment" (label) entry.

        Returns:
            A list of (tokens, sentiment) tuples in input order.
        """
        return [
            (self._processReview(review["Review"]), review["Sentiment"])
            for review in list_of_reviews
        ]

    def _processReview(self, review):
        """Lower-case, strip web artefacts, tokenise and drop stop tokens.

        Args:
            review: the raw review text.

        Returns:
            The list of remaining tokens, in their original order.
        """
        review = review.lower()  # convert text to lower-case
        review = self._URL_PATTERN.sub('URL', review)  # replace links with a placeholder
        review = self._USER_PATTERN.sub('AT_USER', review)  # replace @mentions with a placeholder
        review = self._HASHTAG_PATTERN.sub(r'\1', review)  # remove the # in #hashtag
        # Split into word tokens. Repeated characters are NOT collapsed here.
        tokens = word_tokenize(review)
        # The placeholders are part of the stop list, so they are filtered out too.
        return [word for word in tokens if word not in self._stopwords]
    
# Single shared preprocessor; the same instance is reused further down for the
# test split and for classifying an individual review.
reviewProcessor = PreProcessReviews()
# Turn every training record into a (tokens, sentiment) pair.
preprocessedTrainingSet = reviewProcessor.processReviews(train_data)
# print(preprocessedTrainingSet[0])
def buildVocabulary(preprocessedTrainingData):
    """Collect the distinct tokens that occur in the preprocessed training data.

    Args:
        preprocessedTrainingData: iterable of (tokens, sentiment) pairs; the
            sentiment part is ignored here.

    Returns:
        The keys view of an nltk.FreqDist built over every token.
    """
    token_stream = [
        token
        for tokens, _sentiment in preprocessedTrainingData
        for token in tokens
    ]
    return nltk.FreqDist(token_stream).keys()
def extract_features(review):
    """Map a tokenised review onto boolean vocabulary-presence features.

    Reads the module-level `word_features` vocabulary.

    Args:
        review: iterable of tokens for one review.

    Returns:
        A dict with one 'contains(<word>)' key per vocabulary word, whose value
        is True when that word occurs in the review and False otherwise.
    """
    present = set(review)
    return {'contains(%s)' % term: (term in present) for term in word_features}

# Vocabulary comes from the training split only; extract_features reads this
# module-level name, so it must be assigned before any features are extracted.
word_features = buildVocabulary(preprocessedTrainingSet)
# Hand the extractor and the (tokens, sentiment) pairs to NLTK to produce the
# labelled feature sets the classifier is trained on.
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)
# print(trainingFeatures)
# print(word_features)

# TRAINING

In [4]:
# Fit the Naive Bayes sentiment classifier on the training feature sets.
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
# Printing shows only the object's default repr; it confirms that training
# returned a classifier and says nothing about its quality.
print(NBayesClassifier)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x000002ED281E94A8>


# TESTING

In [23]:
preprocessedTestingSet = reviewProcessor.processReviews(test_data)

# Visit the held-out reviews in random order. sample() returns a shuffled copy,
# so preprocessedTestingSet is left intact and the cell can be re-run; the size
# comes from the test set itself, not from `len(data) - x`.
shuffled_cases = r.sample(preprocessedTestingSet, len(preprocessedTestingSet))
test = [tokens for tokens, _sentiment in shuffled_cases]
result = [sentiment for _tokens, sentiment in shuffled_cases]

# p = correct predictions, k = reviews evaluated.
p = sum(
    1
    for tokens, expected in zip(test, result)
    if NBayesClassifier.classify(extract_features(tokens)) == expected
)
k = len(test)
# An empty test split has no defined accuracy; report NaN instead of dividing by zero.
accuracy = p*100/k if k else float('nan')
print("ACCURACY =")
print(accuracy)

(['wonderful', 'relaxed', 'stay', 'huntley', 'hotel', "n't", 'staying', 'priciest', 'room', 'staff', 'treated', 'like', 'reserved', 'penthouse', 'desk', 'clerk', 'even', 'helped', 'locate', 'wallet', 'left', 'hotel', 'restaurant', 'addition', 'beautiful', 'rooms', 'kind', 'staff', 'huntley', 'boasts', 'location', 'barely', 'two', 'blocks', 'santa', 'monica', 'beach', "n't", 'enjoying', 'amenities', 'hotel', 'lounging', 'sun'], 'T')
(['hotels', 'area', 'either', 'fully', 'booked', 'offering', 'sky-high', 'rates', 'extended', 'stay', 'america', 'burbank', 'great', 'option', 'ample', 'parking', 'easy', 'access', 'local', 'freeways', 'room', 'clean', 'comfortable', 'best', 'part', 'kitchen', 'carried', 'pans', 'dishes', 'utensils', 'needed', 'right', 'huge', 'shopping', 'center', 'convenient'], 'T')
(['stayed', 'two', 'nights', 'recently', 'found', 'place', 'great', 'location', 'yes', 'room', 'small', 'bother', 'us', 'hardly', 'room', 'king', 'suite', 'called', 'mini', 'sauna', 'room', 'ro

ACCURACY =
97.67441860465117


# DATABASE

In [7]:
import pymysql
# Connection settings are read from the environment when present. The fallbacks
# are the values this notebook used to hard-code, so an unconfigured local run
# behaves as before while real credentials stay out of the notebook.
conn = pymysql.connect(
    host=os.environ.get("FAKE_FEEDBACK_DB_HOST", "localhost"),
    user=os.environ.get("FAKE_FEEDBACK_DB_USER", "root"),
    password=os.environ.get("FAKE_FEEDBACK_DB_PASSWORD", ""),
    database=os.environ.get("FAKE_FEEDBACK_DB_NAME", "fake_feedback"),
)
cur = conn.cursor()
print("Database Connected")

try:
    d = "select * from reviews"
    cur.execute(d)
    review_data = cur.fetchall()
    neg = "select * from negative_words"
    cur.execute(neg)
    neg_words = cur.fetchall()
except Exception as e:
    print("ERROR=",e)
    # Re-raise: continuing would only fail a few lines later with an unrelated
    # NameError on review_data / neg_words and hide the real database error.
    raise

# Rows are addressed by position because the queries use `select *`.
# Presumably reviews is (id, ip, review_text) and negative_words is (id, word)
# -- confirm against the schema.
review = [row[2] for row in review_data]
ip_addr = [row[1] for row in review_data]
negatives = [row[1] for row in neg_words]

# First-person words counted by the self-promotion check.
me_list = ['i','me','myself']

# Domain suffixes that mark a token as a link.
ln = ['.com','.org','.net','.gov']

# print(ip_addr)
# print(negatives)

Database Connected


# VERIFICATION

In [8]:
def verify_fake(ip,r):
    """Run the five rule-based fake-review checks on one review.

    Reads the module-level lists `ip_addr`, `negatives`, `me_list`, `ln` and
    `review`. Inside this function the parameter `r` (the review text) shadows
    the module-level `random as r` alias.

    Args:
        ip: IP address the review was submitted from.
        r: raw review text.

    Returns:
        A list of five 0/1 flags, in this order: IP seen more than once,
        more than 3 negative words, more than 5 first-person words,
        at least one link-like token, exact duplicate of a stored review.
    """
    # Words are split on single spaces only, so punctuation stays attached.
    words = [piece.lower() for piece in r.split(" ")]

    ip_hits = ip_addr.count(ip)
    negative_hits = sum(words.count(term) for term in negatives)
    self_hits = sum(words.count(term) for term in me_list)
    # Counted once per (word, suffix) pair, exactly like a nested loop would.
    link_hits = sum(1 for word in words for suffix in ln if suffix in word)

    checks = (
        ip_hits > 1,
        negative_hits > 3,
        self_hits > 5,
        link_hits > 0,
        r in review,
    )
    return [1 if triggered else 0 for triggered in checks]

In [9]:
# Human-readable reason for each flag position returned by verify_fake
# (index i here describes flag i there).
verify = [
    'Multiple reviews from same IP',
    'Too many Negative Words',
    'Self Promotion',
    'Promotions via Links',
    'Duplicate Review',
]

# IP GENERATION (FOR OFFLINE PURPOSES)

In [10]:
import random as r
def ip_generator():
    """Return a random dotted-quad IPv4-style address for offline testing.

    Each of the four octets is drawn independently and uniformly from 0-255
    inclusive. The earlier range(255) pool stopped at 254, so 255 could never
    appear.

    Returns:
        A string such as "12.0.255.87".
    """
    return ".".join(str(r.randint(0, 255)) for _ in range(4))

# FAKE REVIEW DETECTION WITH ANALYSIS

In [12]:
import random as r
def review_detection(r_ip,ipr):
    """Classify one review as fake / not fake and as positive / negative.

    Args:
        r_ip: raw review text.
        ipr: IP address of the reviewer, or -1 to signal an invalid request.

    Returns:
        -1 when `ipr` is -1. Otherwise a three-item list:
        ['FAKE' or 'NOT FAKE', 'POSITIVE' / 'NEGATIVE' / 'UNDEFINED',
        list of reason strings for every rule that fired].
    """
    if ipr == -1:
        return -1

    # Rule-based checks: three or more fired rules mark the review as fake.
    flags = verify_fake(ipr,r_ip)
    authenticity = 'FAKE' if flags.count(1) >= 3 else 'NOT FAKE'
    fake_reasons = [verify[position] for position in range(5) if flags[position] == 1]

    # Sentiment from the trained classifier; 'T' and 'F' are its known labels.
    tokens = reviewProcessor._processReview(r_ip)
    label = NBayesClassifier.classify(extract_features(tokens))
    sentiment = {'T': 'POSITIVE', 'F': 'NEGATIVE'}.get(label, 'UNDEFINED')

    return [authenticity, sentiment, fake_reasons]

# EXAMPLE

In [15]:
r_ip = input('Enter Review:-')
try:
    ip_ip = int(input('0 for repeat ip; 1 for new ip:-'))
except ValueError:
    # Non-numeric input is routed to the "Invalid option" branch below rather
    # than aborting the cell with a traceback.
    ip_ip = None

if ip_ip == 0:
    # Reuse an address that is already stored in the database.
    ipr = r.choice(ip_addr)
elif ip_ip == 1:
    ipr = ip_generator()
else: 
    print("Invalid option")
    ipr = -1  # sentinel that review_detection answers with -1

answer = review_detection(r_ip,ipr)

if answer == -1:
    print("ERROR")
else:
    print()
    print()
    print("REVIEW CLASSIFICATION:")
    print(answer[1])
    print(answer[0])
    print()
    # answer[0] is always 'FAKE' or 'NOT FAKE'; only the heading differs
    # between the two cases, the listed reasons are printed the same way.
    if answer[0] == 'FAKE':
        print("REASONS:")
    elif answer[0] == 'NOT FAKE':
        print("RAISED RED FLAGS:")
    if answer[0] in ('FAKE', 'NOT FAKE'):
        for reason in answer[2]:
            print(reason)

Enter Review:-sgkahjsgjhdsgjhsdaf
0 for repeat ip; 1 for new ip:-0


REVIEW CLASSIFICATION:
NEGATIVE
NOT FAKE

RAISED RED FLAGS:
Multiple reviews from same IP


In [None]:
# pathetic service, unhygenic and disgusting. We did not have a good time. never visit. Very Bad. visit xyz.com

# ADDITION OF NEW DATA TO DB

In [None]:
#Add review and IP to database