# IMPORTS

In [42]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation 
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import random as r

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NitishTalekar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NitishTalekar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# DATA

In [6]:
import csv

# Load every CSV row as a plain dict (one dict per review, keyed by the header row).
# The `with` block guarantees the file handle is closed once the rows are read;
# newline='' lets the csv module do its own newline handling, as its docs advise.
with open("ReviewData.csv", encoding='cp850', newline='') as csv_handle:
    input_file = csv.DictReader(csv_handle)
    data = [dict(row) for row in input_file]
print(data)



In [51]:
# Number of reviews that go into the training split (80% of the data set).
x = round(0.8 * len(data))

# The imports cell binds the random module as `r`; the bare name `random` is
# only imported in the TESTING cell, so it must not be used here or the
# notebook fails with NameError on a fresh kernel.
# Sampling the whole list yields a shuffled copy, which is then cut at `x`:
# the first part trains the model and the remainder is held out for testing.
shuffled_data = r.sample(data, len(data))
train_data = shuffled_data[:x]
test_data = shuffled_data[x:]

In [52]:
# Dump each split followed by its size, under a heading naming the split.
for label, subset in (("TRAIN DATA", train_data), ("TEST DATA", test_data)):
    print(label)
    print(subset, len(subset))


TRAIN DATA
TEST DATA


# MODEL

In [53]:
class PreProcessReviews:
    """Turn raw review records into (token list, sentiment) pairs.

    Each review text is lower-cased, has URLs and @mentions replaced by
    placeholder tokens, has the leading '#' stripped from hashtags, is
    tokenized, and finally has stopwords, punctuation and the placeholder
    tokens filtered out.
    """

    # Raw strings keep '\.' and '\s' as regex escapes instead of (invalid)
    # Python string escapes; compiling once avoids re-parsing per review.
    _URL_PATTERN = re.compile(r'(www\.[^\s]+)|(https?://[^\s]+)')
    _USER_PATTERN = re.compile(r'@[^\s]+')
    _HASHTAG_PATTERN = re.compile(r'#([^\s]+)')

    def __init__(self):
        """Build the set of tokens to discard.

        The set combines NLTK's English stopwords, every character of
        string.punctuation, and the 'AT_USER' / 'URL' placeholders that
        _processReview substitutes into the text.
        """
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])

    def processReviews(self, list_of_reviews):
        """Pre-process a collection of review records.

        Args:
            list_of_reviews: iterable of mappings with a "Review" entry (the
                text) and a "Sentiment" entry (the label).

        Returns:
            A list of (tokens, sentiment) tuples in input order.
        """
        return [
            (self._processReview(review["Review"]), review["Sentiment"])
            for review in list_of_reviews
        ]

    def _processReview(self, review):
        """Normalise one review string and return its remaining tokens.

        Args:
            review: the raw review text.

        Returns:
            A list of tokens with stopwords, punctuation and the URL /
            AT_USER placeholders removed.
        """
        review = review.lower()  # convert text to lower-case
        # Substitutions run after lower-casing, so the upper-case placeholders
        # survive intact and match the entries added to self._stopwords.
        review = self._URL_PATTERN.sub('URL', review)  # replace URLs
        review = self._USER_PATTERN.sub('AT_USER', review)  # replace usernames
        review = self._HASHTAG_PATTERN.sub(r'\1', review)  # remove the # in #hashtag
        tokens = word_tokenize(review)  # split the text into word tokens
        return [word for word in tokens if word not in self._stopwords]

In [54]:
# One shared preprocessor instance; later cells reuse it for the test split
# and for the ad-hoc example reviews.
reviewProcessor = PreProcessReviews()
# Convert the training records into (tokens, sentiment) pairs.
preprocessedTrainingSet = reviewProcessor.processReviews(train_data)

# Sanity check: show the first processed training example.
print(preprocessedTrainingSet[0])

(['like', 'motel', '6', 'affordable', "n't", 'bad', 'odor', 'like', 'motels', 'pretty', 'clean', 'rooms.they', 'pet', 'friendly', "n't", 'pay', 'extra', 'dog', 'money', 'saver', 'like', 'pool', 'middle', 'property', 'see', 'rooms', 'pool'], 'T')


In [55]:
def buildVocabulary(preprocessedTrainingData):
    """Collect the distinct words that occur in the pre-processed training data.

    Args:
        preprocessedTrainingData: iterable of (words, sentiment) pairs.

    Returns:
        The keys view of an nltk.FreqDist built from every word of every
        review, i.e. each distinct word once.
    """
    frequencies = nltk.FreqDist(
        token
        for tokens, _sentiment in preprocessedTrainingData
        for token in tokens
    )
    return frequencies.keys()

In [56]:
def extract_features(review):
    """Map a tokenized review to boolean 'contains(<word>)' features.

    Reads the module-level `word_features` vocabulary; one feature is emitted
    per vocabulary word, True when that word occurs in `review`.

    Args:
        review: iterable of tokens for a single review.

    Returns:
        dict mapping 'contains(<word>)' to a bool.
    """
    present = set(review)
    return {'contains(%s)' % term: (term in present) for term in word_features}

In [57]:
# extract_features reads the module-level `word_features`, so the vocabulary
# has to be assigned before any features are extracted.
word_features = buildVocabulary(preprocessedTrainingSet)
# Apply extract_features across the (tokens, sentiment) training pairs via NLTK's helper.
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)

In [58]:
# Bare expression: the notebook displays the feature set's repr as the cell output.
trainingFeatures



# TRAINING

In [59]:
# Fit NLTK's Naive Bayes classifier on the training feature set built above.
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

In [60]:
# Confirms the classifier object exists; the output is only the default object repr.
print(NBayesClassifier)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x000002C40FE0D908>


# TESTING

In [61]:
import random

# Pre-process the held-out reviews into (tokens, sentiment) pairs.
preprocessedTestingSet = reviewProcessor.processReviews(test_data)

# Shuffle into a new list: sampling every element is a uniform shuffle, it
# leaves preprocessedTestingSet intact for later inspection, and its length
# comes from the set itself rather than from `len(data) - x`.
shuffledTestingSet = random.sample(preprocessedTestingSet, len(preprocessedTestingSet))

# Parallel lists: test[j] holds the tokens and result[j] the expected label.
test = [tokens for tokens, _sentiment in shuffledTestingSet]
result = [sentiment for _tokens, sentiment in shuffledTestingSet]

In [62]:
# Dump the shuffled test token lists, then the expected labels in the same order.
print(test)
print(result)

['F', 'F', 'T', 'T', 'F', 'T', 'T', 'T', 'F', 'T', 'T', 'F', 'T', 'F', 'T', 'F', 'T', 'T', 'T', 'T', 'F', 'F', 'F', 'T', 'T', 'T', 'F', 'T', 'T', 'F', 'T', 'F', 'F', 'T', 'T', 'F', 'F', 'F', 'T', 'F', 'F', 'T', 'T', 'F', 'F', 'T', 'T', 'T', 'T', 'T', 'F', 'F', 'F', 'F', 'T', 'F', 'F', 'F', 'T', 'F', 'T', 'T', 'T', 'T', 'F', 'F', 'T', 'T', 'T', 'T', 'F', 'F', 'F', 'T', 'T', 'F', 'T', 'T', 'F', 'T', 'T', 'T', 'T', 'F', 'F', 'F']


In [63]:
print("Predictions AND Result")

# p counts correct predictions and k counts evaluated reviews; both are read
# by the accuracy cell that follows.
p = 0
k = 0

for i, expected in zip(test, result):
    NB = NBayesClassifier.classify(extract_features(i))
    print(NB, " ", expected)
    if NB == expected:
        p += 1
    k += 1

Predictions AND Result
F   F
F   F
T   T
T   T
F   F
T   T
T   T
T   T
F   F
T   T
T   T
F   F
T   T
F   F
T   T
F   F
T   T
T   T
T   T
T   T
F   F
F   F
F   F
T   T
T   T
T   T
F   F
T   T
T   T
F   F
T   T
F   F
F   F
T   T
T   T
F   F
F   F
F   F
T   T
F   F
F   F
T   T
T   T
F   F
T   F
T   T
T   T
T   T
T   T
T   T
F   F
F   F
F   F
F   F
T   T
F   F
F   F
F   F
T   T
F   F
T   T
T   T
T   T
T   T
F   F
F   F
T   T
T   T
T   T
T   T
F   F
F   F
F   F
T   T
T   T
F   F
T   T
T   T
F   F
T   T
T   T
F   T
F   T
F   F
F   F
F   F


In [64]:
# Share of correct predictions (p) among evaluated reviews (k), as a percentage.
accuracy = (100 * p) / k
print("ACCURACY =", accuracy, sep="\n")

ACCURACY =
96.51162790697674


In [65]:
# Three hand-written reviews used to spot-check the trained classifier.
r1 = "We got stuck in Orlando Florida and the airline put us up in a Best Western. By the time we got to the hotel it was 2am, we were tired and cranky, and the male receptionist had sharpened his teeth and finger nails into a point -- very creepy. We we got to the room it appeared to not have been cleaned at all. The carpet was wet from the air conditioner to the front of the nearest bed, the lamp shades were bent/faded and there were used test strips on the desk -- looked like it was missed simply because the test strip was the same color of the desk. I took pictures and intended on e-mailing them and a complaint to the corporate office, but never got around to it. To make matters worse, we put the do not disturb sign on the door and the hotel staff knocked on the door at 9am, 10am and 11am wanting to clean the room --- our check out time wasn't until noon. I opened the door and gave a scolding look and said they were too late for cleaning."
r2 = "Went to this hotel when visiting nearby national parks with family. Front desk staff was courteous and quickly resolved any questions we had. Our 2-queen bed room was clean and spacious. Room has nice LCD TV, coffee maker, fridge, microwave, massaging shower heads, and good mattresses. We stayed for 2 nights and room was well cleaned and put together after first night with coffee packs refilled. Overall very satisfied."
r3 = "The room was unclean and pathetic"

# Run each review through the same normalisation used for the training data.
preprocessedReview = reviewProcessor._processReview(r1)
preprocessedReview2 = reviewProcessor._processReview(r2)
preprocessedReview3 = reviewProcessor._processReview(r3)

NB1 = NBayesClassifier.classify(extract_features(preprocessedReview))
NB2 = NBayesClassifier.classify(extract_features(preprocessedReview2))
NB3 = NBayesClassifier.classify(extract_features(preprocessedReview3))

print(r1)
# Print NB1, the prediction for r1 -- not `NB`, which is the leftover
# prediction for the last review of the evaluation loop.
print(NB1)
print()
print(r2)
print(NB2)
print()
print(r3)
print(NB3)


We got stuck in Orlando Florida and the airline put us up in a Best Western. By the time we got to the hotel it was 2am, we were tired and cranky, and the male receptionist had sharpened his teeth and finger nails into a point -- very creepy. We we got to the room it appeared to not have been cleaned at all. The carpet was wet from the air conditioner to the front of the nearest bed, the lamp shades were bent/faded and there were used test strips on the desk -- looked like it was missed simply because the test strip was the same color of the desk. I took pictures and intended on e-mailing them and a complaint to the corporate office, but never got around to it. To make matters worse, we put the do not disturb sign on the door and the hotel staff knocked on the door at 9am, 10am and 11am wanting to clean the room --- our check out time wasn't until noon. I opened the door and gave a scolding look and said they were too late for cleaning.
F

Went to this hotel when visiting nearby nation