In [7]:
import numpy as np
import pandas as pd

file_path = "../../data/raw/yp_competitors_rws.csv"
file_path = "../../data/processed/yp_kimos-maui-lahaina_rws.csv"
df = pd.read_csv(file_path) 

In [8]:
## imports
from itertools import chain
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk import FreqDist
from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [9]:
df.head()

Unnamed: 0,status,reviews
0,1,Kimo's never disappoints. We come here once o...
1,0,I was there the first week of October and Firs...
2,1,"This place was on my ""Must Do Maui"" list and i..."
3,0,Monday night dinner here and they quickly sat ...
4,1,Nice view and amazing cocktails. They are loca...


## Tokenization

In [10]:
df['sent_tokens'] = df.reviews.apply(lambda x: sent_tokenize(x))

In [11]:
df['word_tokens'] = df.sent_tokens.apply(
    lambda x: [w
               for s in x
               for w in word_tokenize(s) if w.isalpha()]
)

In [12]:
df['pos_tag'] = df.word_tokens.apply(lambda x: pos_tag(x))

## Stopwords

In [13]:
tokens = chain(*df.word_tokens)
tokens = [w for w in tokens if w.lower() not in stopwords.words('english')]

In [14]:
len(tokens)

113970

## Vocabulary
- lowercased
- punctuation removed
- stopwords removed

In [15]:
vocab = sorted(set(w.lower() for w in tokens))
tokens_freq = FreqDist(w.lower() for w in tokens)

In [16]:
len(vocab)

7239

In [17]:
len(tokens_freq)

7239

In [18]:
# lexicality score
len(vocab) / len(tokens) 

0.06351671492498026

In [19]:
print(tokens_freq.most_common(20))

[('great', 1716), ('food', 1680), ('good', 1649), ('fish', 1531), ('service', 1105), ('view', 1056), ('kimo', 968), ('pie', 936), ('hula', 867), ('place', 860), ('maui', 793), ('dinner', 746), ('us', 686), ('would', 646), ('ordered', 603), ('get', 599), ('really', 593), ('one', 581), ('time', 575), ('back', 573)]


## Normalization

In [20]:
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
vocab2 = [porter.stem(lemmatizer.lemmatize(w)) for i, w in enumerate(vocab)]
# vocab2 = [lemmatizer.lemmatize(porter.stem(w)) for i, w in enumerate(vocab)]

In [21]:
vocab2_freq = FreqDist(vocab2)

In [22]:
len(vocab2)

7239

In [23]:
len(vocab2_freq)

5226

In [24]:
print(vocab2_freq.most_common(20))

[('manag', 8), ('consist', 7), ('season', 7), ('care', 6), ('celebr', 6), ('enjoy', 6), ('indulg', 6), ('person', 6), ('prepar', 6), ('recommend', 6), ('review', 6), ('select', 6), ('suggest', 6), ('travel', 6), ('arriv', 5), ('compar', 5), ('compens', 5), ('disappoint', 5), ('except', 5), ('expect', 5)]


## Apply to each document in corpora

In [21]:
df['norm_tokens'] = df.word_tokens.apply(
    lambda x: [porter.stem(lemmatizer.lemmatize(w.lower()))
                                  for w in x if w.lower() not in stopwords.words('english')]
)

In [22]:
df.head()

Unnamed: 0,status,reviews,sent_tokens,word_tokens,pos_tag,norm_tokens
0,1,Kimo's never disappoints. We come here once o...,"[Kimo's never disappoints., We come here once ...","[Kimo, never, disappoints, We, come, here, onc...","[(Kimo, NNP), (never, RB), (disappoints, VBZ),...","[kimo, never, disappoint, come, everi, trip, m..."
1,0,I was there the first week of October and Firs...,[I was there the first week of October and Fir...,"[I, was, there, the, first, week, of, October,...","[(I, PRP), (was, VBD), (there, RB), (the, DT),...","[first, week, octob, first, think, staff, wond..."
2,1,"This place was on my ""Must Do Maui"" list and i...","[This place was on my ""Must Do Maui"" list and ...","[This, place, was, on, my, Must, Do, Maui, lis...","[(This, DT), (place, NN), (was, VBD), (on, IN)...","[place, must, maui, list, disappoint, must, co..."
3,0,Monday night dinner here and they quickly sat ...,[Monday night dinner here and they quickly sat...,"[Monday, night, dinner, here, and, they, quick...","[(Monday, NNP), (night, NN), (dinner, NN), (he...","[monday, night, dinner, quickli, sat, group, u..."
4,1,Nice view and amazing cocktails. They are loca...,"[Nice view and amazing cocktails., They are lo...","[Nice, view, and, amazing, cocktails, They, ar...","[(Nice, NNP), (view, NN), (and, CC), (amazing,...","[nice, view, amaz, cocktail, locat, right, lah..."


In [24]:
df.to_csv('../../data/processed/yp_kimos-maui-lahaina_rws_preprocessed.csv', index=False)

In [25]:
print([1 if w in vocab else 0 for w in df.norm_tokens[0]])

[1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]


In [23]:
df.apply(lambda x: x.reviews, axis=1).head()

0    Kimo's never disappoints.  We come here once o...
1    I was there the first week of October and Firs...
2    This place was on my "Must Do Maui" list and i...
3    Monday night dinner here and they quickly sat ...
4    Nice view and amazing cocktails. They are loca...
dtype: object

In [28]:
type(df.word_tokens[0])

list

## classifier test

In [37]:
import nltk
from nltk.corpus import movie_reviews
import random
df.head()

documents = df.loc[:, ['norm_tokens', 'status']].values

random.shuffle(documents)

In [41]:
word_features = list(vocab2_freq)#[:2000]

In [42]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [43]:
# Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [44]:
# Test the classifier
print(nltk.classify.accuracy(classifier, test_set))

0.89


In [46]:
# Show the most important features as interpreted by Naive Bayes
classifier.show_most_informative_features(10)

Most Informative Features
          contains(rude) = True                0 : 1      =     58.2 : 1.0
          contains(poor) = True                0 : 1      =     43.9 : 1.0
        contains(switch) = True                0 : 1      =     41.6 : 1.0
       contains(mediocr) = True                0 : 1      =     32.1 : 1.0
          contains(wast) = True                0 : 1      =     29.2 : 1.0
          contains(flag) = True                0 : 1      =     22.6 : 1.0
         contains(stale) = True                0 : 1      =     22.6 : 1.0
        contains(explan) = True                0 : 1      =     20.2 : 1.0
         contains(empti) = True                0 : 1      =     18.8 : 1.0
         contains(knife) = True                0 : 1      =     17.8 : 1.0
