# classify reviews

This notebook describes the binary classification of Yelp hotel reviews according to whether or not they are dog-related.

In [48]:
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

import pandas as pd
import connect_aws_db as cadb


In [49]:
%matplotlib inline

### Connect to DB

In [50]:
# Obtain the database handle from the project-local connect_aws_db helper;
# it is handed to pd.read_sql_query in the cells below.
# NOTE(review): presumably a SQLAlchemy engine, and write_unicode presumably
# controls the connection's text encoding -- confirm in connect_aws_db.
engine = cadb.connect_aws_db(write_unicode=True)

### Restore BF Reviews

In [51]:
# Query selecting the rating and text columns of every row in bf_reviews.
cmd = "SELECT review_rating, review_text FROM bf_reviews"

In [52]:
# Run the query held in `cmd` through `engine` and load the rows into a DataFrame.
bfdf = pd.read_sql_query(cmd, engine)

In [53]:
# Report the number of rows, then display the leading five as the cell output.
print(bfdf.shape[0])
bfdf.head()

3039


Unnamed: 0,review_rating,review_text
0,2,When you first go to your room you notice the ...
1,2,We were going to the Pre-Westminster event tha...
2,2,While the room was not the fanciest that one c...
3,1,I stayed at the Hotel Penn in Manhattan becaus...
4,5,We loved this hotel! The are very friendly and...


### Restore Yelp Reviews

In [54]:
# Query selecting the rating and text columns of every row in yelp_reviews.
# Note: this rebinds `cmd`, replacing the bf_reviews query defined earlier.
cmd = "SELECT review_rating, review_text FROM yelp_reviews"

In [55]:
# Run the query held in `cmd` through `engine` and load the rows into a DataFrame.
yelpdf = pd.read_sql_query(cmd, engine)

In [56]:
# Report the number of rows, then display the leading five as the cell output.
print(yelpdf.shape[0])
yelpdf.head()

6263


Unnamed: 0,review_rating,review_text
0,4,I stayed at the hotel for several months durin...
1,5,I have been coming to Pittsburgh for quite a w...
2,4,This is a very good hotel and with a corporate...
3,4,"i got upgraded to a junior suite, and then aga..."
4,5,This is a fantastic hotel. I went to a conven...


In [57]:
# Text of every Yelp review as a NumPy array; this full set is what gets
# vectorized and classified further down.
yelp_review_data = yelpdf['review_text'].values

In [58]:
# Training corpus: the first 1500 review texts from bf_reviews followed by the
# first 1500 from yelp_reviews, joined into one flat array (order matters --
# the label cell assumes exactly this layout).
# NOTE(review): these 1500 Yelp reviews are also part of yelp_review_data,
# which is classified later, so the later 'dog' count includes samples the
# model was trained on as 'general'.
train_data = np.hstack([
    bfdf['review_text'].iloc[:1500].values,
    yelpdf['review_text'].iloc[:1500].values,
])

In [59]:
# Targets aligned with train_data: 1500 'dog' entries followed by 1500
# 'general' entries. y_train is the same list object as labels.
labels = ['dog'] * 1500 + ['general'] * 1500
y_train = labels

In [60]:
# Learn a TF-IDF vocabulary from the training reviews and build the training
# matrix, timing the whole step. English stop words are dropped, terms present
# in more than half of the documents are ignored (max_df=0.5), and term
# frequency is scaled sublinearly.
t0 = time()
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(train_data)
duration = time() - t0
print('vectorized in {:.2f} seconds.'.format(duration))

vectorized in 0.37 seconds.


In [61]:
# Vocabulary terms as an array, in the vectorizer's feature order.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); prefer the new method when it exists and fall back
# to the old one so the cell runs on both old and new installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.asarray(vectorizer.get_feature_names())

In [62]:
# Linear SVM with an L2 penalty, solved in the primal (dual=False).
# 'squared_hinge' is the canonical name of the loss formerly spelled 'l2';
# the old alias is deprecated (the fitted estimator's repr already reports
# loss='squared_hinge') and is rejected by later scikit-learn releases.
penalty = 'l2'
clf = LinearSVC(loss='squared_hinge', penalty=penalty, dual=False, tol=1e-3)

In [63]:
# Show the estimator's full parameter set (including defaults) before fitting.
print(clf)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)


In [64]:
# Train on the TF-IDF matrix with the 'dog'/'general' labels. fit() returns
# the estimator, so its repr is echoed as the cell output.
clf.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

In [65]:
# Debugging aid (disabled): uncomment the next line to preview the first ten
# Yelp review texts.
#yelp_review_data[:10]

In [66]:
# Project all Yelp reviews onto the vocabulary learned from the training
# data: transform() only, so the fitted vectorizer is reused, not refitted.
X_yrevs = vectorizer.transform(yelp_review_data)

In [67]:
# Predicted label ('dog' or 'general') for each Yelp review.
pred = clf.predict(X_yrevs)

In [68]:
# Sanity check: there should be one prediction per Yelp review.
pred.shape

(6263,)

In [69]:
# Number of Yelp hotel reviews the classifier labelled 'dog'
# (displayed as the cell's output value).
np.count_nonzero(pred == 'dog')

483

In [70]:
# Positions within pred (and therefore within yelp_review_data) of the
# reviews predicted as 'dog'.
ydogrevs = np.where(pred == 'dog')[0]

In [71]:
# Spot-check: show the text of the fifth review flagged as 'dog'
# (index 4 into ydogrevs).
yelp_review_data[ydogrevs[4]]

u"When I first contacted the Windmill Inn, the receptionist was friendly and helpful at getting me a great room at a reasonable rate. As someone who always travels with my pets, I LOVE when hotels are pet friendly and offer complimentary pet accommodations. At the Windmill in, they also have a designated pet friendly section of the hotel, so all of your hotel neighbors are pet people too! No anxiety about your dog barking or bothering other pet-free guests, and quick access to the outdoors as the pet rooms are on the first floor. They even gave us a cute bag of dog treats along with our chocolate-chip cookies at check-in. :)\nThere is also a beautiful, grassy courtyard, with a pond that is full of koi, turtles, and cute ducks! The pool is in the center of this courtyard and is warm, well maintained, and handicap accessible. \nI thoroughly enjoyed my stay, and will definitely return the next time I'm in the area."