In [2]:
import pandas as pd
import numpy as np
# import nltk
# from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier


In [3]:
df = pd.read_csv('../local/az_reduced_reviews.csv')
az_biz = pd.read_csv('../local/az_biz.csv')

In [4]:
x_df = df[df.stars != 3]
#ignore warning
x_df['binary_stars'] =  np.where(x_df['stars'] >= 4, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(x_df.text, x_df.binary_stars, test_size=0.20, random_state=42)

In [9]:
#quick pipeline to show best performing nb

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),  
                     ('clf', MultinomialNB()),
                    ])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)
#90.61% accurate - w/o stopword filter
#90.80% accurate stop_words='english'

0.9080282515498257

In [10]:
#quick pipeline to show best performing svm

text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()), 
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                        ])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)
# 89.266% accurate
# 87.47% accurate stop_words='english'



0.8747679116676433

In [None]:
#Vectorize the vocab
count_vect = CountVectorizer()
X_train_vec = CountVectorizer().fit(X_train)
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

In [None]:
#tfidf the vectorized corpus
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
#train a NB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

In [None]:
#look at the top indicating positive and negative terms
linear_weights = nb.feature_log_prob_[1,] - nb.feature_log_prob_[0,]  # populate this with actual values

top_negative_features= np.argsort(linear_weights)[0:10]
top_positive_features= np.argsort(linear_weights)[-10:]

print("Most negative features:")
for idx in top_negative_features:
    for k, v in X_train_vec.vocabulary_.items():
        if v == idx:
            print("  {:s} ({:.02f})".format(k, linear_weights[idx]))
            
print("")
print("Most positive features:")
for idx in top_positive_features:
    for k, v in X_train_vec.vocabulary_.items():
        if v == idx:
            print("  {:s} ({:.02f})".format(k, linear_weights[idx]))


In [None]:
#view az data by business with most reviews
az_biz.sort_values('review_count', ascending=False)
# pSQFynH1VxkfSmehRXlZWw 

In [14]:
#run the same pipeline on a single business
single_business = x_df[x_df['business_id'] == 'pSQFynH1VxkfSmehRXlZWw']
sb_x = single_business['text']
y_sb = single_business['binary_stars']

count_vect = CountVectorizer()
sb_xtrain_vec = CountVectorizer().fit(sb_x)
sb_xtrain_counts = count_vect.fit_transform(sb_x)
sb_xtrain_counts.shape

tfidf_transformer = TfidfTransformer()
sb_xtrain_tfidf = tfidf_transformer.fit_transform(sb_xtrain_counts)
sb_xtrain_tfidf.shape

nb = MultinomialNB()
nb.fit(sb_xtrain_tfidf, y_sb)

linear_weights = nb.feature_log_prob_[1,] - nb.feature_log_prob_[0,]  # populate this with actual values

top_negative_features= np.argsort(linear_weights)[0:10]
top_positive_features= np.argsort(linear_weights)[-10:]

print("Most negative features:")
for idx in top_negative_features:
    for k, v in sb_xtrain_vec.vocabulary_.items():
        if v == idx:
            print("  {:s} ({:.02f})".format(k, linear_weights[idx]))
            
print("")
print("Most positive features:")
for idx in top_positive_features:
    for k, v in sb_xtrain_vec.vocabulary_.items():
        if v == idx:
            print("  {:s} ({:.02f})".format(k, linear_weights[idx]))

Most negative features:
  rude (-1.81)
  overrated (-1.69)
  mediocre (-1.58)
  worst (-1.51)
  burned (-1.50)
  overpriced (-1.44)
  disappointment (-1.39)
  attitude (-1.39)
  tasteless (-1.37)
  waste (-1.31)

Most positive features:
  margherita (1.71)
  excellent (1.78)
  rosa (1.80)
  fresh (1.83)
  loved (1.87)
  favorite (1.90)
  fantastic (1.97)
  amazing (2.00)
  perfect (2.07)
  delicious (2.24)


### Practice area below

In [None]:
df.head()

In [None]:
az_biz.head()

In [None]:
vec = CountVectorizer().fit(X_train)

In [None]:
top_positive_features

In [None]:
for idx in top_positive_features:
    for k, v in vec.vocabulary_.items():
        if v == idx:
            print(k)
    