In [None]:
import pandas as pd
import numpy as np
# import nltk
# from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
# Load the two CSV inputs used by the rest of the notebook:
# the review table (df) and the business table (az_biz).
df = pd.read_csv(filepath_or_buffer='../local/az_reduced_reviews.csv')
az_biz = pd.read_csv(filepath_or_buffer='../local/az_biz.csv')

In [None]:
# Keep only reviews whose rating is not 3 stars, then add a binary label:
# 1 for ratings of 4 or more, 0 otherwise.
# The explicit .copy() makes x_df an independent frame, so adding a column
# no longer triggers pandas' SettingWithCopyWarning.
x_df = df[df.stars != 3].copy()
x_df['binary_stars'] = np.where(x_df['stars'] >= 4, 1, 0)

In [None]:
# Hold out 20% of the labelled reviews for evaluation; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_df['text'],
    x_df['binary_stars'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Naive Bayes baseline: bigram counts -> tf-idf -> MultinomialNB.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(2, 2), min_df=5, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)  # fit() returns the pipeline itself
predicted = text_clf.predict(X_test)
# Fraction of held-out reviews classified correctly.
(predicted == y_test).mean()
# Accuracy recorded for each vectorizer configuration that was tried:
#   90.61%  without a stop-word filter
#   90.80%  stop_words='english'
#   91.96%  CountVectorizer(stop_words='english', min_df=5)
#   93.31%  CountVectorizer(stop_words='english', min_df=5, ngram_range=(2, 2))

In [None]:
# Linear SVM baseline: unigram counts -> tf-idf -> SGD with hinge loss.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    # SGDClassifier no longer accepts n_iter. max_iter=5 together with
    # tol=None runs exactly five epochs with no early stopping, which is
    # what n_iter=5 used to do.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
# Fraction of held-out reviews classified correctly.
np.mean(predicted_svm == y_test)
# Accuracy recorded during experimentation:
#   89.266%  without a stop-word filter
#   87.47%   stop_words='english'

In [None]:
# Vectorize the training text into a term-count matrix.
# The vectorizer is fitted once; X_train_vec is kept as an alias of the
# fitted vectorizer because later cells read X_train_vec.vocabulary_.
# (Previously a second, identically configured vectorizer was fitted on the
# same data, doubling the tokenisation cost for the same vocabulary.)
count_vect = CountVectorizer(stop_words='english', min_df=5)
X_train_counts = count_vect.fit_transform(X_train)
X_train_vec = count_vect
X_train_counts.shape

In [None]:
# Fits a fresh CountVectorizer on X_train and binds it to X_train_vec.
# NOTE(review): the vectorization cell above already fits a vectorizer with
# these same parameters on the same data, so this cell repeats that work.
X_train_vec = CountVectorizer(stop_words='english', min_df=5).fit(X_train)

In [None]:
# Re-weight the raw term counts with tf-idf.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

In [None]:
# Fit a multinomial Naive Bayes classifier on the tf-idf features.
nb = MultinomialNB()
nb.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Look at the terms that most strongly indicate each class.
# linear_weights[i] is the difference between the class-1 and class-0 log
# probabilities of feature i: positive values lean towards class 1
# (binary_stars == 1), negative values towards class 0.
linear_weights = nb.feature_log_prob_[1] - nb.feature_log_prob_[0]

ranked = np.argsort(linear_weights)  # ascending: most negative first
top_negative_features = ranked[0:10]
top_positive_features = ranked[-10:]

# Invert term -> column once so each lookup is a dict access instead of a
# full scan over the vocabulary.
index_to_term = {col: term for term, col in X_train_vec.vocabulary_.items()}

print("Most negative features:")
for idx in top_negative_features:
    term = index_to_term.get(idx)
    if term is not None:
        print("  {:s} ({:.02f})".format(term, linear_weights[idx]))

print("")
print("Most positive features:")
for idx in top_positive_features:
    term = index_to_term.get(idx)
    if term is not None:
        print("  {:s} ({:.02f})".format(term, linear_weights[idx]))


In [None]:
# Look up the feature column of a single term.
# vocabulary_ maps term -> column index. The position of a key in the dict's
# iteration order (what list(...keys()).index(term) returns) is not that
# column, so the mapping is queried directly instead.
# nb.feature_log_prob_.shape
idx = X_train_vec.vocabulary_.get('scraggly')  # None if the term is not in the vocabulary
# nb.feature_log_prob_[1,][idx] - nb.feature_log_prob_[0,][idx]
print(idx)
# linear_weights[112533]

In [None]:
# Collect the NB weight of the 1000 highest-weighted corpus terms, keyed by term.
top_positive_features = np.argsort(linear_weights)[-1000:]

# Invert term -> column once; the previous version rescanned the whole
# vocabulary for each of the 1000 indices.
index_to_term = {col: term for term, col in X_train_vec.vocabulary_.items()}

nb_score_dict = {}
for idx in top_positive_features:
    term = index_to_term.get(idx)
    if term is not None:
        nb_score_dict[term] = linear_weights[idx]

In [None]:
# NB weight for every corpus term that also appears in word2tfidf.
# NOTE(review): word2tfidf is assigned in a cell further down the notebook,
# so this cell fails on a fresh top-to-bottom run.
nb_score_dict = {
    term: linear_weights[col]
    for term, col in X_train_vec.vocabulary_.items()
    if term in word2tfidf
}

In [None]:
# Scratch cell: evaluates a constant expression; nothing else in the notebook uses it.
2+2

In [None]:
# List businesses from most to fewest reviews.
az_biz.sort_values(by='review_count', ascending=False)
# Business id used for the single-business analysis: pSQFynH1VxkfSmehRXlZWw

In [72]:
# Run the same count -> tf-idf -> Naive Bayes analysis on one business.
# NOTE: this cell rebinds count_vect, tfidf_transformer,
# top_negative_features and top_positive_features.
single_business = x_df[x_df['business_id'] == 'pSQFynH1VxkfSmehRXlZWw']
sb_x = single_business['text']
y_sb = single_business['binary_stars']

# Fit the vectorizer once; sb_xtrain_vec aliases it because later cells read
# sb_xtrain_vec.vocabulary_. (Previously a second identical vectorizer was
# fitted on the same text.)
count_vect = CountVectorizer(stop_words='english', min_df=2, ngram_range=(1, 2))
sb_xtrain_counts = count_vect.fit_transform(sb_x)
sb_xtrain_vec = count_vect

tfidf_transformer = TfidfTransformer()
sb_xtrain_tfidf = tfidf_transformer.fit_transform(sb_xtrain_counts)

nb_sb = MultinomialNB()
nb_sb.fit(sb_xtrain_tfidf, y_sb)

# Class-1 minus class-0 log probability per feature: positive values lean
# towards binary_stars == 1, negative values towards binary_stars == 0.
linear_weights_sb = nb_sb.feature_log_prob_[1] - nb_sb.feature_log_prob_[0]

ranked_sb = np.argsort(linear_weights_sb)  # ascending: most negative first
top_negative_features = ranked_sb[0:10]
top_positive_features = ranked_sb[-10:]

# Invert term -> column once instead of scanning the vocabulary per index.
sb_index_to_term = {col: term for term, col in sb_xtrain_vec.vocabulary_.items()}

print("Most negative features:")
for idx in top_negative_features:
    term = sb_index_to_term.get(idx)
    if term is not None:
        print("  {:s} ({:.02f})".format(term, linear_weights_sb[idx]))

print("")
print("Most positive features:")
for idx in top_positive_features:
    term = sb_index_to_term.get(idx)
    if term is not None:
        print("  {:s} ({:.02f})".format(term, linear_weights_sb[idx]))

Most negative features:
  rude (-1.66)
  overrated (-1.50)
  worst (-1.39)
  burned (-1.36)
  mediocre (-1.34)
  attitude (-1.26)
  overpriced (-1.25)
  disappointment (-1.24)
  tasteless (-1.20)
  average pizza (-1.18)

Most positive features:
  margherita (1.73)
  excellent (1.74)
  loved (1.86)
  fresh (1.86)
  rosa (1.88)
  favorite (1.90)
  amazing (1.95)
  fantastic (1.96)
  perfect (2.10)
  delicious (2.23)


In [None]:
# Inspection cell. Only the last expression of a cell is displayed, so the
# len() value on the first line is computed and then discarded.
len(linear_weights_sb)
sb_xtrain_vec.vocabulary_.items()

In [None]:
# Number of entries in the single-business weight vector.
len(linear_weights_sb)


In [None]:
# Number of entries in the corpus-wide weight vector.
len(linear_weights)


In [None]:
# Display every (term, column index) pair of the corpus vocabulary.
X_train_vec.vocabulary_.items()

In [71]:
import operator

# For the n highest-weighted terms of the single-business model, compute how
# much their weight differs from the weight the same term has in the
# corpus-wide model (business weight minus corpus weight).
n = 30
top_n_positive_features = np.argsort(linear_weights_sb)[-n:]

# Invert term -> column for the business vocabulary once; the corpus
# vocabulary is already keyed by term, so both lookups are plain dict
# accesses instead of nested scans.
sb_index_to_term = {col: term for term, col in sb_xtrain_vec.vocabulary_.items()}

diff_dict = {}
for idx in top_n_positive_features:
    term = sb_index_to_term.get(idx)
    corpus_idx = X_train_vec.vocabulary_.get(term)
    # Terms missing from the corpus vocabulary are skipped.
    if corpus_idx is not None:
        diff_dict[term] = linear_weights_sb[idx] - linear_weights[corpus_idx]

# Ascending by difference; the slice shows the 25 largest.
sorted_diff_dict = sorted(diff_dict.items(), key=operator.itemgetter(1))
sorted_diff_dict[-25:]

[('awesome', -0.8684038252389534),
 ('wonderful', -0.8115467990149252),
 ('excellent', -0.642301060147072),
 ('fantastic', -0.6189137928947295),
 ('perfect', -0.5692198201260341),
 ('amazing', -0.5395674559482213),
 ('delicious', -0.42350316877795),
 ('loved', -0.18473832061680717),
 ('love', -0.17753473442383694),
 ('favorite', -0.12668457492905727),
 ('pistachios', -0.08242755909203581),
 ('great', -0.07340085367247262),
 ('enjoyed', -0.05390112384602652),
 ('tasty', -0.0514100613361812),
 ('best', 0.0025272868358445777),
 ('atmosphere', 0.07825571784247654),
 ('biancoverde', 0.13165065149458854),
 ('arugula', 0.4002855932547664),
 ('fresh', 0.45694293096775596),
 ('sonny', 0.5595678051335291),
 ('margherita', 0.8879465223813199),
 ('prepared', 0.9005807505051298),
 ('mozzarella', 0.9354058078367284),
 ('wise', 1.3724854519063552),
 ('rosa', 1.3969290621870503)]

In [None]:
import operator

# Order the (term, difference) pairs by difference, ascending, and show the
# 25 largest.
sorted_diff_dict = list(diff_dict.items())
sorted_diff_dict.sort(key=operator.itemgetter(1))
sorted_diff_dict[-25:]

In [None]:
# Number of entries in the corpus-wide weight vector.
len(linear_weights)
# linear_weights[0:10]

In [None]:
# Select the reviews of one business and split them into text and label.
single_business = x_df.loc[x_df['business_id'] == 'pSQFynH1VxkfSmehRXlZWw']
sb_x = single_business.loc[:, 'text']
y_sb = single_business.loc[:, 'binary_stars']

In [None]:
# Build a term -> idf lookup for the single-business reviews.
# NOTE: despite its name, word2tfidf holds the transformer's idf_ values
# (one inverse-document-frequency weight per term), not tf-idf scores.
cv = CountVectorizer(stop_words='english')
sb_cv = cv.fit_transform(sb_x)
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(sb_cv)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
word2tfidf = dict(zip(feature_names, tfidf_transformer.idf_))


In [None]:
# Mean tf-idf value of each term across the single-business reviews,
# shown for the 20 highest-weighted terms.
weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(cv, 'get_feature_names_out'):
    terms = list(cv.get_feature_names_out())
else:
    terms = cv.get_feature_names()

weights_df = pd.DataFrame({'term': terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

In [None]:
# Preview nb_score_dict as a Series named 'score' whose index is named 'term'.
# DataFrame alternative, kept for reference:
#   pd.DataFrame(nb_score_dict.items(), columns=['term', 'score'])
s = pd.Series(nb_score_dict, name='score').rename_axis('term')
s.reset_index()  # result is not assigned, so s remains a Series
s.head()

In [None]:
# Two-column frame (term, score) built from the NB score dictionary.
nb_df = pd.DataFrame.from_records(
    list(nb_score_dict.items()),
    columns=['term', 'score'],
)


In [None]:
# Left-join the NB scores onto the per-term weights; terms without a score
# keep their row with a missing score.
biz_out = weights_df.merge(nb_df, how='left', on='term')

In [None]:
# polarity = per-term weight multiplied by the NB score; show the 50 rows
# with the largest weight.
biz_out['polarity'] = biz_out['weight'].mul(biz_out['score'])
biz_out.sort_values('weight', ascending=False).head(50)

In [None]:
# Per-term polarity: the word2tfidf value multiplied by the term's NB score,
# with terms absent from nb_score_dict contributing 0.
polarity_dict = {
    term: nb_score_dict.get(term, 0) * idf
    for term, idf in word2tfidf.items()
}


    

In [None]:
import operator

# Order the (term, polarity) pairs by polarity, ascending, and show the 40
# largest.
sorted_polarity_dict = list(polarity_dict.items())
sorted_polarity_dict.sort(key=operator.itemgetter(1))
sorted_polarity_dict[-40:]

In [None]:
# Display every (term, value) pair stored in word2tfidf.
word2tfidf.items()

In [None]:
# Preview the first ten (term, column index) pairs of the corpus vocabulary.
# vocabulary_ is a dict, which cannot be sliced directly, so its items are
# materialised into a list first.
list(X_train_vec.vocabulary_.items())[0:10]

In [None]:
# Look up one term in both mappings: its word2tfidf value and its column in
# the corpus vocabulary (None where the term is absent).
# Both results are returned as a tuple so that both are displayed; as two
# separate statements only the last one was shown.
# The term was previously spelled 'margerhita', which does not match the
# 'margherita' feature printed by the single-business analysis.
term = 'margherita'
word2tfidf.get(term), X_train_vec.vocabulary_.get(term)

In [None]:
# NB weight of every corpus term whose weight is positive, keyed by term.
# The previous version iterated over the weight values themselves and
# compared them with column indices, and wrote to the undefined name
# `nb_score`, so it could not populate nb_score_dict.
nb_score_dict = {
    term: linear_weights[col]
    for term, col in X_train_vec.vocabulary_.items()
    if linear_weights[col] > 0
}


In [None]:
# Polarity of the n highest-weighted corpus terms that also occur in
# word2tfidf: NB weight multiplied by the word2tfidf value.
# NOTE(review): the previous version multiplied by the vocabulary column
# index rather than the NB weight, and never used the loop variable over
# top_corpus_features; this rewrite assumes the intent was "weight * value
# for the top-n terms" - confirm.
n = 10
top_corpus_features = np.argsort(linear_weights)[-n:]

# Invert term -> column once so each top index resolves with a dict access.
index_to_term = {col: term for term, col in X_train_vec.vocabulary_.items()}

polarity_dict = {}

for idx in top_corpus_features:
    word = index_to_term.get(idx)
    score = word2tfidf.get(word)
    if score is not None and score > 0:
        polarity_dict[word] = linear_weights[idx] * score


In [None]:
# Polarity of every word2tfidf term that also exists in the corpus
# vocabulary: NB weight multiplied by the word2tfidf value.
# The previous version multiplied by the vocabulary column index (an
# arbitrary position) instead of the term's NB weight, and scanned the whole
# vocabulary for each word.
polarity_dict = {}

for word, score in word2tfidf.items():
    col = X_train_vec.vocabulary_.get(word)
    if score > 0 and col is not None:
        polarity_dict[word] = linear_weights[col] * score


In [None]:
# Compare the vocabulary size with the weight-vector length. Only the last
# expression of a cell is displayed, so the first len() value is discarded.
len(X_train_vec.vocabulary_)
len(linear_weights)

### Practice area below

In [None]:
# Preview the first rows of the review table.
df.head()

In [None]:
# Preview the first rows of the business table.
az_biz.head()

In [None]:
# Fit a CountVectorizer with default settings on the training text.
vec = CountVectorizer()
vec.fit(X_train)

In [None]:
# Display the most recently computed top_positive_features index array.
top_positive_features

In [None]:
# Print the term that `vec` assigns to each index in top_positive_features.
# NOTE(review): top_positive_features was computed from a model trained on a
# differently configured vectorizer, so these indices presumably do not line
# up with vec's vocabulary - confirm before relying on the output.
# The vocabulary is inverted once so each index is resolved with a dict
# access instead of a full scan.
vec_index_to_term = {col: term for term, col in vec.vocabulary_.items()}
for idx in top_positive_features:
    term = vec_index_to_term.get(idx)
    if term is not None:
        print(term)
    