# Load, process Yelp review data

In [27]:
import json
reviews_path = 'yelp_academic_dataset_review.json'
with open(reviews_path) as f:  # 5.9G
    reviews = [json.loads(next(f)) for x in range(int(1e7))]
len(reviews)

StopIteration: 

In [2]:
outlines = []
count = 1
for review in reviews:
    if review['stars'] >= 4:
        rating = 'positive'
    elif review['stars'] <= 2:
        rating = 'negative'
    else:
        continue
    outlines.append([review['text'], rating])
    count += 1
    if count > 10000:
        break
len(outlines)

10000

In [3]:
# Save out condensed data
import pandas as pd
data = pd.DataFrame(outlines, columns=['review_text', 'rating'])
outpath = 'review_sample.csv'
data.to_csv(outpath)

# Sample Yelp review data

In [2]:
# Load review data
import pandas as pd
path = 'review_sample10000.csv'
data = pd.read_csv(path, index_col=0)
data

Unnamed: 0,review_text,rating
0,"As someone who has worked with many museums, I...",negative
1,I am actually horrified this place is still in...,negative
2,I love Deagan's. I do. I really do. The atmosp...,positive
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",negative
4,"Oh happy day, finally have a Canes near my cas...",positive
...,...,...
9995,Amazing food. Glorious bevs. What more could y...,positive
9996,Wife and I have been going to Abuelos for year...,negative
9997,I had THE BEST VEGAN Gardein chicken plate wit...,positive
9998,Went there for the first time today and got ve...,positive


In [6]:
# Balanced sampling between negative and positive ratings
sample = pd.concat([data[data['rating']=='positive'].sample(500), data[data['rating']=='negative'].sample(500)])
sample = sample.sample(frac=1) # shuffle
sample['rating'].value_counts()

negative    500
positive    500
Name: rating, dtype: int64

In [7]:
# Save out
outpath = 'yelp_reviews_1000balanced.csv'
sample.to_csv(outpath)

# Train sentiment classifier

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
test_size = int(0.1 * len(data))
text_train, text_test, y_train, y_test = train_test_split(data['review_text'], data['rating'], test_size=test_size, random_state=7)
corpus = data['review_text']
vectorizer = TfidfVectorizer(min_df=1)
x_train = vectorizer.fit_transform(text_train) # corpus is a list of strings (documents)
x_test = vectorizer.transform(text_test) # corpus is a list of strings (documents)
print(x_train.shape)
print(x_test.shape)

(9000, 24634)
(1000, 24634)


In [14]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='liblinear')
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
clf.score(x_test, y_test)

0.911

# Try on AI experiences data

## From Yelp-trained logistic regression classifier

In [20]:
# path = '/home/mamille2/storyq/ai_experiences.csv'
path = 'ai_experiences.csv'
experiences = pd.read_csv(path)
experiences

Unnamed: 0,Text,Label,Shiyan,Cansu,Jie,Madeline,Criteria,Other Notes,Unnamed: 8
0,"Siri knows the true meaning of Christmas, offe...",Positive,Positive as it's a good experience although th...,"I thought it might be neutral, but then after ...",Started with a negative sacstic crique but lat...,"Positive: Humor of experience, and knowing a l...",,Weight towards what sentence?,
1,One of the most common interactions I have wit...,Positive,Neutral as it's a state of fact and it feels l...,I think the participant appreciates the opport...,The writer had a mixture of feelings toward AI...,"Negative: Sense of ""too comfortable"" and too g...",If there is a transition from one side to the ...,,
2,I remember one day while I was in graduate sch...,Positive,The negative part comes from the writer's frie...,"I think it is a funny story and in general, po...",Positive because AI technology seemed to be us...,Positive: The humor involved in pranks; AI as ...,"AI technology itself, not the people using it","Humor as positive, involving minor misfortune;...",
3,"Last week, my family was talking about winter ...",Still debating,It tends to be negative as the writer did not ...,Positive; at first the attitude is negative bu...,"Negative. Surprised, felt somewhat creepy, end...","Weird coincidence, so mildly negative; people ...",Would this person want to use AI or not? (Cansu),Making inferences is a human thing; consistent...,No agreement on last quote
4,My Apple watch counts each hour in which I hav...,,,,,,Would this person have this experience again? ...,,
5,One of the most frustrating encounters with AI...,,,,,,,,
6,Facebook tries to be helpful. Whenever I am o...,,,,,,,,
7,"Yesterday, I chatted with Emma, an agent that ...",,,,,,,,
8,"I, like most people, use Google search several...",,,,,,,,
9,I love using navigation apps to get places. Es...,,,,,,,,


In [21]:
corpus = experiences['Text']
bow = vectorizer.transform(corpus) # corpus is a list of strings (documents)
bow.shape

(14, 24634)

In [24]:
experiences['yelp10k_classifier'] = clf.predict(bow)
experiences.loc[:, ['Text', 'yelp10k_classifier']]

Unnamed: 0,Text,yelp10k_classifier
0,"Siri knows the true meaning of Christmas, offe...",positive
1,One of the most common interactions I have wit...,positive
2,I remember one day while I was in graduate sch...,negative
3,"Last week, my family was talking about winter ...",positive
4,My Apple watch counts each hour in which I hav...,negative
5,One of the most frustrating encounters with AI...,positive
6,Facebook tries to be helpful. Whenever I am o...,positive
7,"Yesterday, I chatted with Emma, an agent that ...",negative
8,"I, like most people, use Google search several...",positive
9,I love using navigation apps to get places. Es...,positive


In [25]:
experiences['yelp10k_classifier']

0     positive
1     positive
2     negative
3     positive
4     negative
5     positive
6     positive
7     negative
8     positive
9     positive
10    positive
11    positive
12    positive
13    negative
Name: yelp10k_classifier, dtype: object

## Show top features

In [30]:
import numpy as np
def print_top_features(vectorizer, clf, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    top_indices = np.argsort(clf.coef_[0])[-1*n:]
    print("\n".join(reversed([feature_names[j] for j in top_indices])))

In [32]:
import numpy as np
def print_bottom_features(vectorizer, clf, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    top_indices = np.argsort(clf.coef_[0])[:n]
    print("\n".join(reversed([feature_names[j] for j in top_indices])))

In [31]:
print_top_features(vectorizer, clf)

great
amazing
delicious
love
best
and
awesome
definitely
friendly
excellent
always
perfect
good
nice
little
loved
fantastic
very
highly
with


In [33]:
print_bottom_features(vectorizer, clf)

slow
then
won
left
poor
nothing
money
bad
over
asked
told
ok
bland
horrible
never
terrible
rude
worst
no
not


In [None]:
def get_informative_features(features_vectorizer, model, model_name, data_dirpath, n=10000):
    feats_index2name = {v: k for k, v in features_vectorizer.vocabulary_.items()}
    feature_weights = model.coef_[0]
    
    top_indices = np.argsort(feature_weights)[-1*n:]
    top_weights = np.sort(feature_weights)[-1*n:]
    bottom_indices = np.argsort(feature_weights)[:n]
    bottom_weights = np.sort(feature_weights)[:n]

    nontag_lines = [] # to sort and print
    lines = [] # to sort and print
    
    for i, (j, w) in enumerate(zip(reversed(top_indices), reversed(top_weights))):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            nontag_lines.append([i, feature_name, w, abs(w)])
#             print(f"{i}\t{feature_name}\t{w: .3f}")
        lines.append([i, feature_name, w, abs(w)])
    
    for i, (j, w) in enumerate(zip(bottom_indices, bottom_weights)):
        feature_name = feats_index2name[j]
        if not feature_name.startswith('tag'):
            nontag_lines.append([i, feature_name, w, abs(w)])
        lines.append([i, feature_name, w, abs(w)])


## From NLTK SentimentAnalyzer

In [17]:
from nltk.sentiment import SentimentAnalyzer

AttributeError: module 'scipy' has no attribute '_lib'