# Yelp Sentiment Text Analysis
This Yelp dataset has information on restaurants (e.g., type of food, price range, etc.) as well as reviews written by patrons. The output variable is the star rating (1-5). I implemented the PMI (Pointwise Mutual Information) approach to sentiment analysis and ran the classification model with the sentiment scores.

In [5]:
import pandas as pd
result = pd.read_csv('Yelp Results.csv')
raw = pd.read_csv('Yelp Data Restaurant Reviews Ratings.csv')

# group ratings high or low
raw['High'] = 0
raw.ix[raw['stars']>3, 'High'] = 1

# take subset
raw = raw[0:705]

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer # this tokenizer to remove punctuation
tokenizer = RegexpTokenizer(r'\w+')

# create a list of documents where stop words are removed
documents = []
for i in range(0, len(raw)):
    doc = raw.iloc[i]["Review"]
    doc2 = tokenizer.tokenize(doc)
    doc3 = [word for word in doc2 if word not in stopwords.words('english')]
    doc = ' '.join(doc3)
    documents.append(doc)



In [7]:
from collections import Counter

# word frequencies for each document
word_frequencies = [Counter(document.split()) for document in documents]

# document frequency
document_frequencies = Counter()
map(document_frequencies.update, (word_frequency.keys() for word_frequency in word_frequencies))

print(document_frequencies)



In [8]:
from collections import defaultdict
 
com = defaultdict(lambda : defaultdict(int))
 
# create matrix that shows number of times word w1 has been seen with w2
for i in range(0, len(raw)):
    doc = raw.iloc[i]["Review"]
    terms = word_tokenize(doc)
    terms_only = [term for term in terms if term not in stopwords.words('english')]
    
    # a word is seen with a word if there are 2 words or less between them
    for i in range(len(terms_only)-3):  
        for j in range(i+1, i+3):
            w1, w2 = sorted([terms_only[i], terms_only[j]])      
            if w1 != w2:
                com[w1][w2] += 1



In [9]:
# count number of cases of positive and negative

positive_vocab = ['good', 'nice', 'great', 'awesome', 'outstanding', 'wonderful', 'love']
negative_vocab = ['hate', 'awful', 'terrible', 'disgusting', 'gross', 'horrible', 'dislike']

n_p = 0.01 # number of cases of positive words
for word, count in document_frequencies.items():
    if word in positive_vocab:
        n_p += count
n_n = 0.01 # number of cases of negative words
for word, count in document_frequencies.items():
    if word in negative_vocab:
        n_n += count
        
print n_p
print n_n

1015.01
44.01


In [10]:
import math

semantic_orientation = {}
for x in document_frequencies:
    pos = 0.01 # counter for number of cases with phrase and positive word
    neg = 0.01 # counter for number of cases with phrase and negative word
    for y in com[x]:
        if y in positive_vocab:
            pos += 1
        elif y in negative_vocab:
            neg += 1
    numer = pos * n_n
    denom = neg * n_p
    semantic_orientation[x] = math.log(numer/denom) / math.log(2)
    
semantic_orientation

{'yellow': -4.527518759126944,
 'four': -4.527518759126944,
 'Does': -4.527518759126944,
 'hanging': -4.527518759126944,
 'Until': -4.527518759126944,
 'looking': 3.123532932051984,
 'LAST': -4.527518759126944,
 'Ronald': -4.527518759126944,
 'lord': -4.527518759126944,
 'SPF': -4.527518759126944,
 'Refreshing': -4.527518759126944,
 'Frankly': -4.527518759126944,
 'propane': -4.527518759126944,
 'regional': -4.527518759126944,
 'eggrolls': -4.527518759126944,
 'Pineapples': -4.527518759126944,
 'leisurely': -4.527518759126944,
 'bringing': 2.1306927236248505,
 'disturb': -4.527518759126944,
 'prize': -4.527518759126944,
 'wooden': -4.527518759126944,
 'clientele': -4.527518759126944,
 'tabletops': -4.527518759126944,
 'lime': -4.527518759126944,
 'solid': -4.527518759126944,
 'woods': -4.527518759126944,
 'Smallish': -4.527518759126944,
 'commented': -4.527518759126944,
 'Screw': -4.527518759126944,
 'specially': -4.527518759126944,
 'tired': -4.527518759126944,
 'scrapes': -4.52751875

In [11]:
# calculating average sentiment for each review

test = raw[:100] # testing it out first

def func(x):
    score = 0.0
    total = 0.0
    doc = word_tokenize(x)
    for wordx in doc:
        for wordy, semantic in semantic_orientation.items():
            if wordx == wordy:
                score += semantic
                total += 1
    return score/total
    score = 0.0
    total = 0.0        

test['Avg'] = test['Review'].apply(func)
 
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,stars,votes_cool,votes_funny,votes_useful,Cheap,Moderate,Expensive,VeryExpensive,American,Chinese,...,Italian,Greek,Mediterranean,Mexican,Thai,Vietnamese,Others,Review,High,Avg
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,This location is out of business. I drove by i...,0,-3.483027
1,2,2,2,2,1,0,0,0,0,0,...,0,0,0,0,0,0,1,= = = = = = CLOSED = = = = = =This JB s locati...,0,-2.205522
2,4,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,This is just a basic (albeit mini) chain greas...,1,-3.017402
3,3,0,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,1,Whenever I offer to take my mom out to lunch s...,0,-2.571338
4,3,7,9,9,1,0,0,0,0,0,...,0,0,0,0,0,0,1,If I say it wasn t as bad as I was expecting i...,0,-1.910531
5,2,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,I ve always said if the guacamole chips and s...,0,-2.120166
6,4,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,Had the signature Black Chile entree. It was ...,1,-0.692864
7,3,1,1,4,0,1,0,0,0,0,...,0,0,0,1,0,0,0,After hitting up the bank to sign some paper w...,0,-2.495713
8,4,3,2,3,0,1,0,0,0,0,...,0,0,0,1,0,0,0,Great happy hour deals here! I loved the Cotij...,1,-0.183125
9,3,2,0,4,0,1,0,0,0,0,...,0,0,0,1,0,0,0,Fine. Just fine. C+/B- average-- all around. ...,0,-2.151837
