In [2]:
import pandas as pd

### Train Dataset

In [3]:
# Load the small training split; `ds` is the frame the later cleaning and classification cells read from.
ds = pd.read_csv("data/train/small.csv")

In [4]:
# Preview the first four rows to see which columns were loaded.
ds.head(4)

Unnamed: 0,_id,business_id,review_id,stars,text,useful
0,5ad77a48d209ba1dce460064,eLFfWcdb7VkqNyTONksHiQ,7ECYfYWSx5_SEIYsDiCZ1A,5,"Great all you can eat Korean BBQ. Very clean, ...",1
1,5ad77a94d209ba1dce5c7096,Yp9w4nhUowBU_IS_StFXbQ,uBkU8wLvQCCCL0TZ8ESLaw,5,This place is sort of hidden. It's behind the...,1
2,5ad77b87d209ba1dce91f888,5md4i4diIWMNYQgHMj_Nbg,4xlVPqc8nuEWeIQYCL0FEA,5,The hoover dam kayaking tour was great. No pro...,0
3,5ad77aead209ba1dce6dccd7,XBEx2FI2GKmsjs7eWNr7Lw,CyaJqWU8bmaEPLWgxekAOg,5,"Awesome acai bowls. As a cheesecake lover, I w...",1


### Test Dataset

In [6]:
import json

TEST_FILE = 'data/test/10.json'
PREVIEW_COUNT = 3  # how many test reviews to print

# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the platform's default locale encoding.
with open(TEST_FILE, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Print the text and rating of the first few reviews of the first "yelp" entry.
for review in data["yelp"][0]["reviews"][:PREVIEW_COUNT]:
    print("-----------------------------------------------------")
    print(review["text"])
    print("Rating: ", review["ratings"])

### Other size variations of the training set

In [8]:
# Load the 50K variant of the training set and show its (rows, columns) shape.
d = pd.read_csv("data/train/50K.csv")
d.shape

(50000, 6)

In [9]:
# Load the 10K variant and show its shape. This rebinds `d`, so the 50K frame is no longer reachable.
d = pd.read_csv("data/train/10K.csv")
d.shape

(10000, 6)

In [10]:
# Derived column: useful votes times star rating, plus 1, so a review with zero useful votes gets 1 rather than 0.
# NOTE: this adds the column to `d` in place.
d["indicator"] = d["useful"]  * d["stars"] +1

In [11]:
# Show the first rows, now including the new `indicator` column.
d.head()

Unnamed: 0,_id,business_id,review_id,stars,text,useful,indicator
0,5ad77a66d209ba1dce5030b2,kcF-en40P3J9L8nLy5lKqQ,XJEz7zcJzwMWyoAv22hIvQ,5,Everything about Central Bistro was fantastic!...,3,16
1,5ad77b05d209ba1dce748580,ZibmYdOPKLlqDM9oR6xzOA,RX_u02e8FSaGeV2CEP5B9g,4,"Went before 6pm, before the dinner crowd and h...",0,1
2,5ad77b1fd209ba1dce7a37dd,LXrSnbUlEnw_fI7A2Dpp9Q,Jc3j0FGT_KWXhA_UWgj9eg,4,I shop at Nordstrom somewhat regularly.\n\nA b...,12,49
3,5ad77ac3d209ba1dce66dbde,66jwLkSd6M0cZulZrA3zcw,EMIWAZ_WKD-_d4VQOEQRQg,5,"Came for dinner, it happened to be late night ...",0,1
4,5ad77abfd209ba1dce65da1c,hhLKp8ilQac7M5UTTpnPnQ,V6KzjQzd38q_1i3Ghpurzw,5,I am VERY particular about getting my nails do...,5,26


REMOVE STOPWORDS

In [54]:
from nltk.corpus import stopwords
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, remove_stopwords = True):
    """Lower-case ``text`` and optionally drop English stop words.

    Parameters
    ----------
    text : str
        Text to normalise. Only ``lower()`` and ``split()`` are called on it,
        so str-like objects work too (the notebook also passes TextBlob
        objects).
    remove_stopwords : bool, default True
        When true, split the lower-cased text on whitespace, discard every
        token found in NLTK's English stop-word list and re-join the rest
        with single spaces.

    Returns
    -------
    The lower-cased text. With ``remove_stopwords=True`` this is a plain
    ``str`` holding only the surviving tokens.

    Notes
    -----
    Tokens are split on whitespace only, so punctuation stays attached to
    words ("place." is not the stop word "place").
    TODO: expanding English contractions to their long forms was planned
    here but is not implemented.
    """
    text = text.lower()

    if remove_stopwords:
        # The stop-word set is cached across calls; this function runs once
        # per review, and re-reading the corpus each time is wasted work.
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [55]:
# Quick check: the stop words "i" and "this" are dropped after lower-casing.
clean_text("I love this place")
    

'love place'

### Section 2 : Cleaning the Data
Prerequisites

<p> Run <span style="color:red"> pip install textblob </span> from the terminal. </p>
- Documentation: http://textblob.readthedocs.io/en/dev/

If TextBlob fails with the error below, download the missing NLTK resource it names:

<pre>
**********************************************************************
  Resource punkt not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt')
**********************************************************************
</pre>

In [56]:
from textblob import TextBlob
import nltk
# Fetch the NLTK resources TextBlob needs. `punkt` is the resource named in
# the "Resource punkt not found" error quoted in the prerequisites text.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pannaga/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /Users/pannaga/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [57]:
# Re-display the first rows of `ds` before pulling sample review texts from it.
ds.head(4)

Unnamed: 0,_id,business_id,review_id,stars,text,useful
0,5ad77a48d209ba1dce460064,eLFfWcdb7VkqNyTONksHiQ,7ECYfYWSx5_SEIYsDiCZ1A,5,"Great all you can eat Korean BBQ. Very clean, ...",1
1,5ad77a94d209ba1dce5c7096,Yp9w4nhUowBU_IS_StFXbQ,uBkU8wLvQCCCL0TZ8ESLaw,5,This place is sort of hidden. It's behind the...,1
2,5ad77b87d209ba1dce91f888,5md4i4diIWMNYQgHMj_Nbg,4xlVPqc8nuEWeIQYCL0FEA,5,The hoover dam kayaking tour was great. No pro...,0
3,5ad77aead209ba1dce6dccd7,XBEx2FI2GKmsjs7eWNr7Lw,CyaJqWU8bmaEPLWgxekAOg,5,"Awesome acai bowls. As a cheesecake lover, I w...",1


Texts of the first six reviews (`ds.loc[:5]` includes the end label, so rows 0–5 are selected)

In [58]:
# `.loc` slices by label and includes the end label, so with the default
# integer index this selects rows 0 through 5 (six reviews, not five).
sample_reviews = ds.loc[:5, "text"]

In [59]:
# Show the full, untruncated text of the first sampled review.
sample_reviews[0]

'Great all you can eat Korean BBQ. Very clean, good food, our server Jay-Ar was great and very attentive to his customers. I would highly recommend this place.'

In [60]:
# For each sampled review, print its part-of-speech tags, noun phrases and sentiment.
for review in sample_reviews:
    r_meta = TextBlob(review)  # parse once; all three views below come from this blob
    print("==========================Tags=============================")
    print(r_meta.tags)
    print("==========================Nouns=============================")
    print(r_meta.noun_phrases)
    print("==========================Sentiment :) / :( =============================")
    # The sentiment header was previously printed with nothing under it.
    print(r_meta.sentiment)

[('Great', 'NNP'), ('all', 'DT'), ('you', 'PRP'), ('can', 'MD'), ('eat', 'VB'), ('Korean', 'JJ'), ('BBQ', 'NNP'), ('Very', 'RB'), ('clean', 'JJ'), ('good', 'JJ'), ('food', 'NN'), ('our', 'PRP$'), ('server', 'JJ'), ('Jay-Ar', 'NNP'), ('was', 'VBD'), ('great', 'JJ'), ('and', 'CC'), ('very', 'RB'), ('attentive', 'JJ'), ('to', 'TO'), ('his', 'PRP$'), ('customers', 'NNS'), ('I', 'PRP'), ('would', 'MD'), ('highly', 'RB'), ('recommend', 'VB'), ('this', 'DT'), ('place', 'NN')]
['korean bbq', 'good food', 'jay-ar']
[('This', 'DT'), ('place', 'NN'), ('is', 'VBZ'), ('sort', 'NN'), ('of', 'IN'), ('hidden', 'NN'), ('It', 'PRP'), ("'s", 'VBZ'), ('behind', 'IN'), ('the', 'DT'), ('tower', 'NN'), ('suites', 'VBZ'), ('so', 'RB'), ('you', 'PRP'), ('kind', 'NN'), ('of', 'IN'), ('have', 'NN'), ('to', 'TO'), ('look', 'VB'), ('for', 'IN'), ('it', 'PRP'), ('on', 'IN'), ('the', 'DT'), ('map', 'NN'), ('or', 'CC'), ('ask', 'VB'), ('the', 'DT'), ('workers', 'NNS'), ('to', 'TO'), ('lead', 'VB'), ('you', 'PRP'), ('

#### Sentiment analysis on the sample reviews

In [61]:
HAPPY_POLARITY_THRESHOLD = 0.5  # polarity strictly above this is reported as "Happy"

# Label each sampled review from its TextBlob polarity and print the score as a percentage.
for review in sample_reviews:
    # Score the review once; building a TextBlob and computing its sentiment
    # twice per review (test + print) doubled the work for the same number.
    polarity = TextBlob(review).sentiment.polarity
    label = "Happy " if polarity > HAPPY_POLARITY_THRESHOLD else "Sad"
    print(label, polarity*100)

Happy  57.611111111111114
Sad 16.602564102564102
Sad 42.1875
Sad 36.66666666666667
Sad 25.659340659340657
Sad 29.746031746031743


TODO list
- Correct the spelling in all reviews as far as possible
- Apply Naive Bayes

<p style="color:red"> Learn the topics below from the TextBlob quickstart: http://textblob.readthedocs.io/en/dev/quickstart.html </p>
Tutorial: Quickstart

    Create a TextBlob
    Part-of-speech Tagging
    Noun Phrase Extraction
    Sentiment Analysis
    Tokenization
    Words Inflection and Lemmatization
    WordNet Integration
    WordLists
    Spelling Correction
    Get Word and Noun Phrase Frequencies
    Translation and Language Detection
    Parsing
    TextBlobs Are Like Python Strings!
    n-grams
    Get Start and End Indices of Sentences




In [62]:
# Spelling-correction example: `correct()` returns a new TextBlob in which "havv" becomes "have".
b = TextBlob("they havv good chicken!")
b.correct()

TextBlob("they have good chicken!")

### Section 3: Classification Model

Example based on the TextBlob classifiers tutorial: http://textblob.readthedocs.io/en/dev/classifiers.html#classifiers

In [63]:
 >>> train = [
...     ('I love this sandwich.', 'pos'),
...     ('this is an amazing place!', 'pos'),
...     ('I feel very good about these beers.', 'pos'),
...     ('this is my best work.', 'pos'),
...     ('what an awesome view', 'pos'),
...     ('I do not like this restaurant', 'neg'),
...     ('I am tired of this stuff.', 'neg'),
...     ("I can't deal with this", 'neg'),
...     ('he is my sworn enemy!', 'neg'),
...     ('my boss is horrible.', 'neg'),
        ('This place is good.','pos')
... ]
>>> test = [
...     ('the beer was good.', 'pos'),
...     ('I do not enjoy my job', 'neg'),
...     ("I ain't feeling dandy today.", 'neg'),
...     ("I feel amazing!", 'pos'),
...     ('Gary is a friend of mine.', 'pos'),
...     ("I can't believe I'm doing this.", 'neg')
        
... ]

In [64]:
from textblob.classifiers import NaiveBayesClassifier

# Train a Naive Bayes classifier on the toy `train` sentences.
# (Doctest ">>>" prompts removed so the cell is valid plain Python.)
cl = NaiveBayesClassifier(train)

In [65]:
# Probe the toy classifier with an unseen sentence.
cl.classify("This restaurant is good.")

'pos'

In [66]:
# Probe with an unfinished sentence that carries no opinion word; the classifier still returns a label.
cl.classify("pasta was ")

'pos'

In [67]:
# List the features that weigh most on the toy classifier's pos/neg decision.
cl.show_informative_features()  

Most Informative Features
            contains(my) = True              neg : pos    =      1.9 : 1.0
             contains(I) = True              neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.5 : 1.0
         contains(place) = False             neg : pos    =      1.4 : 1.0
          contains(good) = False             neg : pos    =      1.4 : 1.0
            contains(an) = False             neg : pos    =      1.4 : 1.0
            contains(my) = False             pos : neg    =      1.3 : 1.0
         contains(tired) = False             pos : neg    =      1.2 : 1.0
      contains(horrible) = False             pos : neg    =      1.2 : 1.0
          contains(boss) = False             pos : neg    =      1.2 : 1.0


Applying this classifier to the dataset:

In [68]:
def create_ds_train(reviews=None, min_positive_stars=4):
    """Build ``(text, label)`` training pairs for the Naive Bayes classifier.

    Each review text is spell-corrected with TextBlob, passed through
    ``clean_text`` and paired with a label derived from its star rating.

    Parameters
    ----------
    reviews : DataFrame-like, optional
        Object with ``"stars"`` and ``"text"`` columns. Defaults to the
        notebook-level frame ``ds``, which is what the original
        zero-argument call used.
    min_positive_stars : int, default 4
        Reviews with at least this many stars are labelled ``'pos'``;
        all others are labelled ``'neg'``.

    Returns
    -------
    list of (str, str)
        One ``(cleaned_text, label)`` tuple per row, in row order.
    """
    if reviews is None:
        reviews = ds

    ds_train = []
    # Walk the two columns directly instead of materialising a Series per
    # row with iterrows(); the values seen are the same.
    for stars, text in zip(reviews["stars"], reviews["text"]):
        label = 'pos' if stars >= min_positive_stars else 'neg'
        # Spell-correct first, then lower-case and strip stop words.
        ds_train.append((clean_text(TextBlob(text).correct()), label))
    return ds_train

In [69]:
from textblob.classifiers import NaiveBayesClassifier
# Train on the review-derived pairs. This rebinds `cl`, replacing the toy classifier built earlier.
cl = NaiveBayesClassifier(create_ds_train())

In [71]:
# List the most informative features of the classifier trained on the review data.
cl.show_informative_features() 

Most Informative Features
         contains(great) = True              pos : neg    =      6.0 : 1.0
      contains(friendly) = True              pos : neg    =      5.7 : 1.0
      contains(terrible) = True              neg : pos    =      5.7 : 1.0
           contains(pay) = True              neg : pos    =      5.7 : 1.0
             contains(o) = True              neg : pos    =      5.7 : 1.0
      contains(customer) = True              neg : pos    =      5.0 : 1.0
           contains(due) = True              neg : pos    =      4.5 : 1.0
         contains(hotel) = True              neg : pos    =      4.5 : 1.0
         contains(check) = True              neg : pos    =      4.5 : 1.0
       contains(weekend) = True              neg : pos    =      4.5 : 1.0
