<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Explore-NLP-methods/vizs-with-champaign-restaurant-review-data" data-toc-modified-id="Explore-NLP-methods/vizs-with-champaign-restaurant-review-data-1">Explore NLP methods/vizs with champaign restaurant review data</a></span><ul class="toc-item"><li><span><a href="#Get-data" data-toc-modified-id="Get-data-1.1">Get data</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-1.2">Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Data-preview" data-toc-modified-id="Data-preview-1.2.1">Data preview</a></span></li><li><span><a href="#Tokenize" data-toc-modified-id="Tokenize-1.2.2">Tokenize</a></span></li><li><span><a href="#Make-dictionary-&amp;-corpus" data-toc-modified-id="Make-dictionary-&amp;-corpus-1.2.3">Make dictionary &amp; corpus</a></span></li></ul></li></ul></li></ul></div>

# Explore NLP methods/vizs with champaign restaurant review data

In [1]:
#data 
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import pandas as pd

#spacy for tokenization
from spacy.lang.en import English # Create the nlp object
import spacy
#gensim for similarity
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities.docsim import MatrixSimilarity,Similarity

#sklearn for distance metrics
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Import the English language class
from spacy.lang.en import English # Create the nlp object
nlp = English()

#Doc

 # Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

for token in doc:
    # Iterate over tokens in a Doc for token in doc:
    print(token.text)



#Span 

# A slice from the Doc is a Span object
span = doc[1:4]
# Get the span text via the .text attribute print(span.text)


#Lexical attributes
doc = nlp("It costs $5.")
print('Index: ', [token.i for token in doc]) print('Text: ', [token.text for token in doc])
print('is_alpha:', [token.is_alpha for token print('is_punct:', [token.is_punct for token print('like_num:', [token.like_num for token
in doc]) in doc]) in doc])

#Model packages
##models to predict part of speech,named entities etc with context


#Part of speech#
# Load the small English model
nlp = spacy.load('en_core_web_sm') # Process a text
doc = nlp("She ate the pizza")
# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag print(token.text, token.pos_)
    print(token.text, token.pos_)

#Synactic Dependencies#
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)
    
#Named Entity#
# Process a text
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") # Iterate over the predicted entities
for ent in doc.ents:
# Print the entity text and its label
print(ent.text, ent.label_)


#Get defnitions of tags

spacy.explain('NNP')


## Get data

In [5]:
postgres_db_params = {'drivername': 'postgres',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)
engine = create_engine(postgres_db_url)

In [6]:
review_df = pd.read_sql(
    con=engine.connect(),
    sql='''
    SELECT *
    FROM champaign_restaurant_reviews
    ''')

## Preprocessing

### Data preview

In [5]:
review_df.head(2)

Unnamed: 0,business_id,name,review_id,user_id,text,stars,date
0,9A1C1f0m4nQltQrOOTl-Kw,Orange & Brew,m4AXzV9l14iFBd9DRdM82w,6X0i-oGUbh5DZdTHzFuKfg,The building is lovely. The remodel after But...,1.0,2013-12-07 02:26:13
1,VHsNB3pdGVcRgs6C3jt6Zg,Dublin O'Neil's,A-yKlSLEQQcoHR5q2lCyHg,Yximlvn0cfb3yVDaLuXDxw,LOVE LOVE LOVE this place! I'm a bit of a suck...,5.0,2013-08-03 19:59:56


In [23]:
review_df.head(2).text.values[0]

'The building is lovely.  The remodel after Buttita\'s is as good as one can do when turning a lovely restaurant into a sports bar-style restaurant (though why one would want to do that is beyond me).  And that\'s where the good stuff stops.\nIn short, the service and all things related to it were glacially slow; the food, when it arrived, was mediocre at best -- and missing key elements (one whole order, and other parts).  This restaurant cannot possibly last if they don\'t figure out how to greet and serve customers, or get the food that was ordered actually to the tables.\nThe full story.\nUpon arrival, we found a vacant hostess stand, then a long walk down the hall to the dining room along which we saw no one who works there.  Luckily, we ran into friends who assured us that we could seat ourselves.\nAfter waiting at our table for some 15 minutes, we used a bit of self-help, and accosted a waitress to ask for some menus.  After another 5-7 minutes another waitress finally showed up

In [8]:
first_review=review_df.text.iloc[0]

### Tokenize

In [134]:
def tokenize_text(text_str,nlp_obj):
    '''
    use spacy to separate text into words
    (ie tokenization)
    and return the lemmatization 
    (ie feet for footing and foot)
    for only nouns and adjectives
    
    TODO: refine methodology
    '''
    spacy_doc = nlp_obj(text_str)
    
    tokenized_doc = [
        token.lemma_
        for token in spacy_doc
        if token.pos_ in ("NOUN","ADJ")
        ]
    
    return tokenized_doc
    #return spacy_doc
        

In [56]:
nlp = English()

In [82]:
nlp = spacy.load('en_core_web_sm')

In [36]:
doc = nlp(first_review)

In [37]:
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Buttita PERSON
one CARDINAL
some 15 minutes TIME
5-7 minutes TIME
three CARDINAL
1 CARDINAL
one CARDINAL
one CARDINAL
about 7 minutes TIME
10 CARDINAL
3 minutes TIME
Ten minutes TIME
Hardee PERSON


In [38]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

### Make dictionary & corpus

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
#TODO: combine reviews in SQL to scale

In [165]:
reviews_dictionary = Dictionary()

In [179]:
reviews_tokenized = (
    review_df
    #.head(2)
    .groupby('business_id')
    .text
    .apply(lambda x: ' '.join(x))
    .apply(tokenize_text,nlp_obj=nlp)
)

In [182]:
reviews_tokenized.head(3)

business_id
-2q4dnUw0gGJniGW2aPamQ    [girlfriend, place, diner, full, unofficial, w...
-5NXoZeGBdx3Bdk70tuyCw    [amazing, tender, pork, sandwich, good, homema...
-5dd-RjojGVK9hjAMCXVZw    [great, restaurant, inexpensive, american, mex...
Name: text, dtype: object

In [183]:
reviews_dictionary = Dictionary(reviews_tokenized)

In [184]:
reviews_dictionary.num_docs

701

In [185]:
review_df.business_id.unique().shape

(701,)

In [186]:
#corpus
reviews_corpus = [reviews_dictionary.doc2bow(doc) for doc in reviews_tokenized]

In [188]:
#tfidf with document being each restaurant and corpus being all restaurants
reviews_tfidf = TfidfModel(reviews_corpus)

In [190]:
reviews_tfidf

TypeError: 'TfidfModel' object is not callable

In [None]:
#similarity

In [None]:
#ratings

#location