<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Explore-NLP-methods/vizs-with-champaign-restaurant-review-data" data-toc-modified-id="Explore-NLP-methods/vizs-with-champaign-restaurant-review-data-1">Explore NLP methods/vizs with champaign restaurant review data</a></span><ul class="toc-item"><li><span><a href="#Get-data" data-toc-modified-id="Get-data-1.1">Get data</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-1.2">Preprocessing</a></span><ul class="toc-item"><li><span><a href="#Data-preview" data-toc-modified-id="Data-preview-1.2.1">Data preview</a></span></li><li><span><a href="#Tokenize" data-toc-modified-id="Tokenize-1.2.2">Tokenize</a></span></li><li><span><a href="#Make-dictionary-&amp;-corpus" data-toc-modified-id="Make-dictionary-&amp;-corpus-1.2.3">Make dictionary &amp; corpus</a></span></li></ul></li></ul></li></ul></div>

# Explore NLP methods/vizs with champaign restaurant review data

In [32]:
#data 
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import pandas as pd

#spacy for tokenization
from spacy.lang.en import English # Create the nlp object
import spacy
#gensim for similarity
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities.docsim import MatrixSimilarity,Similarity

#sklearn for distance metrics
from sklearn.metrics.pairwise import cosine_similarity

## Get data

In [33]:
postgres_db_params = {'drivername': 'postgres',
                'database':'restaurants',
               'username': 'michaelkranz',
               'password': 'helloworld',
               'host': 'localhost',
               'port': 5432}

postgres_db_url = URL(**postgres_db_params)
engine = create_engine(postgres_db_url)

In [34]:
review_df = pd.read_sql(
    con=engine.connect(),
    sql='''
    SELECT *
    FROM champaign_restaurant_reviews
    ''')

## Preprocessing

### Data preview

In [35]:
review_df.head(2)

Unnamed: 0,business_id,name,review_id,user_id,text,stars,date
0,9A1C1f0m4nQltQrOOTl-Kw,Orange & Brew,m4AXzV9l14iFBd9DRdM82w,6X0i-oGUbh5DZdTHzFuKfg,The building is lovely. The remodel after But...,1.0,2013-12-07 02:26:13
1,VHsNB3pdGVcRgs6C3jt6Zg,Dublin O'Neil's,A-yKlSLEQQcoHR5q2lCyHg,Yximlvn0cfb3yVDaLuXDxw,LOVE LOVE LOVE this place! I'm a bit of a suck...,5.0,2013-08-03 19:59:56


In [36]:
review_df.head(2).text.values[0]

'The building is lovely.  The remodel after Buttita\'s is as good as one can do when turning a lovely restaurant into a sports bar-style restaurant (though why one would want to do that is beyond me).  And that\'s where the good stuff stops.\nIn short, the service and all things related to it were glacially slow; the food, when it arrived, was mediocre at best -- and missing key elements (one whole order, and other parts).  This restaurant cannot possibly last if they don\'t figure out how to greet and serve customers, or get the food that was ordered actually to the tables.\nThe full story.\nUpon arrival, we found a vacant hostess stand, then a long walk down the hall to the dining room along which we saw no one who works there.  Luckily, we ran into friends who assured us that we could seat ourselves.\nAfter waiting at our table for some 15 minutes, we used a bit of self-help, and accosted a waitress to ask for some menus.  After another 5-7 minutes another waitress finally showed up

### Tokenize

In [37]:
def tokenize_text(text_str,nlp_obj):
    '''
    use spacy to separate text into words
    (ie tokenization)
    and return the lemmatization 
    (ie feet for footing and foot)
    for only nouns and adjectives
    
    TODO: refine methodology
    '''
    spacy_doc = nlp_obj(text_str)
    
    tokenized_doc = [
        token.lemma_
        for token in spacy_doc
        if token.pos_ in ("NOUN","ADJ")
        ]
    
    return tokenized_doc
    #return spacy_doc
        

### Make dictionary & corpus

In [38]:
nlp = spacy.load('en_core_web_sm')

In [39]:
#TODO: combine reviews in SQL to scale
#TODO: sciktilearn?

In [40]:
reviews_tokenized = (
    review_df
    #.head(2)
    .groupby('business_id')
    .text
    .apply(lambda x: ' '.join(x))
    .apply(tokenize_text,nlp_obj=nlp)
)

In [244]:
review_df.query("business_id=='-5NXoZeGBdx3Bdk70tuyCw'")

Unnamed: 0,business_id,name,review_id,user_id,text,stars,date
6955,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,TvUR-ch3LDC-s5dijdRaEw,ohzAcptjBMwwHE0LHX-ksw,Amazing tender pork sandwich and the best hom...,5.0,2013-06-24 16:48:29
6982,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,H2qoduHY7-mlBYyLGHZhBA,CAYavn4JlhvTnUGAhThZcQ,Great pulled pork sandwich!! Nice and tender t...,5.0,2013-11-14 19:10:01
7035,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,b-jiUVMjU8zb5wFu7Xpc9w,1L7GAHPqoamipAmnIhP6vg,Holy cow! This place is amazing! Went there fo...,5.0,2014-09-27 12:55:57
7156,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,rZQDJ1DUT3h9-44wYuYkWQ,ZzZa6mGTpFzoIIhV_fxQ-w,Po' Boys has become our regular Friday night s...,5.0,2018-06-09 22:02:52
7197,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,C2LPyYAPUuCAXnqlxxWzPQ,9qAt7wTgl6mYtxPGZGwyWw,The hell are they thinking with their prices?\...,2.0,2016-09-20 22:49:28
...,...,...,...,...,...,...,...
9510,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,rwKW4hbpVLamxJBPMJHhAA,V-BQs5k8zUCluBPWxzY5JQ,Po Boys is the place for BBQ and Pizza!!\nIts ...,4.0,2012-03-20 20:52:23
9552,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,MxneYS2x1oHLTyJBLzKoRw,bFVP_d5M5wWHi9wUU1zXFw,In town for the mother in law's birthday and w...,3.0,2019-11-15 19:01:17
9569,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,Wg9YyS9aTEsgrJwae0DwIQ,u-lYztfofTDgEXQ6EpcCVA,Po boys has a great selection off food and we ...,4.0,2019-10-03 22:52:29
9673,-5NXoZeGBdx3Bdk70tuyCw,Po' Boys Restaurant,9NOO4OyM5wOvHM67mSTmwg,V631vIG_cs72fBG3T-hbnQ,Great food here! I love the Italian Beef. The ...,4.0,2019-05-11 23:33:09


In [243]:
review_df.query("business_id=='-2q4dnUw0gGJniGW2aPamQ'")

Unnamed: 0,business_id,name,review_id,user_id,text,stars,date
7772,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,fG8WAIDl4GZQYRYEfzNH6g,6MDjqs8QE2A_TA0CYi8-UQ,Me and my girlfriend walked into this place af...,1.0,2012-03-08 00:20:43
8568,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,8Zd_i4Kwb1uzzvrAU37_9A,tq0mDXQeeAYnlYHiBEttbQ,Going here really boils down to a question of ...,2.0,2012-05-13 16:51:25
8690,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,cU39f51OCGzn5vF_TvtO-g,fwSsSQXLvip6MkayAa_vyQ,I used to really like El Toro which was in the...,3.0,2011-04-11 00:44:04
9463,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,1lRmEQYCMtgG-bdPGtgudQ,G-mC1khkbT2G4nUtvTGjTQ,"As others have said, steer clear of this place...",1.0,2011-06-20 02:11:36


In [256]:
review_df.query("business_id=='-2q4dnUw0gGJniGW2aPamQ'")

Unnamed: 0,business_id,name,review_id,user_id,text,stars,date
7772,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,fG8WAIDl4GZQYRYEfzNH6g,6MDjqs8QE2A_TA0CYi8-UQ,Me and my girlfriend walked into this place af...,1.0,2012-03-08 00:20:43
8568,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,8Zd_i4Kwb1uzzvrAU37_9A,tq0mDXQeeAYnlYHiBEttbQ,Going here really boils down to a question of ...,2.0,2012-05-13 16:51:25
8690,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,cU39f51OCGzn5vF_TvtO-g,fwSsSQXLvip6MkayAa_vyQ,I used to really like El Toro which was in the...,3.0,2011-04-11 00:44:04
9463,-2q4dnUw0gGJniGW2aPamQ,Fiesta Ranchera,1lRmEQYCMtgG-bdPGtgudQ,G-mC1khkbT2G4nUtvTGjTQ,"As others have said, steer clear of this place...",1.0,2011-06-20 02:11:36


In [42]:
reviews_dictionary = Dictionary(reviews_tokenized)

In [184]:
reviews_dictionary.num_docs

701

In [185]:
review_df.business_id.unique().shape

(701,)

In [59]:
#corpus
reviews_corpus = [reviews_dictionary.doc2bow(doc) for doc in reviews_tokenized]

In [84]:
#tfidf with document being each restaurant and corpus being all restaurants
reviews_tfidf_model = TfidfModel(reviews_corpus)

In [85]:
reviews_tfidf_docs = [reviews_tfidf_model[review] for review in reviews_corpus]

In [87]:
#similarity indices for each doc
similarity_indices = MatrixSimilarity(reviews_tfidf_docs)

In [299]:
#doc example
fiesta = reviews_corpus[461]
fiesta_tfidf = reviews_tfidf_model[fiesta]

In [300]:
#similarity to each restaurant (ie doc)
fiesta_similarity = (
    pd.Series(similarity_indices[fiesta_tfidf])
)

In [304]:
fiesta_similarity.sort_values(ascending=False).index

Int64Index([461, 224, 613, 601, 425, 372, 203, 587, 647, 111,
            ...
            108,  67,   6, 438, 335, 235,  60, 582, 466, 164],
           dtype='int64', length=701)

In [305]:
doc_mapping.iloc[[203, 587, 647, 111]]

business_id
IHJKFxoWXONoz-JamfOnlg                 Meijer
pjO0ZRGpCwlaOzipZDa3HA             Sam’s Club
vRAM7lfJDY6pLZlxd4ge1Q    Walmart Supercenter
9qnlmW_Az2itTy0fmoE-7Q              Walgreens
Name: name, dtype: object

In [289]:
#get index:name mappings
doc_mapping = (
    review_df
    .groupby('business_id')
    ['name']
    .apply(lambda x: x.unique()[0])
)

#ie switch index to key and token str to object
token_mapping = {
    i:token 
    for token,i in reviews_dictionary.token2id.items()
} 

In [274]:
#for orange and brew and bevande coffee (doc indices 0 and 172):
#doc_id = 0
all_bag_of_words_list = []
for doc_id in range(len(reviews_corpus)):
    bag_of_words = (
        pd.DataFrame(
            {"frequency":dict(reviews_corpus[doc_id]),
             "tf_idf":dict(reviews_tfidf_docs[doc_id]),
             "business":doc_mapping[doc_id],
             "word":token_mapping
            }
        )
        .set_index(['business','word'])
    )
    all_bag_of_words_list.append(bag_of_words)                                 

In [275]:
all_bag_of_words_df = pd.concat(all_bag_of_words_list)

In [193]:
import plotly.express as px

In [205]:
restaurants = ['Orange & Brew','Bevande Coffee']

In [211]:
plot_df = (
    all_bag_of_words_df
    .loc[restaurants]
    .unstack('business')
    .reset_index()
)

plot_df.columns = [
    '_'.join([x for x in column if len(x)>0]) 
    for column in plot_df.columns
]

In [212]:
plot_df.head(2)

Unnamed: 0,word,frequency_Orange & Brew,frequency_Bevande Coffee,tf_idf_Orange & Brew,tf_idf_Bevande Coffee
0,!,,,,
1,"""",,,,


In [224]:
hover_data_list = [
    'word',
    'frequency_Orange & Brew',
    'frequency_Bevande Coffee'
]

#x_str = "frequency_Orange & Brew"
#y_str = "frequency_Bevande Coffee"

x_str = "tf_idf_Orange & Brew"
y_str = "tf_idf_Bevande Coffee"

In [219]:
plot_df.columns

Index(['word', 'frequency_Orange & Brew', 'frequency_Bevande Coffee',
       'tf_idf_Orange & Brew', 'tf_idf_Bevande Coffee'],
      dtype='object')

In [221]:
plot_df.word.unique().shape

(19799,)

In [225]:
fig = px.scatter(plot_df,
                 x=x_str, 
                 y=y_str, 
                 #color="species",
                 #size='petal_length', 
                 hover_data=hover_data_list)
fig.show()

In [None]:
#ratings

#location