In [3]:
import os, codecs, json, sys, gc
from collections import Counter
import spacy
import pandas as pd
import itertools as it

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

from spacy.lang import en

## Getting data

In [5]:
#data_directory = r"/home/oleksandr/Dropbox/Deep_Learning/YouTube/Modern_NLP"
data_directory = "" #"E:\\Dropbox\\Deep_Learning\\YouTube\\Modern_NLP"

business_filepath   = os.path.join(data_directory,'dataset','business.json')
reviews_filepath    = os.path.join(data_directory,'dataset','review.json')
review_txt_filepath = os.path.join(data_directory,'review_text_all.txt')

In [3]:
# Example of a business record
with codecs.open(business_filepath,encoding='utf_8') as f:
    test = f.readline()
print(test)

{"business_id": "YDf95gJZaq05wvo7hTQbbQ", "name": "Richmond Town Square", "neighborhood": "", "address": "691 Richmond Rd", "city": "Richmond Heights", "state": "OH", "postal_code": "44143", "latitude": 41.5417162, "longitude": -81.4931165, "stars": 2.0, "review_count": 17, "is_open": 1, "attributes": {"RestaurantsPriceRange2": 2, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": true, "valet": false}, "BikeParking": true, "WheelchairAccessible": true}, "categories": ["Shopping", "Shopping Centers"], "hours": {"Monday": "10:00-21:00", "Tuesday": "10:00-21:00", "Friday": "10:00-21:00", "Wednesday": "10:00-21:00", "Thursday": "10:00-21:00", "Sunday": "11:00-18:00", "Saturday": "10:00-21:00"}}



In [4]:
# Example of a review record
with codecs.open(reviews_filepath,encoding='utf_8') as f:
    test = f.readline()
print(test)

{"review_id":"VfBHSwC5Vz_pbFluy07i9Q","user_id":"cjpdDjZyprfyDG3RlkVG3w","business_id":"uYHaNptLzDLoV_JZ_MuzUA","stars":5,"date":"2016-07-12","text":"My girlfriend and I stayed here for 3 nights and loved it. The location of this hotel and very decent price makes this an amazing deal. When you walk out the front door Scott Monument and Princes street are right in front of you, Edinburgh Castle and the Royal Mile is a 2 minute walk via a close right around the corner, and there are so many hidden gems nearby including Calton Hill and the newly opened Arches that made this location incredible.\n\nThe hotel itself was also very nice with a reasonably priced bar, very considerate staff, and small but comfortable rooms with excellent bathrooms and showers. Only two minor complaints are no telephones in room for room service (not a huge deal for us) and no AC in the room, but they have huge windows which can be fully opened. The staff were incredible though, letting us borrow umbrellas for t

In [5]:
# Collecting categories
categories = list()
with codecs.open(business_filepath,encoding='utf_8') as f:
    for line in f:
        business = json.loads(line)
        categories.append(business[u'categories'])
flat_list = [item for sublist in categories for item in sublist]
categories = Counter(flat_list)
#print('There are {0:d} categories'.format(len(flat_list)))

# displaying most common categories
categories.most_common(n=20)

[('Restaurants', 51613),
 ('Shopping', 24595),
 ('Food', 23014),
 ('Beauty & Spas', 15139),
 ('Home Services', 13202),
 ('Health & Medical', 12033),
 ('Nightlife', 11364),
 ('Bars', 9868),
 ('Automotive', 9476),
 ('Local Services', 9343),
 ('Event Planning & Services', 8038),
 ('Active Life', 7427),
 ('Fashion', 6299),
 ('Sandwiches', 5864),
 ('Fast Food', 5792),
 ('American (Traditional)', 5737),
 ('Pizza', 5652),
 ('Coffee & Tea', 5565),
 ('Hair Salons', 5395),
 ('Hotels & Travel', 5188)]

In [6]:
# Collecting Bars IDs
bar_ids = set()
with codecs.open(business_filepath,encoding='utf_8') as f:
    for line in f:
        business = json.loads(line)
        if u'Bars' not in business[u'categories']:
            continue
        bar_ids.add(business[u'business_id'])
        

bar_ids = frozenset(bar_ids)

print('There are {0:d} bars IDs'.format(len(bar_ids)))

There are 9868 bars IDs


In [9]:
# Fetching reviews and storing them into txt file
review_count = 0
with codecs.open(review_txt_filepath,'w',encoding='utf_8') as review_txt:
    with codecs.open(reviews_filepath, encoding='utf_8') as reviews_json:
        for review_json in reviews_json:
            review = json.loads(review_json)
            if review[u'business_id'] not in bar_ids:
                continue
            review_txt.write(review[u'text'].replace('\n','')+'\n')
            review_count += 1

print('There are {0:d} reviews'.format(review_count))       

There are 714912 reviews


## NLP

In [7]:
#nlp = spacy.load('en_core_web_md')
nlp =  spacy.load('en')

In [8]:
# Sample review
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f,0,10))[1]
    

print(sample_review)

So I know Christy's is a Madison tradition with those lovely views of Lake Waubesa. But this place is a one-track pony... location, location, location. This is a family-owned bar/restaurant and the owners know they have a captive audience. The beer is cold and the food is average, but the service is atrocious. On a recent beautiful Saturday afternoon, there were only two servers for the outside seating area.  There's no host, so we (4 adults and 1 kid) sat at a picnic table and proceeded to wait at least 10 minutes to no avail. We eventually went inside to order drinks and lunch. At no time during our visit did a server ever approach our table. I'd certainly be inclined to write off this awful experience to a bad day or poor staffing, but unfortunately this is more the rule rather than exception. One really weird thing is the Friday Fish Fry that features a special menu available ONLY inside, this isn't communicated very well;  so you'll grab a table outside, wait to order only to lear

In [9]:
%%time
parsed_review = nlp(sample_review)


CPU times: user 162 ms, sys: 12.2 ms, total: 174 ms
Wall time: 73.2 ms


In [24]:
# segmentation
sents = parsed_review.sents
for num, sent in enumerate(sents):
    print(num+1,":",sent)
    print("")

1 : So I know Christy's is a Madison tradition with those lovely views of Lake Waubesa.

2 : But this place is a one-track pony... location, location, location.

3 : This is a family-owned bar/restaurant and the owners know they have a captive audience.

4 : The beer is cold and the food is average, but the service is atrocious.

5 : On a recent beautiful Saturday afternoon, there were only two servers for the outside seating area.  

6 : There's no host, so we (4 adults and 1 kid) sat at a picnic table and proceeded to wait at least 10 minutes to no avail.

7 : We eventually went inside to order drinks and lunch.

8 : At no time during our visit did a server ever approach our table.

9 : I'd certainly be inclined to write off this awful experience to a bad day or poor staffing, but unfortunately this is more the rule rather than exception.

10 : One really weird thing is the Friday Fish Fry that features a special menu available ONLY inside, this isn't communicated very well;  so you'

In [121]:
parsed_review.ents

(Fremont, about four, Vegas, )

In [25]:
# Named entity detection
for num, entity in enumerate(parsed_review.ents):
    print(num+1,':',entity,'-',entity.label_)

1 : Christy - PERSON
2 : Madison - PERSON
3 : Lake Waubesa - LOC
4 : one - CARDINAL
5 : Saturday - DATE
6 : afternoon - TIME
7 : only two - CARDINAL
8 : 4 - CARDINAL
9 : 1 - CARDINAL
10 : at least 10 minutes - TIME
11 : One - CARDINAL
12 :   - NORP
13 : 
 - GPE


In [26]:
# part of speech tagging
tokens    = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

pd.DataFrame(list(zip(tokens,token_pos)),columns=['Token','part_of_speech'])

Unnamed: 0,Token,part_of_speech
0,So,ADV
1,I,PRON
2,know,VERB
3,Christy,PROPN
4,'s,PART
5,is,VERB
6,a,DET
7,Madison,PROPN
8,tradition,NOUN
9,with,ADP


In [27]:
# Normalization
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

pd.DataFrame(list(zip(tokens,token_lemma,token_shape)),
               columns=['Token','token_lemma','token_shape'])

Unnamed: 0,Token,token_lemma,token_shape
0,So,so,Xx
1,I,-PRON-,X
2,know,know,xxxx
3,Christy,christy,Xxxxx
4,'s,'s,'x
5,is,be,xx
6,a,a,x
7,Madison,madison,Xxxxx
8,tradition,tradition,xxxx
9,with,with,xxxx


In [28]:
# token-level analysis
token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob  = [token.ent_iob_ for token in parsed_review]
pd.DataFrame(list(zip(tokens,token_entity_type,token_entity_iob)),
             columns=['Token','token_type','token_iob'])

Unnamed: 0,Token,token_type,token_iob
0,So,,O
1,I,,O
2,know,,O
3,Christy,PERSON,B
4,'s,,O
5,is,,O
6,a,,O
7,Madison,PERSON,B
8,tradition,,O
9,with,,O


* stopword
* punctuation
* whitespace
* a number
* spacy default vocaluary ?

In [29]:
token_attr = [(token.text,
               token.prob,
               token.is_stop,
               token.is_punct,
               token.is_space,
               token.like_num,
               token.is_oov,
               token.lemma_)
             for token in parsed_review]
df = pd.DataFrame(token_attr,columns=['Token','Log proba','stop ?','punct ?',
                                 'whitespace ?','number ?','out of vocab ?','Lemma'])
df.loc[:,"stop ?" : "out of vocab ?"] = \
df.loc[:,"stop ?" : "out of vocab ?"].applymap(lambda x: 'Yes' if x else '')
df

Unnamed: 0,Token,Log proba,stop ?,punct ?,whitespace ?,number ?,out of vocab ?,Lemma
0,So,-20.0,,,,,Yes,so
1,I,-20.0,,,,,Yes,-PRON-
2,know,-20.0,,,,,Yes,know
3,Christy,-20.0,,,,,Yes,christy
4,'s,-20.0,,,,,Yes,'s
5,is,-20.0,Yes,,,,Yes,be
6,a,-20.0,Yes,,,,Yes,a
7,Madison,-20.0,,,,,Yes,madison
8,tradition,-20.0,,,,,Yes,tradition
9,with,-20.0,Yes,,,,Yes,with


## Phrase modeling

In [10]:
# helper functions
def punct_space(token):
    return token.is_punct or token.is_space

def line_review(filename):
    with codecs.open(filename,encoding='utf_8') as f:
        for review in f:
            yield review
            
def lematized_sentence_corpus(filename):
    for parsed_review in nlp.pipe(line_review(filename),n_threads=4,batch_size=10000):
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent if not punct_space(token)])

unigrams_sentences_filepath = os.path.join(data_directory,'unigram_sents_all.txt')

In [None]:
%%time
count = 0
with codecs.open(unigrams_sentences_filepath,'w',encoding='utf_8') as f:
    for sent in lematized_sentence_corpus(review_txt_filepath):
        f.write(sent + '\n')
        

In [11]:
# invoking gensim
unigram_sentences = LineSentence(unigrams_sentences_filepath)

In [12]:
for unigram_sents in it.islice(unigram_sentences,210,220):
    print(" ".join(unigram_sents))

-PRON- have find a few great place there but no food especially asian have actually be that great until fin
-PRON- go here on a sunday evening and be concern -PRON- would not be open but -PRON- be until 10 pm
-PRON- be greet in japanese and seat immediately by -PRON- extremely friendly and attentive server
food be a bit on the pricey side but without there be much competition in term of izakaya in oakville -PRON- be not surprised and -PRON- do not break the bank
personal favorite be the selection of fish for three see picture so beautiful and the eastern style caesar who would not love a drink with a crab leg in it!).overall i be super impressed with the quality and variety of food as well as the service
-PRON- could not recommend fin enough to those starve for a bit of asian food in oakville!disclaimer if -PRON- be from toronto and eat izakaya all the time this may be a touch pricey for -PRON-
warn
this be not the same fin as before sell to new owner
super dry pull pork rice burger an

In [19]:
bigram_model = Phrases(unigram_sentences)
bigram_model.save('bigram_model')

In [13]:
# Loading the saved model
#bigram_model = Phrases.load('bigram_model')

In [20]:
for unigram_sents in it.islice(unigram_sentences,210,220):
    print(bigram_model[unigram_sents])

['-PRON-', 'have', 'find', 'a', 'few', 'great', 'place', 'there', 'but', 'no', 'food', 'especially', 'asian', 'have', 'actually', 'be', 'that', 'great', 'until', 'fin']
['-PRON-', 'go', 'here', 'on', 'a', 'sunday', 'evening', 'and', 'be', 'concern', '-PRON-', 'would', 'not', 'be', 'open', 'but', '-PRON-', 'be', 'until', '10', 'pm']
['-PRON-', 'be', 'greet', 'in', 'japanese', 'and', 'seat_immediately', 'by', '-PRON-', 'extremely', 'friendly', 'and', 'attentive', 'server']
['food', 'be', 'a', 'bit', 'on', 'the', 'pricey', 'side', 'but', 'without', 'there', 'be', 'much', 'competition', 'in', 'term', 'of', 'izakaya', 'in', 'oakville', '-PRON-', 'be', 'not', 'surprised', 'and', '-PRON-', 'do', 'not', 'break', 'the', 'bank']
['personal_favorite', 'be', 'the', 'selection', 'of', 'fish', 'for', 'three', 'see', 'picture', 'so', 'beautiful', 'and', 'the', 'eastern', 'style', 'caesar', 'who', 'would', 'not', 'love', 'a', 'drink', 'with', 'a', 'crab_leg', 'in', 'it!).overall', 'i', 'be', 'super', 



## Now that we have a trained phrase model for words pairs, <br/> let's apply it to the review sentences data and explore the results

In [21]:
bigram_sententes_filepath = os.path.join(data_directory,'bigram_sentences_all.txt')

with codecs.open(bigram_sententes_filepath,'w',encoding='utf-8') as f:
    for sent in unigram_sentences:
        bigram_sent = " ".join(bigram_model[sent])
        f.write(bigram_sent + '\n')



In [22]:
bigram_sentences = LineSentence(bigram_sententes_filepath)
for sent in it.islice(bigram_sentences,210,220):
    print(" ".join(sent))

-PRON- have find a few great place there but no food especially asian have actually be that great until fin
-PRON- go here on a sunday evening and be concern -PRON- would not be open but -PRON- be until 10 pm
-PRON- be greet in japanese and seat_immediately by -PRON- extremely friendly and attentive server
food be a bit on the pricey side but without there be much competition in term of izakaya in oakville -PRON- be not surprised and -PRON- do not break the bank
personal_favorite be the selection of fish for three see picture so beautiful and the eastern style caesar who would not love a drink with a crab_leg in it!).overall i be super impressed with the quality and variety of food as well as the service
-PRON- could not recommend fin enough to those starve for a bit of asian food in oakville!disclaimer if -PRON- be from toronto and eat izakaya all the time this may be a touch pricey for -PRON-
warn
this be not the same fin as before sell to new owner
super dry pull_pork rice burger an

In [23]:
# Apply again to build trigram model
trigram_model = Phrases(bigram_sentences)
trigram_model.save('trigram_model')

In [31]:
#trigram_model = Phrases.load('trigram_model')

In [8]:
# # These are the usual ipython objects, including this one you are creating
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') 
#         and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [25]:
trigram_sentences_filepath = os.path.join(data_directory,'trigram_sentences_all.txt')

with codecs.open(trigram_sentences_filepath,'w',encoding='utf-8') as file:
    for sent in bigram_sentences:
        trigram_sent = " ".join(trigram_model[sent])
        file.write(trigram_sent + '\n')
    

trigram_sentences = LineSentence(trigram_sentences_filepath)
for sent in it.islice(trigram_sentences,240,250):
    print(sent)

## Final step of data preprocessing : <br/>run the complete text of the reviews through a pipeline that applies our text normalization and phrase models 

In [26]:
trigram_reviews_filespath = os.path.join(data_directory,
                                        'trigram_transformed_reviews_all.txt')

In [29]:
%%time

with codecs.open(trigram_reviews_filespath,'w',encoding='utf_8') as f:
    for parsed_review in nlp.pipe(line_review(review_txt_filepath),batch_size=10000,n_threads=4):
        # lemmatize the text, removing punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review if not punct_space(token)]
        
        # apply models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]
        
        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review if term not in spacy.lang.en.STOP_WORDS]

        # write the tramsformed review as a line in the new line
        trigram_review = " ".join(trigram_review)
        f.write(trigram_review + '\n')



CPU times: user 11h 10min 40s, sys: 32min 3s, total: 11h 42min 43s
Wall time: 3h 48min 6s


In [31]:
print('Original:' + '\n')

for review in it.islice(line_review(review_txt_filepath),0,1):
    print(review)
    
print('-------' + '\n')
print('Transformed:'+'\n')
with codecs.open(trigram_reviews_filespath,encoding='utf_8') as f:
    for review in it.islice(f,0,1):
        print(review)

Original:

This is my 9 year olds review.     Hamburgers are great and the cheese curds are awesome.   There is a Great Lake View

-------

Transformed:

-PRON- 9 year_old review hamburger great cheese_curd awesome great lake_view



 ## Topic modeling with Latent Dirichlet Allocation (LDA)

In [73]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim

import warnings

In [34]:
!conda install -c memex pyldavis 

Solving environment: failed

UnsatisfiableError: The following specifications were found to be in conflict:
  - lightgbm
  - pyldavis
Use "conda info <package>" to see the dependencies for each package.



In [45]:
# Learn the dictionary

trigram_dict_filepath = os.path.join(data_directory,'trigram_dict_all.dict')
trigram_reviews = LineSentence(trigram_reviews_filespath)

trigram_dict = Dictionary(trigram_reviews)

trigram_dict.filter_extremes(no_below=10,no_above=0.4)

trigram_dict.compactify()

trigram_dict.save(trigram_dict_filepath)

In [51]:
# Bag of words model
trigram_bow_filepath = os.path.join(data_directory,'trigram_bow_corpus_all.mm')

def trigram_bow_generator(filepath):
    for review in LineSentence(filepath):
        yield trigram_dict.doc2bow(review)

# Generate bow for all reviews and save them as a matrix

%%time
MmCorpus.serialize(trigram_bow_filepath,trigram_bow_generator(trigram_reviews_filespath))


trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [60]:
lda_model_filepath = os.path.join(data_directory,'lda_model_all')
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=50,
                       id2word=trigram_dict,
                       workers=3)

lda.save(lda_model_filepath)

In [96]:
lda.bound(trigram_bow_corpus)

-250404219.79660738

In [64]:
def explore_topic(topic_number, topn = 25):
    print('{:20} {}'.format('term','frequency') + '\n')
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print('{:20} {:.3f}'.format(term,round(frequency,3)))


In [68]:
explore_topic(topic_number=3)

term                 frequency

mom                  0.048
brother              0.023
dad                  0.023
like                 0.009
parent               0.009
morels               0.008
bachi                0.008
's                   0.007
law                  0.007
sister               0.007
mother               0.007
father               0.007
calzone              0.006
cowboy               0.006
home                 0.005
year_old             0.005
know                 0.005
tamale               0.005
ball                 0.004
way                  0.004
think                0.004
treasure_island      0.004
22                   0.003
jersey               0.003
claim                0.003


In [81]:
LDAvis_prepared = pyLDAvis.gensim.prepare(lda,trigram_bow_corpus,trigram_dict)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [91]:
test=pyLDAvis.prepared_data_to_html(LDAvis_prepared)

with open('test.html','w') as f:
    f.write(test)

In [115]:
def get_sample_review(k):
    return list(it.islice(line_review(review_txt_filepath), k,k+1))[0]

def lda_description(review_text, min_topic_freq=0.05):
    parsed_review = nlp(review_text)
    unig_rev = [token.lemma_ for token in parsed_review if not punct_space(token)]
    bigr_rev = bigram_model[unig_rev]
    trig_rev = trigram_model[bigr_rev]
    trig_rev = [term for term in trig_rev if not term in spacy.lang.en.STOP_WORDS]
    rev_bow = trigram_dict.doc2bow(trig_rev)
    rev_lda = lda[rev_bow]
    rev_lda = sorted(rev_lda, key = lambda x: - x[1])
    
    for k, freq in rev_lda:
        if freq < min_topic_freq:
            break
        print('{:25} {}'.format(k,round(freq, 3)))

In [116]:
sample_rev = get_sample_review(50)
print(sample_review)

So I know Christy's is a Madison tradition with those lovely views of Lake Waubesa. But this place is a one-track pony... location, location, location. This is a family-owned bar/restaurant and the owners know they have a captive audience. The beer is cold and the food is average, but the service is atrocious. On a recent beautiful Saturday afternoon, there were only two servers for the outside seating area.  There's no host, so we (4 adults and 1 kid) sat at a picnic table and proceeded to wait at least 10 minutes to no avail. We eventually went inside to order drinks and lunch. At no time during our visit did a server ever approach our table. I'd certainly be inclined to write off this awful experience to a bad day or poor staffing, but unfortunately this is more the rule rather than exception. One really weird thing is the Friday Fish Fry that features a special menu available ONLY inside, this isn't communicated very well;  so you'll grab a table outside, wait to order only to lear

In [117]:
lda_description(sample_review)

                       25 0.3720000088214874
                       28 0.14499999582767487
                       33 0.0860000029206276
                       19 0.08399999886751175
                       47 0.0729999989271164
                       42 0.057999998331069946
                       14 0.05400000140070915




# Word2vec

In [15]:
from gensim.models import Word2Vec
trigram_sents = LineSentence(trigram_sentences_filepath)
word2vec_model_path = os.path.join(data_directory,'word2vec_model_all')

In [18]:
food2vec = Word2Vec(trigram_sents,size=100,window=5,min_count=20,sg=1)
food2vec.save(word2vec_model_path)

1

In [17]:
for i in range(2):
    print(i)
    food2vec.train(trigram_sents)
    

KeyboardInterrupt: 