In [56]:
import numpy as np
import pandas as pd
import timeit
import time
import json
import re
import itertools
import matplotlib.pyplot as plt
#import cupy
from sklearn import linear_model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups

import spacy

In [4]:
# Import some text, in this case restaurant reviews from the Yelp dataset.
# The file is read as JSON Lines: each line is parsed as one JSON record.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative to a configurable data directory.

yelp_business_datapath = '/home/matt_valley/PycharmProjects/insight_2020a_project/Resto_names/yelp_dataset/review.json'

# Only the first `num_entries` records are loaded to keep experiments fast.
num_entries = 100

# Explicit UTF-8 so decoding does not depend on the platform's locale default.
# islice stops after `num_entries` lines and reads nothing when it is 0
# (the previous manual counter always consumed at least one line).
with open(yelp_business_datapath, encoding='utf-8') as fl:
    # `users` keeps its original name; each element is one parsed record.
    users = [json.loads(line) for line in itertools.islice(fl, num_entries)]
df = pd.DataFrame(users)

In [11]:
# Load the spaCy pipeline; `nlp` is the callable the later cells use to parse text.
# Exactly one `model` assignment should be active; the alternatives are kept
# commented out for quick switching.

model = 'en_core_web_sm' # small model, for testing on laptop
#model = 'en_core_web_lg'
#model = 'en_vectors_web_lg' # many more words
nlp = spacy.load(model)
# Optional sentencizer pipe (currently disabled):
#sentencizer = nlp.create_pipe("sentencizer")
#nlp.add_pipe(sentencizer)

In [15]:
# Rudimentary text chunking pipeline: parse every review once and keep, per
# review, its token strings, its sentence spans and its entity strings.

all_tokens, all_sentences, all_sentence_entities = [], [], []
for review in df.text:
    doc = nlp(review)
    all_tokens.append([tok.text for tok in doc])
    # Sentences are stored as spaCy spans (not strings).
    all_sentences.append(list(doc.sents))
    all_sentence_entities.append([entity.text for entity in doc.ents])

# Attach the per-review lists as new columns (same column order as before).
derived_columns = {
    'tokens': all_tokens,
    'sentences': all_sentences,
    'entities': all_sentence_entities,
}
for column_name, column_values in derived_columns.items():
    df[column_name] = column_values

In [32]:
# Alternative approach: one row of lexical attributes per token, adapted from
# https://gist.github.com/narulkargunjan/5319ed32d092d1fa7b52fec3a774e0e5
columns = ['text',
           'log_probability',
           'stop?',
           'punctuation?',
           'whitespace?',
           'number?',
           'out of vocab.?']

# Build one small frame per review and combine them once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration (quadratic cost).
review_frames = []
for review in df.text:
    doc = nlp(review)
    token_attributes = [(token.orth_,
                         token.prob,
                         token.is_stop,
                         token.is_punct,
                         token.is_space,
                         token.like_num,
                         token.is_oov)
                        for token in doc]
    review_frames.append(pd.DataFrame(token_attributes, columns=columns))

if review_frames:
    # No ignore_index: as with the original append, the index restarts at 0
    # for each review.
    token_df = pd.concat(review_frames)
else:
    # No reviews at all: keep an empty frame that still has the expected columns.
    token_df = pd.DataFrame(columns=columns)


In [32]:
# For the first two reviews, re-parse each sentence on its own and print the
# entities found in it. Because the sentence text is parsed separately, the
# printed character offsets are relative to the sentence, not the review.
for review in df.text[:2]:
    for sentence in nlp(review).sents:
        for entity in nlp(sentence.text).ents:
            print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [37]:
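# Attempt to load the larger vectors model. Left disabled: the output recorded
# for this cell is a MemoryError, apparently raised while loading the model.
#vector_model = 'en_vectors_web_lg' # many more words
#nlp_vec = spacy.load(vector_model)        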

MemoryError: Unable to allocate array with shape (321291300,) and data type float32

In [41]:
# Represent each review by the document vector of its FIRST sentence only:
# the sentence text is re-parsed on its own and that parse's vector is kept.
# NOTE(review): a review with no sentences would raise IndexError here.
vectors = [nlp(review_sentences[0].text).vector
           for review_sentences in df.sentences]
df['sentence_vector'] = vectors


In [64]:
# Preview the first three rows, including the columns derived in earlier cells.
df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,entities,sentences,tokens,sentence_vector
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36,"[69, 3, 19 cents, Avoid Hospital]","[(Total, bill, for, this, horrible, service, ?...","[Total, bill, for, this, horrible, service, ?,...","[0.9207042, -0.35722563, 0.5429547, 0.63424003..."
1,GJXCdrto3ASJOqKeVWPi6Q,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33,"[the Hard Rock's, Kelly Cardenas Salon, Travis...","[(I), (*, adore), (*, Travis, at, the, Hard, R...","[I, *, adore, *, Travis, at, the, Hard, Rock, ...","[0.9344967, 3.0664394, 2.522025, -1.4153994, 2..."
2,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,3,0,0,I have to say that this office really has it t...,2016-11-09 20:09:03,"[J. Phillipp, Jewel, Bailey, 80, a year, 25%]","[(I, have, to, say, that, this, office, really...","[I, have, to, say, that, this, office, really,...","[0.14864156, 1.6161582, 0.43045694, -2.1842675..."


In [70]:
# Bag-of-words settings. Only n_features is used in this cell.
n_features = 1000
n_components = 10
n_top_words = 20

# The vectorizers need a list of strings: one raw review text per document.
data = list(df.text)

# Both vectorizers share the same vocabulary-pruning options.
vectorizer_options = dict(max_df=0.95,
                          min_df=2,
                          max_features=n_features,
                          stop_words='english')
tf_vectorizer = CountVectorizer(**vectorizer_options)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

# Time the two fits together: raw counts first, then tf-idf weights.
t0 = time.time()
tf = tf_vectorizer.fit_transform(data)
tfidf = tfidf_vectorizer.fit_transform(data)
elapsed = time.time() - t0
print("done in %0.3fs." % elapsed)


done in 0.035s.


In [44]:
# Collect the per-review vectors stored in the 'sentence_vector' column into a
# single NumPy array and report its shape.
sent_vec_mat = np.array(list(df.sentence_vector.values))
print(sent_vec_mat.shape)


(100, 96)


In [72]:
# Fit a Latent Dirichlet Allocation topic model on the raw term counts (`tf`).
k = 5  # number of topics

# random_state fixes the seed so repeated runs give the same fit; verbose=1
# prints one progress line per iteration.
lda = LatentDirichletAllocation(n_components=k, random_state=0, verbose=1)
lda.fit(tf)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method on
# older installations. .tolist() yields plain Python strings so the value and
# its printed form match what the old method returned.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

print(tf_feature_names)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
['10', '100', '14', '15', '17', '20', '25', '30', '45', '50', '80', 'able', 'absolutely', 'accommodating', 'actually', 'add', 'added', 'admit', 'afraid', 'afternoon', 'ago', 'ahead', 'alcohol', 'allow', 'alright', 'amazing', 'ambiance', 'anna', 'appetite', 'appetizer', 'appetizers', 'appreciate', 'area', 'ask', 'asked', 'asking', 'ate', 'atmosphere', 'attention', 'authentic', 'available', 'avoid', 'away', 'awesome', 'awful', 'bachelorette', 'bacon', 'bad', 'balance', 'balcony', 'ball', 'balls', 'bar', 'barely', 'bartender', 'base', 'basically', 'bbq', 'beef', 'beer', 'beers', 'believe', 'bella', 'belly', 'best', 'bet', 'better', 'beware', 'big', 'biggest', 'birthday', 'bit', 'bite', 'bits', 'black', 