In [0]:
csv_name = 'reviews_pittsburgh.csv'

In [0]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from gensim.models import HdpModel
from scipy.stats import entropy
from gensim.models.coherencemodel import CoherenceModel
nltk.download('stopwords')
nltk.download('punkt')
print('Downloads Complete')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Downloads Complete


In [0]:
def initial_clean(text):
    """
    Function to clean text of websites, email addresess and any punctuation
    We also lower case the text
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower() # lower case the text
    text = nltk.word_tokenize(text)
    return text

def remove_stop_words(text):
    """
    Function that removes all stopwords from text
    """
    return [word for word in text if word not in stop_words]

def pos(word):
    return nltk.pos_tag([word])[0][1]

bad_pos = ['JJ']
def remove_bad_pos(text):
    """
    Function that removes all adjectives from text
    """
    return [word for word in text if pos(word) not in bad_pos]

stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so plural and singular are treated the same
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # make sure we have no 1 letter words
    except IndexError: # the word "oed" broke this, so needed try except
        pass
    return text

def apply_all(text):
    """
    This function applies all the functions above into one
    """
    #return remove_stop_words(initial_clean(text))
    return stem_words(remove_bad_pos(remove_stop_words(initial_clean(text))))

In [0]:
def train_lda(data, chunksize):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    """
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=1)
    t2 = time.time()
    print("Time to train LDA model on ", len(df), "businesses: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [0]:
def train_hdp(data):
    """
    This function trains the lda model
    We setup parameters like number of topics, the chunksize to use in Hoffman method
    """
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    hdp = HdpModel(corpus=corpus, id2word=dictionary)
    t2 = time.time()
    print("Time to train HDP model on ", len(df), "businesses: ", (t2-t1)/60, "min")
    return hdp

In [0]:
def jensen_shannon(query, matrix):
    """
    This function implements a Jensen-Shannon similarity
    between the input query (an LDA topic distribution for a document)
    and the entire corpus of topic distributions.
    It returns an array of length M where M is the number of documents in the corpus
    """
    # lets keep with the p,q notation above
    p = query[None,:].T # take transpose
    q = matrix.T # transpose matrix
    m = 0.5*(p + q)
    return np.sqrt(0.5*(entropy(p,m) + entropy(q,m)))

In [0]:
def get_most_similar_documents(query,matrix,k=10):
    """
    This function implements the Jensen-Shannon distance above
    and retruns the top k indices of the smallest jensen shannon distances
    """
    sims = jensen_shannon(query,matrix) # list of jensen shannon distances
    return sims.argsort()[:k] # the top k positional index of the smallest Jensen Shannon distances

In [0]:
# format the columns
df = pd.read_csv(csv_name)
df = df.groupby(['name'])['text'].apply(' '.join).reset_index()
df = df[df['text'].map(type) == str]
df.dropna(axis=0, inplace=True, subset=['text'])

# preprocess the text and business name and create new column "tokenized"
t1 = time.time()
df['tokenized'] = df['text'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(df), "businesses' reviews:", (t2-t1)/60, "min")

In [0]:
# config
k = 15000
num_topics = 100

In [0]:
# first get a list of all words
all_words = [word for item in list(df['tokenized']) for word in item]

# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)

# define a function only to keep words in the top k words
top_k_words,_ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)
def keep_top_k_words(text):
    return [word for word in text if word in top_k_words]
df['tokenized'] = df['tokenized'].apply(keep_top_k_words)

In [0]:
# train the topic model
hdp = train_hdp(df)

Time to train HDP model on  422 businesses:  0.46153692801793417 min


In [0]:
topic_info = hdp.print_topics(num_topics=20, num_words=10)

In [0]:
topic_info

[(0,
  '0.013*food + 0.011*good + 0.010*place + 0.008*order + 0.008*great + 0.007*like + 0.007*time + 0.006*go + 0.006*get + 0.006*one'),
 (1,
  '0.013*food + 0.011*place + 0.011*good + 0.008*order + 0.008*great + 0.007*like + 0.007*time + 0.007*get + 0.007*go + 0.006*one'),
 (2,
  '0.011*food + 0.010*good + 0.009*place + 0.008*burger + 0.008*great + 0.008*order + 0.007*like + 0.006*one + 0.006*time + 0.006*go'),
 (3,
  '0.014*food + 0.010*place + 0.010*good + 0.008*order + 0.007*great + 0.007*like + 0.006*time + 0.006*go + 0.006*get + 0.006*tri'),
 (4,
  '0.013*food + 0.010*good + 0.009*place + 0.009*order + 0.007*great + 0.007*time + 0.007*servic + 0.006*restaur + 0.006*like + 0.006*go'),
 (5,
  '0.012*food + 0.011*place + 0.011*good + 0.009*beer + 0.007*order + 0.007*great + 0.007*like + 0.007*pizza + 0.007*get + 0.006*go'),
 (6,
  '0.012*food + 0.009*good + 0.008*place + 0.007*get + 0.007*airport + 0.007*great + 0.006*time + 0.006*like + 0.006*go + 0.006*one'),
 (7,
  '0.010*place 

In [0]:
# train the topic model
dictionary,corpus,lda = train_lda(df, 211)

# get the topic distribution
doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])

query = pd.read_csv('reviews_pittsburgh.csv')
query = query.loc[query['name']=='Altius']
query = query.groupby(['name'])['text'].apply(' '.join).reset_index()
query = query[query['text'].map(type) == str]
query.dropna(axis=0, inplace=True, subset=['text'])
query['tokenized'] = query['text'].apply(apply_all)

  diff = np.log(self.expElogbeta)


Time to train LDA model on  422 businesses:  0.21156521638234457 min


In [0]:
# get the ids of the most similar businesses
new_bow = dictionary.doc2bow(query.iloc[0,2])
new_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=new_bow)])
most_sim_ids = get_most_similar_documents(new_doc_distribution,doc_topic_dist)

# print the results
most_similar_df = df[df.index.isin(most_sim_ids)]
print('Similar to "{}": \n{}'.format(query['name'][0], most_similar_df['name'].reset_index(drop=True)))

Similar to "Altius": 
0                             Altius
1    Cioppino Restaurant & Cigar Bar
2        Eddie Merlot's - Pittsburgh
3            Eddie V's Prime Seafood
4                 Habitat Restaurant
5                            Le Mont
6           Monterey Bay Fish Grotto
7                      Sonoma Grille
8                 The Capital Grille
9                            Vue 412
Name: name, dtype: object


In [0]:
import pickle

In [0]:
pickle.dump(doc_topic_dist,open("processor.pkl","wb"))

In [0]:
preprocessed = pickle.load(open("processor.pkl","rb"))

In [0]:
most_sim_ids = get_most_similar_documents(new_doc_distribution,preprocessed)

# print the results
most_similar_df = df[df.index.isin(most_sim_ids)]
print('Similar to "{}": \n{}'.format(query['name'][0], most_similar_df['name'].reset_index(drop=True)))

Similar to "Altius": 
0                         Altius
1                      Bistro 19
2    Eddie Merlot's - Pittsburgh
3                         Eleven
4                        Le Mont
5        Morton's The Steakhouse
6                          Spoon
7             The Capital Grille
8      Toast! Kitchen & Wine Bar
9                        Vue 412
Name: name, dtype: object


In [0]:
def process_query(preprocessed, query):
  # SQL to pandas DataFrame w/ query
  query = query.groupby(['name'])['text'].apply(' '.join).reset_index()
  query = query[query['text'].map(type) == str]
  query.dropna(axis=0, inplace=True, subset=['text'])
  query['tokenized'] = query['text'].apply(apply_all)
  
  # read the cached pickle
  preprocessed = pickle.load(open("processor.pkl","rb"))
  
  # get the ids of the most similar businesses
  new_bow = dictionary.doc2bow(query.iloc[0,2])
  new_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=new_bow)])
  most_sim_ids = get_most_similar_documents(new_doc_distribution,preprocessed)

  # print the results
  most_similar_df = df[df.index.isin(most_sim_ids)]
  print('Similar to "{}": \n{}'.format(query['name'][0], most_similar_df['name'].reset_index(drop=True)))