In [0]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt

# Fetch the NLTK data packages used by the preprocessing cell: the POS tagger
# model (nltk.pos_tag), the stop-word lists (stopwords.words) and the Punkt
# data (nltk.word_tokenize).
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(resource)
print('Downloads Complete')

In [0]:
def initial_clean(text):
    """
    Strip noise from a raw review string and split it into lowercase tokens.

    Steps, in order:
      1. replace every whitespace-delimited chunk that contains "http" or "www"
         followed by more characters, or that contains "@", with a space
      2. collapse every run of whitespace (newlines, tabs, ...) into one space
      3. delete every character that is not an ASCII letter or a space
      4. lowercase the text and tokenize it with nltk.word_tokenize

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The tokens of the cleaned text.
    """
    # Raw string: "\S" and "\@" are regex escapes, not Python string escapes.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Normalise whitespace before the character filter below; otherwise a
    # newline or tab is simply deleted and the words on either side of it are
    # glued together ("good\nfood" -> "goodfood").
    text = re.sub(r"\s+", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower() # lower case the text
    return nltk.word_tokenize(text)

stop_words = stopwords.words('english')
def remove_stop_words(text):
    """
    Drop English stop words from a list of tokens.

    Parameters
    ----------
    text : list of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens that do not appear in the module-level `stop_words` list,
        in their original order.
    """
    # Documents here are long, so test membership against a set instead of
    # scanning the stop-word list for every token. The set is rebuilt on each
    # call so later edits to `stop_words` are still honoured.
    stop_set = set(stop_words)
    return [word for word in text if word not in stop_set]

def pos(word):
    """Return the part-of-speech tag nltk.pos_tag assigns to a single word."""
    tagged = nltk.pos_tag([word])
    first_pair = tagged[0]
    return first_pair[1]

# POS tags (as returned by nltk.pos_tag) whose words are kept for topic modelling.
informative_pos = ('JJ','VB', 'NN','RBS','VBP','IN','RBR','JJR','JJS','PDT','RP','UH','FW','NNS','VBN','VBG')
def remove_uninformative_pos(text):
    """
    Keep only the tokens whose POS tag is listed in `informative_pos`.

    The whole token list is tagged in one nltk.pos_tag call; tokens are
    returned in their original order.
    """
    kept = []
    for token, tag in nltk.pos_tag(text):
        if tag in informative_pos:
            kept.append(token)
    return kept
  
# Words filtered out of every document by remove_garbage, in addition to the
# stop words.
clutter = [
    'food', 'place', 'good', 'order', 'great', 'like',
    'service', 'time', 'go', 'ordered', 'get', 'love',
    'best', 'come', 'eat', 'dont', 'tried', 'try', 'ask',
    'nice', 'restaurant', 'ive', 'im', 'didnt',
]
def remove_garbage(text):
    """Return the tokens of `text` that are not in `clutter`, order preserved."""
    kept = []
    for token in text:
        if token in clutter:
            continue
        kept.append(token)
    return kept

stemmer = PorterStemmer()
def stem_words(text):
    """
    Stem every token with the Porter stemmer and drop one-letter results.

    Parameters
    ----------
    text : list of str
        Tokens of one document.

    Returns
    -------
    list of str
        Stemmed tokens longer than one character, in their original order.
    """
    stemmed = []
    for word in text:
        try:
            word = stemmer.stem(word)
        except IndexError:
            # The stemmer was seen to raise on the word "oed". Keep just that
            # token unstemmed instead of abandoning stemming (and the length
            # filter) for the whole document.
            pass
        if len(word) > 1: # make sure we have no 1 letter words
            stemmed.append(word)
    return stemmed

def apply_all(text):
    """
    Run the full preprocessing pipeline on one raw text.

    Order: initial_clean -> remove_stop_words -> remove_uninformative_pos
    -> remove_garbage -> stem_words. Returns the final token list.
    """
    tokens = initial_clean(text)
    for step in (remove_stop_words, remove_uninformative_pos, remove_garbage, stem_words):
        tokens = step(tokens)
    return tokens

In [0]:
# Path of the reviews CSV; it is read with pd.read_csv both when building the
# corpus and when building the query document.
csv_name = 'csv_reviews_mesa.csv'

In [0]:
# format the columns
reviews_raw = pd.read_csv(csv_name)
# Drop missing / non-string review bodies BEFORE joining: ' '.join raises a
# TypeError on NaN, and once the reviews are joined every value is a str, so
# filtering afterwards can never remove anything.
reviews_text = reviews_raw[reviews_raw['text'].map(type) == str]
# One row per business: all of its reviews concatenated into a single document.
df = reviews_text.groupby(['name'])['text'].apply(' '.join).reset_index()

# preprocess the text and business name and create new column "tokenized"
t1 = time.time()
df['tokenized'] = df['text'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(df), "businesses' reviews:", (t2-t1)/60, "min")

In [0]:
# Number of topics requested from the LDA and LSI models trained below.
num_topics = 12
# Vocabulary cap: only the k most frequent tokens over all documents are kept.
k = 10000

In [0]:
# first get a list of all words
all_words = [word for tokens in df['tokenized'] for word in tokens]

# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)

# Set of the k most frequent words. A set comprehension (rather than unpacking
# zip(*...)) also works when the corpus is empty, and it does not rebind `_`,
# which IPython uses for the last cell output.
top_k_words = {word for word, _count in fdist.most_common(k)}
def keep_top_k_words(text):
    """Return the tokens of `text` that are in `top_k_words`, order preserved."""
    return [word for word in text if word in top_k_words]
df['tokenized'] = df['tokenized'].apply(keep_top_k_words)

In [0]:
def train_hdp(data, random_state=None):
    """
    Train a gensim HdpModel on the token lists in data['tokenized'].

    Parameters
    ----------
    data : DataFrame (or mapping) with a 'tokenized' entry holding one token
        list per document
    random_state : optional seed forwarded to HdpModel so a run can be
        reproduced; the default None leaves gensim's own default in place

    Returns
    -------
    dictionary : gensim Dictionary built from the token lists
    corpus : list of bag-of-words vectors, one per document
    hdp : the trained HdpModel
    """
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # No num_topics / alpha / eta are passed: HdpModel runs with its defaults.
    hdp = HdpModel(corpus=corpus, id2word=dictionary, random_state=random_state)
    t2 = time.time()
    # Report the size of the data actually trained on, not the global `df`.
    print("Time to train HDP model on ", len(data), "businesses: ", (t2-t1)/60, "min")
    return dictionary,corpus,hdp
  
# train the topic model
dictionary,corpus,hdp = train_hdp(df)

# get the topic distribution
# hdp[corpus] yields one sparse list of (topic_id, weight) pairs per document,
# and HDP omits low-weight topics, so the lists differ in length. Expand each
# one to a fixed-width vector (hdp.m_T = the model's top-level topic
# truncation) so the result is a regular 2-D array instead of a ragged one.
doc_topic_dist = np.array([
    gensim.matutils.sparse2full(topic_weights, hdp.m_T)
    for topic_weights in hdp[corpus]
])

In [0]:
def train_lda(data, chunksize, random_state=None):
    """
    Train a gensim LdaModel on the token lists in data['tokenized'].

    The number of topics is read from the module-level `num_topics`.

    Parameters
    ----------
    data : DataFrame (or mapping) with a 'tokenized' entry holding one token
        list per document
    chunksize : number of documents per training chunk, forwarded to LdaModel
    random_state : optional seed forwarded to LdaModel so a run can be
        reproduced; the default None leaves gensim's own default in place

    Returns
    -------
    dictionary : gensim Dictionary built from the token lists
    corpus : list of bag-of-words vectors, one per document
    lda : the trained LdaModel
    """
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=1,
                   random_state=random_state)
    t2 = time.time()
    # Report the size of the data actually trained on, not the global `df`.
    print("Time to train LDA model on ", len(data), "businesses: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda
  
# train the topic model
dictionary,corpus,lda = train_lda(df, 300)

# get the topic distribution
# Expand every sparse (topic_id, probability) list to a full-length vector:
# even with minimum_probability=0.0 gensim can leave out vanishingly small
# topics, which would make the rows ragged.
doc_topic_dist = np.array([
    gensim.matutils.sparse2full(topic_probs, lda.num_topics)
    for topic_probs in lda[corpus]
])

In [0]:
def train_lsi(data):
    """
    Train a gensim LsiModel on the token lists in data['tokenized'].

    The number of topics is read from the module-level `num_topics`.

    Parameters
    ----------
    data : DataFrame (or mapping) with a 'tokenized' entry holding one token
        list per document

    Returns
    -------
    dictionary : gensim Dictionary built from the token lists
    corpus : list of bag-of-words vectors, one per document
    lsi : the trained LsiModel
    """
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    lsi = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    t2 = time.time()
    # Report the size of the data actually trained on, not the global `df`.
    print("Time to train LSI model on ", len(data), "businesses: ", (t2-t1)/60, "min")
    return dictionary,corpus,lsi
  
# train the topic model
dictionary,corpus,lsi = train_lsi(df)

# get the topic distribution
# Expand every sparse (topic_id, weight) list to a full-length vector so that
# documents with missing entries (e.g. an empty token list) do not produce a
# ragged array.
# NOTE(review): LSI weights are signed projections, not probabilities, so this
# matrix is presumably not meant as input to jensen_shannon - confirm.
doc_topic_dist = np.array([
    gensim.matutils.sparse2full(topic_weights, lsi.num_topics)
    for topic_weights in lsi[corpus]
])

In [0]:
# Build the query document: every review of one business, preprocessed exactly
# like the corpus documents.
query_name = "Alessia's Ristorante Italiano"
query = pd.read_csv(csv_name)
# Keep only this business's string reviews. Filtering must happen before the
# join below, because ' '.join raises a TypeError on NaN review bodies.
query = query.loc[(query['name'] == query_name) & (query['text'].map(type) == str)]
if query.empty:
    raise ValueError('No text reviews found for "{}" in {}'.format(query_name, csv_name))
query = query.groupby(['name'])['text'].apply(' '.join).reset_index()
query['tokenized'] = query['text'].apply(apply_all)

In [0]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one distribution and every row of a matrix.

    Parameters
    ----------
    query : 1-D array of length T (a topic distribution for one document)
    matrix : 2-D array of shape (M, T), one topic distribution per row

    Returns
    -------
    1-D array of length M holding sqrt(JS divergence) between `query` and each
    row of `matrix`; smaller values mean more similar distributions.
    """
    p = query[:, None]   # column vector (T, 1); broadcasts against every document
    q = matrix.T         # (T, M): one column per document
    m = 0.5 * (p + q)    # pointwise mixture of the two distributions
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # The divergence is non-negative in theory, but rounding can leave it a
    # hair below zero for near-identical distributions; sqrt would then return
    # NaN and argsort would rank the best match last. Clamp at zero instead.
    return np.sqrt(np.maximum(divergence, 0.0))
  
def get_most_similar_documents(query,matrix,k=10):
    """
    Rank the rows of `matrix` by Jensen-Shannon distance to `query` and
    return the positional indices of the k closest rows, nearest first.
    """
    distances = jensen_shannon(query, matrix)
    ranked = distances.argsort() # ascending: smallest distance first
    closest = ranked[:k]
    return closest

In [0]:
# get the ids of the most similar businesses
new_bow = dictionary.doc2bow(query['tokenized'].iloc[0])
new_doc_distribution = gensim.matutils.sparse2full(
    lda.get_document_topics(bow=new_bow, minimum_probability=0.0), lda.num_topics)

# The query vector comes from the LDA model, so it must be compared against LDA
# topic distributions. `doc_topic_dist` is reassigned by every training cell
# (the LSI cell runs last), so the LDA matrix is recomputed here rather than
# trusting whichever model wrote that name most recently. `corpus` is built
# from df['tokenized'] by every train_* function.
lda_doc_topic_dist = np.array([
    gensim.matutils.sparse2full(
        lda.get_document_topics(bow, minimum_probability=0.0), lda.num_topics)
    for bow in corpus
])
most_sim_ids = get_most_similar_documents(new_doc_distribution, lda_doc_topic_dist)

# print the results
# most_sim_ids are row positions ordered by similarity; iloc keeps that ranking
# (index.isin would return the rows in DataFrame order instead).
most_similar_df = df.iloc[most_sim_ids]
print('Similar to "{}": \n{}'.format(query['name'].iloc[0], most_similar_df['name'].reset_index(drop=True)))

In [0]:
# Score the trained LDA model with the c_v coherence measure.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=df['tokenized'],
    model=lda,
)
coherence_score = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_score)

In [0]:
# import pickle

# pickle.dump(doc_topic_dist,open("processor.pkl","wb"))
# preprocessed = pickle.load(open("processor.pkl","rb"))

# most_sim_ids = get_most_similar_documents(new_doc_distribution,preprocessed)

# # print the results
# most_similar_df = df[df.index.isin(most_sim_ids)]
# print('Similar to "{}": \n{}'.format(query['name'][0], most_similar_df['name'].reset_index(drop=True)))

In [0]:
# def process_query(preprocessed, query):
#   # SQL to pandas DataFrame w/ query
#   query = query.groupby(['name'])['text'].apply(' '.join).reset_index()
#   query = query[query['text'].map(type) == str]
#   query.dropna(axis=0, inplace=True, subset=['text'])
#   query['tokenized'] = query['text'].apply(apply_all)
  
#   # read the cached pickle
#   preprocessed = pickle.load(open("processor.pkl","rb"))
  
#   # get the ids of the most similar businesses
#   new_bow = dictionary.doc2bow(query.iloc[0,2])
#   new_doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=new_bow)])
#   most_sim_ids = get_most_similar_documents(new_doc_distribution,preprocessed)

#   # print the results
#   most_similar_df = df[df.index.isin(most_sim_ids)]
#   print('Similar to "{}": \n{}'.format(query['name'][0], most_similar_df['name'].reset_index(drop=True)))

In [0]:
def evaluate_graph(dictionary, corpus, texts, limit, start=60, step=1):
    """
    Train one LDA model per topic count and plot c_v coherence against it.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenized documents, passed to CoherenceModel for the c_v score
    limit : topic limit (exclusive upper bound of the sweep)
    start : first number of topics to try (default 60)
    step : increment between topic counts (default 1)

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(start, limit, step)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
        print('Level {} Complete'.format(num_topics))

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # Pass a list: ("c_v") is just the string "c_v", which legend() treats as a
    # sequence of one-character labels, so the curve was labelled "c".
    plt.legend(["c_v"], loc='best')
    plt.show()

    return lm_list, c_v

In [0]:
%%time
# Sweep LDA topic counts from 60 up to (but not including) 70 and plot the
# c_v coherence of each model; keeps the trained models and their scores.
lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=df['tokenized'], limit=70)