In [1]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
import os
from gensim.matutils import kullback_leibler, jaccard, hellinger
import logging
import warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK resources this notebook relies on: the perceptron POS tagger,
# the stop-word lists and the punkt tokenizer data.
for _resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(_resource)
print('Downloads Complete')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...


Downloads Complete


[nltk_data]   Package punkt is already up-to-date!


In [6]:
# English stop-word list from NLTK; consulted by remove_stop_words().
stop_words = stopwords.words('english')
# Single shared Porter stemmer instance; used by stem_words().
stemmer = PorterStemmer()

def initial_clean(text):
    """
    Normalise one raw review string and split it into word tokens.

    Steps, in order:
      1. replace every whitespace-delimited chunk containing "http"/"https",
         "www" or "@" (links, e-mail addresses, handles) with a space;
      2. collapse every run of whitespace (including newlines and tabs) into a
         single space;
      3. delete every character that is not an ASCII letter or a space
         (so "don't" becomes "dont");
      4. lower-case the text;
      5. tokenize with nltk.word_tokenize.

    Parameters:
    ----------
    text : str, raw text

    Returns:
    -------
    list of str : lower-case tokens
    """
    # Raw string literals: "\S" and "\@" are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Newlines/tabs must become spaces *before* the letter filter below,
    # otherwise they are deleted and the words on either side get glued
    # together ("favorite!\nWe've" -> "favoriteWeve").
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case the text
    return nltk.word_tokenize(text)

def remove_stop_words(text):
    """Return the tokens of `text`, in order, that are not in the module-level `stop_words`."""
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def pos(word):
    """Return the part-of-speech tag nltk.pos_tag assigns to `word` tagged on its own."""
    tagged = nltk.pos_tag([word])
    first_pair = tagged[0]  # (word, tag) pair for the only input token
    return first_pair[1]

# POS tags (as produced by nltk.pos_tag) whose words are kept by
# remove_uninformative_pos(); words carrying any other tag are dropped.
informative_pos = (
    'JJ', 'VB', 'NN', 'RBS',
    'VBP', 'IN', 'RBR', 'JJR',
    'JJS', 'PDT', 'RP', 'UH',
    'FW', 'NNS', 'VBN', 'VBG',
)

def remove_uninformative_pos(text):
    """
    Tag the token list `text` with nltk.pos_tag and return, in order, only the
    words whose tag is listed in the module-level `informative_pos`.
    """
    kept = []
    for token, tag in nltk.pos_tag(text):
        if tag in informative_pos:
            kept.append(token)
    return kept
  
# Words dropped by remove_garbage(). In apply_all() that step runs before
# stemming, so the entries are unstemmed surface forms without apostrophes
# (e.g. 'dont', 'ive').
clutter = [
    'food', 'place', 'good', 'order', 'great', 'like',
    'service', 'time', 'go', 'ordered', 'get', 'love',
    'best', 'come', 'eat', 'dont', 'tried', 'try', 'ask',
    'nice', 'restaurant', 'ive', 'im', 'didnt',
]

def remove_garbage(text):
    """Return the tokens of `text`, in order, that are not listed in the module-level `clutter`."""
    return list(filter(lambda token: token not in clutter, text))

def stem_words(text):
    """
    Stem every token with the module-level Porter `stemmer` and drop stems of
    length 1 or less.

    The stemmer can raise IndexError on a few inputs (the word "oed" did).
    Such a token is kept unstemmed; the remaining tokens are still stemmed and
    length-filtered, instead of the whole document being returned untouched.

    Parameters:
    ----------
    text : iterable of str tokens

    Returns:
    -------
    list of str : stemmed tokens, each longer than one character
    """
    stemmed = []
    for word in text:
        try:
            stem = stemmer.stem(word)
        except IndexError:
            # Fall back to the raw token for this one word only.
            stem = word
        if len(stem) > 1:  # make sure we have no 1 letter words
            stemmed.append(stem)
    return stemmed

def apply_all(text):
    """
    Run the full preprocessing pipeline on one raw text string.

    Order: initial_clean -> remove_stop_words -> remove_uninformative_pos ->
    remove_garbage -> stem_words. Returns the resulting token list.
    """
    tokens = initial_clean(text)
    tokens = remove_stop_words(tokens)
    tokens = remove_uninformative_pos(tokens)
    tokens = remove_garbage(tokens)
    return stem_words(tokens)

def get_top_k_words(df, k = 10000):
    """
    Return the set of the `k` most frequent tokens across df['tokenized'].

    Parameters:
    ----------
    df : object whose 'tokenized' entry is an iterable of token lists
    k : maximum number of distinct words to keep

    Returns:
    -------
    set of str : at most `k` words; an empty set when there are no tokens at all
    """
    # flatten every document's token list into one list of words
    all_words = [word for item in list(df['tokenized']) for word in item]

    # use nltk fdist to get a frequency distribution of all words
    fdist = FreqDist(all_words)

    # A set comprehension (rather than `zip(*...)` unpacking) also works when
    # the distribution is empty; unpacking raised ValueError in that case.
    return {word for word, _ in fdist.most_common(k)}

def keep_top_k_words(text, *top_k_words):
    """
    Return the tokens of `text`, in order, that belong to the allowed vocabulary.

    The vocabulary may be given either as individual positional words
    (`keep_top_k_words(tokens, 'pizza', 'taco')`, which is what
    `Series.apply(..., args=<set>)` produces by unpacking the set) or as one
    set/frozenset/list/tuple (`keep_top_k_words(tokens, vocab)`).

    Parameters:
    ----------
    text : iterable of str tokens
    *top_k_words : allowed words, or a single collection of allowed words

    Returns:
    -------
    list of str : the tokens of `text` found in the vocabulary
    """
    if len(top_k_words) == 1 and isinstance(top_k_words[0], (set, frozenset, list, tuple)):
        allowed = frozenset(top_k_words[0])
    else:
        # Hash the vocabulary once: scanning a tuple of thousands of words for
        # every token is needlessly slow.
        allowed = frozenset(top_k_words)
    return [word for word in text if word in allowed]

def transform_dataset(df):
    """
    Collapse a per-review frame into one row per business.

    Rows whose 'text' is not a string (e.g. NaN) are discarded first, then the
    remaining review texts of every (business_id, name) pair are joined with a
    single space. The filter has to run before the join: `' '.join` raises
    TypeError on a non-string value, and after the join every value is a string
    so a later filter could never remove anything.

    The input frame is not modified.

    Parameters:
    ----------
    df : DataFrame with 'business_id', 'name' and 'text' columns

    Returns:
    -------
    DataFrame with columns 'business_id', 'name', 'text' (one row per business
    that has at least one string review)
    """
    is_text = df['text'].map(lambda value: isinstance(value, str)).astype(bool)
    reviews = df[is_text]
    # format the columns
    return reviews.groupby(['business_id', 'name'])['text'].apply(' '.join).reset_index()

def gen_tokenized_column(df):
    """
    Add a 'tokenized' column to `df` (in place) and return `df`.

    Each 'text' value is run through apply_all(); afterwards every token list is
    restricted to the vocabulary returned by get_top_k_words(df).
    """
    # preprocess the text and create new column "tokenized"
    df['tokenized'] = df['text'].apply(apply_all)
    vocabulary = get_top_k_words(df)
    # keep_top_k_words takes the vocabulary as *individual* positional words,
    # so the words are handed to `apply` as a tuple of extra arguments.
    df['tokenized'] = df['tokenized'].apply(keep_top_k_words, args=tuple(vocabulary))
    return df

def preprocess_dataset(df):
    """
    Run transform_dataset() then gen_tokenized_column() on `df`, print the
    elapsed wall-clock time in minutes, and return the processed frame.

    The count in the printed message is len(df) of the *input* frame.
    """
    started = time.time()
    grouped = transform_dataset(df)
    preprocessed_df = gen_tokenized_column(grouped)
    finished = time.time()
    print("Time to clean and tokenize", len(df), "businesses' reviews:", (finished-started)/60, "min")
    return preprocessed_df
    
def get_coherence_score(model, texts, dictionary):
    """Return the 'c_v' coherence that gensim's CoherenceModel computes for `model` on `texts`."""
    return CoherenceModel(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

def get_dictionary_corpus(data, no_below=5, no_above=0.1):
    """
    Build a gensim Dictionary from the token lists in `data`, prune it with
    filter_extremes(no_below, no_above), and convert every document to
    bag-of-words form with the pruned dictionary.

    Returns:
    -------
    (dictionary, corpus) : the Dictionary and a list with one doc2bow result per document
    """
    vocab = corpora.Dictionary(data)
    vocab.filter_extremes(no_below=no_below, no_above=no_above)
    bows = []
    for tokens in data:
        bows.append(vocab.doc2bow(tokens))
    return vocab, bows

def get_perplexity(model, corpus):
    """
    Return `model.log_perplexity(corpus)` for the model that was passed in.

    Note on interpretation: gensim documents this value as a per-word likelihood
    bound, not the perplexity itself (perplexity = 2 ** (-bound)). Lower
    perplexity is better, which corresponds to a *higher* returned bound.

    Parameters:
    ----------
    model : trained model exposing `log_perplexity` (e.g. a gensim LdaModel)
    corpus : bag-of-words corpus to evaluate on
    """
    # Use the `model` argument; the previous body read a global `lda_model`
    # and ignored the model it was given.
    return model.log_perplexity(corpus)

def train_lda(corpus, id2word, chunksize=2000, num_topics=12, alpha='auto', eta='auto', passes=1, iterations=50,
              minimum_probability=0.01, eval_every=10, random_state=None):
    """
    Train a gensim LdaModel on `corpus`, print the training time in minutes and
    return the model.

    Every keyword argument is forwarded unchanged to LdaModel.
    Rule of thumb for the priors: a low alpha means each document is represented
    by only a few topics; a low eta means each topic is represented by only a
    few words (and vice versa in both cases).
    """
    lda_settings = dict(
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
        alpha=alpha,
        eta=eta,
        chunksize=chunksize,
        minimum_probability=minimum_probability,
        passes=passes,
        iterations=iterations,
        eval_every=eval_every,
        random_state=random_state,
    )

    started = time.time()
    lda = LdaModel(**lda_settings)
    finished = time.time()
    print("Time to train LDA model on businesses: ", (finished-started)/60, "min")

    return lda

def train_hdp(corpus, id2word, chunksize=2000, T=150, K=15, random_state=None):
    """
    Train a gensim HdpModel on `corpus`, print the training time in minutes and
    return the model.

    `T`, `K`, `chunksize` and `random_state` are forwarded unchanged to HdpModel.
    """
    hdp_settings = dict(
        corpus=corpus,
        id2word=id2word,
        T=T,
        K=K,
        chunksize=chunksize,
        random_state=random_state,
    )
    started = time.time()
    hdp = HdpModel(**hdp_settings)
    finished = time.time()
    print("Time to train HDP model on businesses: ", (finished-started)/60, "min")

    return hdp

def train_lsi(corpus, id2word, num_topics=12, chunksize=2000, onepass=True, power_iters=2, extra_samples=100):
    """
    Train a gensim LsiModel on `corpus`, print the training time in minutes and
    return the model.

    Parameters:
    ----------
    corpus : bag-of-words corpus
    id2word : gensim dictionary mapping ids to words
    num_topics : number of latent dimensions to keep
    chunksize : number of documents processed per training chunk
    onepass : forwarded to LsiModel; False selects the multi-pass algorithm
    power_iters : forwarded to LsiModel (number of power iteration steps)
    extra_samples : forwarded to LsiModel (oversampling beyond the requested rank)

    Returns:
    -------
    The trained LsiModel.
    """
    t1 = time.time()
    # onepass / power_iters / extra_samples used to be accepted but silently
    # dropped, so sweeping them always trained the same default model.
    lsi = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word, chunksize=chunksize,
                   onepass=onepass, power_iters=power_iters, extra_samples=extra_samples)
    t2 = time.time()
    print("Time to train LSI model on businesses: ", (t2-t1)/60, "min")

    return lsi

def get_most_similar_documents(query, corpus, dictionary, k=10):
    """
    Return the positions of the `k` entries of `corpus` with the smallest
    Kullback-Leibler distance from `query`.

    Distances are computed with gensim's kullback_leibler using
    num_features=len(dictionary); the result is a numpy array of indices
    ordered from closest to farthest.
    """
    n_features = len(dictionary)
    distances = np.array([
        kullback_leibler(query, candidate, num_features=n_features)
        for candidate in corpus
    ])
    return distances.argsort()[:k]

def get_topic_dist(model, corpus):
    """
    Return a dense (n_documents, n_topics) array of topic weights.

    `model[corpus]` yields, per document, a list of (topic_id, weight) pairs in
    which some topics may be missing. Each weight is therefore written into the
    column given by its topic id and absent topics are left at 0. Building the
    array from the weights alone produced ragged rows and columns that did not
    line up with topic ids whenever a topic was missing.

    The number of columns is `model.num_topics` when the model has that
    attribute, widened if a larger topic id is encountered; otherwise it is the
    largest topic id seen plus one.
    """
    doc_topics = [list(doc) for doc in model[corpus]]

    width = getattr(model, 'num_topics', None) or 0
    for doc in doc_topics:
        for topic_id, _ in doc:
            width = max(width, topic_id + 1)

    doc_topic_dist = np.zeros((len(doc_topics), width))
    for row, doc in enumerate(doc_topics):
        for topic_id, weight in doc:
            doc_topic_dist[row, topic_id] = weight
    return doc_topic_dist

def get_most_similar_businesses(query_data, corpus, dictionary, model):
    """
    Return the indices of the corpus documents most similar to `query_data`.

    The query tokens are converted with dictionary.doc2bow, both the query and
    the corpus are transformed by `model`, and the ranking is delegated to
    get_most_similar_documents() with its default `k`.
    """
    query_topics = model[dictionary.doc2bow(query_data)]
    corpus_topics = model[corpus]
    return get_most_similar_documents(query_topics, corpus_topics, dictionary)


In [7]:
# Find optimal number of topics by computing coherence score for LDA
def select_num_topics_LDA(dictionary, corpus, texts, end, start=3, step=3):
    """
    Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate counts are start, start + step, ... up to and including `end`.
    Every model is trained through train_lda() with random_state=0.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    end : Max num of topics
    start : Min num of topics
    step : Increment between consecutive candidate counts

    Returns:
    -------
    model_list : List of LDA topic models, one per candidate count
    coherence_values : c_v coherence of each model, in the same order
    """
    candidate_counts = range(start, end + 1, step)
    model_list, coherence_values = [], []

    for num_topics in candidate_counts:
        lda = train_lda(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=0)
        model_list.append(lda)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())
        print('progress: num of topics: ', num_topics)

    return model_list, coherence_values


In [8]:

def select_num_topics_HDP(dictionary, corpus, texts, end, start=3, step=3):
    """
    Compute c_v coherence of HDP models for various topic counts.

    For every candidate count an HDP model is trained through train_hdp() with
    T set to that count and random_state=0. Its topics are read back with
    show_topics() and scored as plain word lists.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    end : Max num of topics
    start : Min num of topics
    step : Increment between consecutive candidate counts

    Returns:
    -------
    model_list : List of HDP topic models, one per candidate count
    coherence_values : Coherence values corresponding to the HDP model with respective number of topics
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, end + 1, step):
        model = train_hdp(corpus=corpus, id2word=dictionary, T=num_topics, random_state=0)
        model_list.append(model)

        # Keep only the words of each topic; the weights are not needed for coherence.
        topics = [
            [word for word, _ in topic]
            for _, topic in model.show_topics(num_topics=num_topics, formatted=False)
        ]

        coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
        print('progress: num of topics: ', num_topics)

    # Return the full list of models (previously only the last trained model
    # was returned, unlike the LDA/LSI counterparts and the documented contract).
    return model_list, coherence_values


In [9]:
# Find optimal number of topics by computing coherence score for LSI
def select_num_topics_LSI(dictionary, corpus, texts, end, start=3, step=3):
    """
    Train one LSI model per candidate topic count and score each with c_v coherence.

    Candidate counts are start, start + step, ... up to and including `end`.
    Each model is trained through train_lsi(); its topics are read back with
    show_topics() and scored as plain word lists.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    end : Max num of topics
    start : Min num of topics
    step : Increment between consecutive candidate counts

    Returns:
    -------
    model_list : List of LSI topic models, one per candidate count
    coherence_values : c_v coherence of each model, in the same order
    """
    candidate_counts = range(start, end + 1, step)
    model_list, coherence_values = [], []

    for num_topics in candidate_counts:
        lsi = train_lsi(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(lsi)

        # Keep only the words of each topic; the weights are not needed for coherence.
        topic_words = [
            [word for word, _ in weighted_words]
            for _, weighted_words in lsi.show_topics(num_topics=num_topics, formatted=False)
        ]

        scorer = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary,
                                coherence='c_v')
        coherence_values.append(scorer.get_coherence())
        print('progress: num of topics: ', num_topics)

    return model_list, coherence_values


In [15]:
# Tune the num_topics hyperparameter: compare c_v coherence of LSI, LDA and HDP on mesa_5000.csv
def eval_top_k_words():
    """
    Compare c_v coherence of LSI, LDA and HDP over 3..30 topics (step 3) on
    'mesa_5000.csv' and save the three curves to 'num_topics.png'.

    Despite its name, this function sweeps the number of topics; it does not
    vary the top-k vocabulary size. It takes no arguments and returns nothing.
    """
    df = pd.read_csv('mesa_5000.csv', index_col=0)
    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell. Only run this on a file you produced yourself; ast.literal_eval
    # would be the safe way to parse stored list literals.
    df['tokenized'] = df['tokenized'].apply(eval)
    dictionary, corpus = get_dictionary_corpus(df['tokenized'])
    print('preprocessing finished')

    sweep_args = dict(dictionary=dictionary, corpus=corpus, texts=df['tokenized'], end=30)
    _, lsi_coherence = select_num_topics_LSI(**sweep_args)
    _, lda_coherence = select_num_topics_LDA(**sweep_args)
    _, hdp_coherence = select_num_topics_HDP(**sweep_args)

    # Must match the default start=3 / step=3 used by the sweeps with end=30.
    topic_counts = range(3, 31, 3)
    fig, ax = plt.subplots(figsize=(8,5))
    curves = (
        (lda_coherence, 'r', 'lda'),
        (hdp_coherence, 'g', 'hdp'),
        (lsi_coherence, 'b', 'lsi'),
    )
    for scores, colour, label in curves:
        ax.plot(topic_counts, scores, color=colour, label=label)
    ax.legend(loc="best")
    ax.set(xlabel='Num of topics', ylabel='Coherence values')
    ax.set_title('Topic coherence for different topics for LDA, HDP, and LSI in mesa_5000')
    fig.savefig('num_topics.png')
    print('computing finished')
        
#eval_top_k_words()


In [None]:
# K parameters in HDP model 

def HDP_K(corpus, dictionary, texts, num_topics):
    """
    Sweep HdpModel's `K` argument over 5, 10, ..., 35 with T fixed to
    `num_topics`, score every model with c_v coherence, and save the curve to
    'HDP_num_words.png'.

    NOTE(review): the plot labels the x axis 'Num of words in each topic';
    gensim documents K as the second-level truncation level, so confirm the
    label says what is intended.

    Returns:
    -------
    model_list : the trained HDP models, one per K
    coherence_values : c_v coherence of each model, in the same order
    """
    k_values = range(5, 36, 5)
    model_list, coherence_values = [], []

    for k in k_values:
        hdp = train_hdp(corpus=corpus, id2word=dictionary, T=num_topics, K=k, random_state=0)
        model_list.append(hdp)

        # Keep only the words of each topic; the weights are not needed for coherence.
        topic_words = [
            [word for word, _ in weighted_words]
            for _, weighted_words in hdp.show_topics(num_topics=num_topics, formatted=False)
        ]

        scorer = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    fig, ax = plt.subplots(figsize=(8,5))
    ax.plot(k_values, coherence_values)
    ax.set(xlabel='Num of words in each topic', ylabel='Coherence values')
    ax.set_title('HDP model topic coherence for different words in each topic')
    fig.savefig('HDP_num_words.png')

    return model_list, coherence_values


#HDP_K(corpus=corpus, dictionary=dictionary, texts=df['tokenized'], num_topics=12)

In [18]:
# Set onepass=False, tune power_iters parameter for LSI model
def tune_power_iters(dictionary, corpus, num_topics, texts):
    """
    Sweep `power_iters` over 20, 25, ..., 50 for multi-pass LSI (onepass=False),
    score every model with c_v coherence, and save the curve to
    'LSI_num_iters.png'.

    The sweep is only meaningful if train_lsi() forwards `onepass` and
    `power_iters` to the underlying LsiModel.

    Returns:
    -------
    model_list : the trained LSI models, one per setting
    coherence_values : c_v coherence of each model, in the same order
    """
    iteration_counts = range(20, 51, 5)
    model_list, coherence_values = [], []

    for num_iter in iteration_counts:
        lsi = train_lsi(corpus=corpus, id2word=dictionary, num_topics=num_topics, onepass=False,
                        power_iters=num_iter)
        model_list.append(lsi)

        scorer = CoherenceModel(model=lsi, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    fig, ax = plt.subplots(figsize=(8,5))
    ax.plot(iteration_counts, coherence_values)
    ax.set(xlabel='Num of iterations', ylabel='Coherence values')
    ax.set_title('LSI model topic coherence for different iterations')
    fig.savefig('LSI_num_iters.png')

    return model_list, coherence_values

# tune_power_iters(dictionary=dictionary, corpus=corpus, num_topics=6, texts=df['tokenized'])

In [20]:
# Set onepass=False, tune extra_samples parameter for LSI model
def tune_extra_samples(dictionary, corpus, num_topics, num_iter, texts):
    """
    Sweep `extra_samples` over 100, 130, ..., 280 for multi-pass LSI
    (onepass=False, power_iters=num_iter), score every model with c_v
    coherence, and save the curve to 'LSI_num_extra_samples.png'.

    The sweep is only meaningful if train_lsi() forwards `onepass`,
    `power_iters` and `extra_samples` to the underlying LsiModel.

    Returns:
    -------
    model_list : the trained LSI models, one per setting
    coherence_values : c_v coherence of each model, in the same order
    """
    sample_counts = range(100, 300, 30)
    model_list, coherence_values = [], []

    for num_sample in sample_counts:
        lsi = train_lsi(corpus=corpus, id2word=dictionary, num_topics=num_topics, onepass=False,
                        power_iters=num_iter, extra_samples=num_sample)
        model_list.append(lsi)

        scorer = CoherenceModel(model=lsi, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    fig, ax = plt.subplots(figsize=(8,5))
    ax.plot(sample_counts, coherence_values)
    ax.set(xlabel='Num of extra samples', ylabel='Coherence values')
    ax.set_title('LSI model topic coherence for different extra samples')
    fig.savefig('LSI_num_extra_samples.png')

    return model_list, coherence_values

#tune_extra_samples(dictionary, corpus, num_topics=6, num_iter=40, texts=df['tokenized'])

In [23]:
# Send INFO-level log records to lda_model.log.
logging.basicConfig(
    filename='lda_model.log',
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Grid explored by LDA_grid_search(): every (passes, iterations) combination.
passes = [10, 20, 30, 40, 50]
iterations = [60, 70, 80, 90, 100]

# Set eval_every to 1 and do grid search of passes and iterations for LDA model
def LDA_grid_search(corpus, dictionary, texts, num_topics):
    """
    Train one LDA model for every (passes, iterations) combination taken from
    the module-level `passes` and `iterations` lists, score each with c_v
    coherence, and save the curve to 'LDA_num_passes_iters.png'.

    Every model is trained through train_lda() with eval_every=1 and
    random_state=0. The x axis of the plot is the position of the pair in
    `pass_iter_pairs` (passes vary slowest).

    Returns:
    -------
    pass_iter_pairs : list of (num_pass, num_iter) tuples, in training order
    model_list : the trained LDA models, in the same order
    coherence_values : c_v coherence of each model, in the same order
    """
    pass_iter_pairs = [(num_pass, num_iter) for num_pass in passes for num_iter in iterations]
    model_list, coherence_values = [], []

    for num_pass, num_iter in pass_iter_pairs:
        lda = train_lda(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=num_pass,
                        iterations=num_iter, eval_every=1, random_state=0)
        model_list.append(lda)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    fig, ax = plt.subplots(figsize=(8,5))
    ax.plot(range(0, len(pass_iter_pairs)), coherence_values)
    ax.set(xlabel='index of pass iter pairs', ylabel='Coherence values')
    ax.set_title('LDA coherence values of different combinations of passes and iterations')
    fig.savefig('LDA_num_passes_iters.png')

    return pass_iter_pairs, model_list, coherence_values



# pi_pairs, models, coherence_values = LDA_grid_search(corpus=corpus, dictionary=dictionary, texts=df['tokenized'], num_topics=12)
# print('coherence values: ', coherence_values)
# indicies = np.argsort(np.array(coherence_values))
# print('indices: ', indicies)


In [24]:
# Demo: find the businesses whose topic mixture is closest to one query business.
csv_name = 'mesa_5000.csv'
query = pd.read_csv(csv_name, index_col=0)
# Keep only the row(s) of the query business.
query = query.loc[query['name']=="Alessia's Ristorante Italiano"]
# NOTE(review): eval() executes the CSV cell contents; only safe on a trusted file.
query['tokenized'] = query['tokenized'].apply(eval)

# The same CSV is read a second time to build the full corpus.
df = pd.read_csv('mesa_5000.csv', index_col=0)
df['tokenized'] = df['tokenized'].apply(eval)
dictionary, corpus = get_dictionary_corpus(df['tokenized'])

# NOTE(review): despite the name `lsi_model`, this is an HDP model (train_hdp).
# No random_state is passed, so train_hdp uses its default of None.
lsi_model = train_hdp(corpus=corpus, id2word=dictionary, T=12)
# Bare expression in the middle of a cell: evaluated but not displayed.
lsi_model[corpus][0]
query_bow = lsi_model.id2word.doc2bow(query.iloc[0]['tokenized'])
# Bare expression in the middle of a cell: evaluated but not displayed.
lsi_model[query_bow]
# Indices (row positions in `df`) of the closest businesses by KL distance.
most_sim_ids = get_most_similar_businesses(query.iloc[0]['tokenized'], corpus, dictionary, lsi_model)
tmp = df.iloc[most_sim_ids,:]
# Last expression of the cell: rendered as the cell output.
tmp

Time to train HDP model on businesses:  0.01051390568415324 min


Unnamed: 0,business_id,name,latitude,longitude,address,stars,text,tokenized
89,IXUwuNjy707wJNW2U4sRVg,Alessia's Ristorante Italiano,33.436137,-111.716897,5251 E Brown Rd,4.5,I like Italian food. It's probably the only fo...,"[like, italian, food, food, take, home, left, ..."
271,xTW5PkLEdMBs2f2W8RGy0g,Miele's Italian and Banquet Hall,33.364794,-111.878331,"2050 W Guadalupe Rd, Ste 9",4.0,I have tried this place now twice for delivery...,"[tri, place, deliveri, pickup, pizza, delici, ..."
267,wctvZYbHAo8jufqAFQ457g,The Hub Grill and Bar,33.381584,-111.807068,1860 S Stapley Dr,4.0,Went here for lunch and had the same awesome w...,"[lunch, awesom, waitress, locat, time, ask, su..."
192,iJBnqweAPDTCfyMcRrG90w,Giant Hamburgers,33.406408,-111.771654,"2753 E Broadway Rd, Ste 104",4.5,"Amazing amazing amazing. Everyone, you must t...","[amaz, amaz, amaz, everyon, tri, small, place,..."
191,iBvF9Oy9UdOrXvTlxNHyqw,La Casa de Juana,33.393456,-111.872695,1976 W Southern Ave,4.0,I think we've got a new local favorite!\nWe've...,"[think, weve, new, local, tri, mexican, place,..."
190,iBCMaNm_hv9IlCDa7AWPig,Mango's Mexican Cafe,33.415444,-111.833145,44 W Main St,4.0,I love this spot. Best chips and salsa in the ...,"[love, spot, best, chip, east, valley, enjoy, ..."
189,i066yR2IDP4FWt6p-k9aFg,Ike's Love & Sandwiches,33.390528,-111.85576,"1130 W Grove Ave, Ste 110",4.5,OMG. My son went to Ike's in California and we...,"[omg, son, ike, california, mesa, count, mont,..."
188,hRFKKf8jBnn4paxnNUK1hA,Hooters,33.383454,-111.857178,1665 S Alma School Rd,2.5,It appears that this location has trouble coun...,"[locat, troubl, count, go, order, wing, short,..."
193,ihheHS4noJayWgECQpeJ_A,Teharu Sushi,33.385165,-111.687144,"6638 E Superstition Springs Blvd, Ste 101",3.0,I first found out about this place by word of ...,"[found, place, word, mouth, bank, month, gone,..."
0,-3oxnPPPU3YoxO9M1I2idg,Eklectic Pie - Mesa,33.379912,-111.806297,"1859 S Stapley Dr, Ste 105-3",4.0,Usually the pizza is really good and the servi...,"[pizza, good, servic, amaz, pizza, finish, hom..."


In [14]:
import pickle

def write_to_pickle():
    """
    Build the gensim dictionary for 'mesa_5000.csv' and pickle it to
    'mesa_dictionary.pkl' in the current working directory.

    Takes no arguments and returns nothing. The bag-of-words corpus produced
    alongside the dictionary is discarded.
    """
    df = pd.read_csv('mesa_5000.csv', index_col=0)
    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell. Only run this on a file you produced yourself; ast.literal_eval
    # would be the safe way to parse stored list literals.
    df['tokenized'] = df['tokenized'].apply(eval)
    dictionary, _ = get_dictionary_corpus(df['tokenized'])
    # The context manager flushes and closes the file even if pickling fails;
    # the bare open() previously left the handle to the garbage collector.
    with open("mesa_dictionary.pkl", "wb") as handle:
        pickle.dump(dictionary, handle)
