In [1]:
# import dependencies
import ast
import logging
import pickle
import re
import time
import warnings

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim import models, corpora, similarities
from gensim.matutils import kullback_leibler, jaccard, hellinger
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Download the NLTK data packages, in the same order as before.
for nltk_package in ('averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(nltk_package)
print('Downloads Complete')

Downloads Complete


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def get_most_similar_documents(query, corpus, dictionary, k=10):
    """Return the positions of the ``k`` corpus entries closest to ``query``.

    Closeness is measured with gensim's ``kullback_leibler`` divergence,
    evaluated from ``query`` to every entry of ``corpus``; a smaller value
    means a closer match.

    Args:
        query: Representation of the query document; passed as the first
            argument to ``kullback_leibler``.
        corpus: Iterable of document representations compared against
            ``query`` one at a time.
        dictionary: Sized object; ``len(dictionary)`` is forwarded as
            ``num_features``.
        k: Maximum number of positions to return.

    Returns:
        A numpy array holding at most ``k`` positions into ``corpus``,
        ordered by increasing divergence.
    """
    divergences = np.array(
        [kullback_leibler(query, doc, num_features=len(dictionary)) for doc in corpus]
    )
    # argsort is ascending, so the head of the ordering holds the best matches.
    return np.argsort(divergences)[:k]

def get_most_similar_businesses(query_data, corpus, dictionary, model, k=10):
    """Find the corpus entries most similar to a tokenized query.

    The query tokens are converted to bag-of-words with ``dictionary``, both
    the query and the corpus are transformed by indexing ``model``, and the
    ranking is delegated to ``get_most_similar_documents``.

    Args:
        query_data: Tokens of the query document, as accepted by
            ``dictionary.doc2bow``.
        corpus: Bag-of-words corpus to search; transformed via
            ``model[corpus]``.
        dictionary: Object providing ``doc2bow`` and ``len()``.
        model: Object indexable by a bag-of-words vector or a whole corpus
            (presumably a trained gensim topic model -- confirm with callers).
        k: Maximum number of results. Defaults to 10, which is what the
            function always returned before this parameter existed.

    Returns:
        A numpy array with the positions in ``corpus`` of the closest
        entries, nearest first.
    """
    query_bow = dictionary.doc2bow(query_data)
    query_repr = model[query_bow]
    corpus_repr = model[corpus]
    return get_most_similar_documents(query_repr, corpus_repr, dictionary, k=k)

def get_dictionary_corpus(data, no_below=5, no_above=0.1, filter_extremes=False):
    """Build a gensim dictionary and bag-of-words corpus from tokenized docs.

    Args:
        data: Re-iterable collection of token lists, one per document. It is
            traversed twice (once to build the dictionary, once to build the
            corpus), so a one-shot generator will not work.
        no_below: Forwarded to ``Dictionary.filter_extremes`` (drop tokens
            that appear in fewer than this many documents). Only used when
            ``filter_extremes`` is true.
        no_above: Forwarded to ``Dictionary.filter_extremes`` (drop tokens
            that appear in more than this fraction of documents). Only used
            when ``filter_extremes`` is true.
        filter_extremes: When true, prune the vocabulary with ``no_below`` /
            ``no_above`` before building the corpus. Defaults to False
            because the pruning call was previously disabled, which left the
            two threshold arguments silently ignored; the default keeps that
            behaviour.

    Returns:
        Tuple ``(dictionary, corpus)`` where ``corpus`` is a list with one
        ``dictionary.doc2bow`` result per document in ``data``.
    """
    dictionary = corpora.Dictionary(data)
    if filter_extremes:
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(doc) for doc in data]
    return dictionary, corpus

In [29]:
def train_lda(corpus, id2word, chunksize=2000, num_topics=10, alpha='auto', eta='auto', passes=1, iterations=50,
              minimum_probability=0.01, eval_every=10, random_state=None):
    """
    Fit a gensim ``LdaModel`` on ``corpus`` and print the training time.

    ``corpus`` and ``id2word`` are passed straight to ``LdaModel``; every
    other argument is a training hyperparameter forwarded unchanged under
    the same keyword name.

    Returns the trained ``LdaModel``. The elapsed wall-clock time, in
    minutes, is printed as a side effect.
    """
    # alpha: a low value means each document is represented by only a few
    #        topics, a high value by many.
    # eta:   a low value means each topic is represented by only a few
    #        words, a high value by many.
    hyperparameters = dict(
        num_topics=num_topics,
        alpha=alpha,
        eta=eta,
        chunksize=chunksize,
        minimum_probability=minimum_probability,
        passes=passes,
        iterations=iterations,
        eval_every=eval_every,
        random_state=random_state,
    )

    started = time.time()
    lda = LdaModel(corpus=corpus, id2word=id2word, **hyperparameters)
    elapsed_minutes = (time.time() - started) / 60

    print("Time to train LDA model on businesses: ", elapsed_minutes, "min")
    return lda

In [30]:
# Cleaned review CSVs to process; each file name is later reused as the
# prefix of the pickle files written for it.
csv_names = [
    "pittsburgh_reviews_cleaned.csv",
    "mesa_reviews_cleaned.csv",
    "charlotte_reviews_cleaned.csv",
]

In [31]:
# For every review CSV: rebuild the dictionary and bag-of-words corpus, train
# an LDA model, and pickle all three using the CSV file name as the prefix.
# After the loop, `df`, `dictionary`, `corpus` and `lda_model` remain bound to
# the results for the LAST file in `csv_names`.
for csv in csv_names:
    df = pd.read_csv(csv, index_col=0)
    # The 'tokenized' column is parsed back from its string form.
    # ast.literal_eval accepts Python literals only, so file content cannot
    # run arbitrary code the way the previous eval() call allowed.
    df['tokenized'] = df['tokenized'].apply(ast.literal_eval)
    dictionary, corpus = get_dictionary_corpus(df['tokenized'])
    lda_model = train_lda(corpus=corpus, id2word=dictionary)

    artifacts = (
        ("_dictionary.pkl", dictionary),
        ("_corpus.pkl", corpus),
        ("_lda.pkl", lda_model),
    )
    for suffix, artifact in artifacts:
        # The context manager closes each file deterministically, so the
        # pickle is fully flushed before the next step.
        with open(csv + suffix, "wb") as handle:
            pickle.dump(artifact, handle)
    print(csv,"done")
    

Time to train LDA model on businesses:  0.16230140527089437 min
pittsburgh_reviews_cleaned.csv done
Time to train LDA model on businesses:  0.07581223646799723 min
mesa_reviews_cleaned.csv done
Time to train LDA model on businesses:  0.16315337419509887 min
charlotte_reviews_cleaned.csv done


In [32]:
# Set pyLDAvis up for notebook display.
pyLDAvis.enable_notebook()
# lda_model / corpus / dictionary are the globals left behind by the training
# loop, i.e. the ones built from the last CSV it processed.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
vis

'accomod'