In [1]:
# config
# path of the reviews CSV that is loaded with pd.read_csv
csv_name = 'reviews_pittsburgh.csv'
# vocabulary size: only the k most frequent tokens are kept in 'tokenized'
k = 15000
# number of topics passed to LdaModel in train_lda
num_topics = 5

In [None]:
# import dependencies
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models.coherencemodel import CoherenceModel
sns.set_style("darkgrid")
nltk.download('stopwords')
nltk.download('punkt')
print('Downloads Complete')

In [0]:
def initial_clean(text):
    """
    Clean a raw string and split it into word tokens.

    Websites, email addresses and any character that is not an ASCII letter
    are removed, the text is lower-cased and then tokenized with
    nltk.word_tokenize.

    :param text: raw text (str)
    :return: list of tokens as produced by nltk.word_tokenize
    """
    # drop every whitespace-delimited chunk containing "http(s)", "www" or "@"
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # newlines/tabs separate words too: turn them into plain spaces first,
    # otherwise the next step would delete them and glue neighbouring words together
    text = re.sub(r"\s+", " ", text)
    # remove everything that is neither a letter nor a space (digits, punctuation, ...)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = text.lower() # lower case the text
    return nltk.word_tokenize(text)

# English stop-word list from the NLTK stopwords corpus; read by remove_stop_words
stop_words = stopwords.words('english')
def remove_stop_words(text):
    """
    Remove every stop word from a list of tokens.

    :param text: iterable of tokens
    :return: new list with the tokens that are not in ``stop_words``, order preserved
    """
    # Build a set once per call: membership tests become O(1) instead of a scan
    # of the whole stop-word list for every token. It is rebuilt on each call so
    # that later changes to the module-level ``stop_words`` are still honoured.
    stop_set = set(stop_words)
    return [word for word in text if word not in stop_set]

# single shared Porter stemmer instance; used by stem_words
stemmer = PorterStemmer()
def stem_words(text):
    """
    Stem every token so that plural and singular forms are treated the same,
    then drop tokens of length 1.

    :param text: iterable of tokens
    :return: list of stemmed tokens longer than one character
    """
    stemmed = []
    for word in text:
        try:
            word = stemmer.stem(word)
        except IndexError:
            # the stemmer raised IndexError on the word "oed"; keep such a word
            # unstemmed instead of giving up on stemming the whole document
            pass
        if len(word) > 1: # make sure we have no 1 letter words
            stemmed.append(word)
    return stemmed

def apply_all(text):
    """
    Run the full preprocessing pipeline on one raw string:
    clean + tokenize, then drop stop words, then stem.
    """
    tokens = initial_clean(text)
    without_stops = remove_stop_words(tokens)
    stems = stem_words(without_stops)
    return stems

In [0]:
def train_lda(data):
    """
    Train an LDA topic model on the token lists in data['tokenized'].

    The number of topics comes from the module-level ``num_topics``. The model
    is trained in chunks of 300 documents and makes 2 passes over the corpus
    so the distributions can stabilize on a small dataset.

    :param data: DataFrame with a 'tokenized' column holding one list of tokens per document
    :return: tuple (dictionary, corpus, lda) - the gensim Dictionary, the
             bag-of-words corpus built from it, and the trained LdaModel
    """
    chunksize = 300
    dictionary = corpora.Dictionary(data['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in data['tokenized']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   alpha=1e-2, eta=0.5e-2, chunksize=chunksize, minimum_probability=0.0, passes=2)
    t2 = time.time()
    # report the size of the data actually trained on, not of the global df
    print("Time to train LDA model on ", len(data), "articles: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [0]:
def jensen_shannon(query, matrix):
    """
    Jensen-Shannon distance between one topic distribution and every row of a
    matrix of topic distributions.

    :param query: 1-D array-like of length K (LDA topic distribution of one document)
    :param matrix: array-like of shape (M, K), one topic distribution per document
    :return: array of length M with the distance from ``query`` to each row;
             smaller means more similar
    """
    # lets keep with the p,q notation
    p = np.asarray(query).reshape(-1, 1) # column vector (K, 1), broadcasts against q
    q = np.asarray(matrix).T # (K, M): one column per document
    m = 0.5*(p + q)
    divergence = 0.5*(entropy(p,m) + entropy(q,m))
    # round-off can push the divergence of near-identical distributions slightly
    # below zero; clamp it so sqrt never returns NaN (NaN would sort last and
    # hide the closest documents)
    return np.sqrt(np.maximum(divergence, 0.0))

In [0]:
def get_most_similar_documents(query,matrix,k=10):
    """
    Rank the rows of ``matrix`` by Jensen-Shannon distance to ``query``
    and return the positional indices of the k closest rows.
    """
    distances = jensen_shannon(query, matrix) # one distance per row of matrix
    ranked = distances.argsort() # ascending: smallest distance first
    return ranked[:k]

In [0]:
# load the raw reviews
df = pd.read_csv(csv_name)

# keep only rows whose review text is a string; this has to happen BEFORE the
# join below, because ' '.join raises TypeError on a missing (NaN) review
df = df[df['text'].map(type) == str]

# one row per business: concatenate all of its reviews into a single text
df = df.groupby(['name'])['text'].apply(' '.join).reset_index()

# shuffle the data and renumber the rows from zero
df = df.sample(frac=1.0).reset_index(drop=True)

# preprocess the text and business name and create new column "tokenized"
# (tokens of the reviews followed by the tokens of the business name)
t1 = time.time()
df['tokenized'] = df['text'].apply(apply_all) + df['name'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(df), "businesses' reviews:", (t2-t1)/60, "min")

In [0]:
# first get a list of all words
all_words = [word for tokens in df['tokenized'] for word in tokens]

# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)

# the k most frequent words, as a set for fast membership tests
# (a comprehension also works when the vocabulary is empty, where
# unpacking zip(*...) would raise ValueError)
top_k_words = {word for word, _count in fdist.most_common(k)}

def keep_top_k_words(text):
    """
    Keep only the tokens of ``text`` that are among the top k words.
    """
    return [word for word in text if word in top_k_words]

df['tokenized'] = df['tokenized'].apply(keep_top_k_words)

In [0]:
# declare the training and test sets
# NOTE(review): train_df and test_df are both reassigned from a random mask in
# the following cell before they are read, so this 80/20 split appears to be
# unused - confirm and remove one of the two splits.
train_df, test_df = train_test_split(df, test_size=0.2)

In [0]:
# draw one uniform random number per row; rows with a value below 0.9 are used for training
msk = np.random.rand(len(df)) < 0.9

# split on the mask and renumber each part from zero
train_df = df[msk].reset_index(drop=True)
test_df = df[~msk].reset_index(drop=True)

# fit the topic model on the training part
dictionary,corpus,lda = train_lda(train_df)

In [0]:
# show the test rows whose business name is exactly 'In-N-Out Burger'
test_df.loc[test_df['name'] == 'In-N-Out Burger']

In [0]:
# select an article at random from test_df
random_article_index = np.random.randint(len(test_df))

# topic distribution of the query document
# gensim returns sparse (topic_id, probability) pairs and may leave out topics
# with a negligible probability, so expand every result to a dense vector of
# length lda.num_topics to keep all distributions aligned and equally long
new_bow = dictionary.doc2bow(test_df['tokenized'].iloc[random_article_index])
new_doc_distribution = gensim.matutils.sparse2full(lda.get_document_topics(bow=new_bow), lda.num_topics)
doc_topic_dist = np.array([gensim.matutils.sparse2full(doc, lda.num_topics) for doc in lda[corpus]])

# get the ids of the most similar businesses (positions in the training corpus, closest first)
most_sim_ids = get_most_similar_documents(new_doc_distribution,doc_topic_dist)

# print the results
# iloc keeps the order of most_sim_ids, so the names are listed from most to least similar
most_similar_df = train_df.iloc[most_sim_ids]
print('Similar to "{}": \n{}'.format(test_df['name'].iloc[random_article_index], most_similar_df['name'].reset_index(drop=True)))

In [0]:
# Score the trained model with the c_v coherence measure on the training token lists
train_texts = train_df['tokenized'].tolist()
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=train_texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)