# Import libraries

In [325]:
# uncomment to install libraries (the PyPI package for sklearn is named scikit-learn)
# ! pip install numpy pandas matplotlib seaborn nltk scikit-learn regex
# ! python -m spacy download en_core_web_sm
# nltk.download("stopwords")  # run once, after `import nltk`

import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.feature_extraction.text

# English stop words from the NLTK "stopwords" corpus
# (the corpus must already be downloaded; see the commented nltk.download call above)
english_stop_words = nltk.corpus.stopwords.words("english")

# Define functions

## Text processing
* Remove special characters
* Convert to lower case
* Strip leading and trailing white space
* Tokenize with WordPunctTokenizer
* Remove English stop words

Source: [Traditional Methods for Text Data by Dipanjan (DJ) Sarkar](https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41)

In [326]:
# process a single document's text
def process_document(document):
    """Normalize one document and return it as a space-joined string of tokens.

    Steps, in order: drop every character that is not an ASCII letter, digit,
    underscore or ASCII whitespace; lower-case the text; strip leading and
    trailing whitespace; tokenize with NLTK's WordPunctTokenizer; drop tokens
    that appear in ``english_stop_words``.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # remove special characters
    # (raw string: "\s" inside a plain string literal is an invalid escape sequence)
    document_processed = re.sub(pattern = r"[^a-zA-Z0-9_\s]", repl = "", string = document, flags = re.IGNORECASE|re.ASCII)

    # lower case
    document_processed = document_processed.lower()

    # remove leading and trailing white space
    document_processed = document_processed.strip()

    # tokenize document
    tokens = nltk.WordPunctTokenizer().tokenize(document_processed)

    # filter stopwords out of document (set membership instead of scanning the list per token)
    # NOTE(review): apostrophes were stripped above, so a stop word spelled with an
    # apostrophe (e.g. "don't") can no longer match a token here - confirm this is intended
    stop_words = set(english_stop_words)
    tokens_no_stop_words = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens and return
    return " ".join(tokens_no_stop_words)

# vectorize process_document for processing entire corpus of documents
# otypes is set explicitly: without it numpy infers the output type by calling the
# function on the first element, which raises ValueError when the corpus is empty
process_corpus = np.vectorize(process_document, otypes = [str])

# Review data

In [327]:
# Load the reviews; index_col = 0 uses the first CSV column as the row index
escape_room_reviews = pd.read_csv("data/escape_room_reviews.csv", index_col = 0)

# See it (a bare frame renders truncated - first and last rows - not a random sample)
escape_room_reviews

Unnamed: 0,state,city_href,room_href,review_number,review,room_title,room_description,room_address
0,california,/bakersfield,/bakersfield/quests/Bakersfield-Escape-Room-RE...,0,We loved it... it was hard but we finished wit...,"Escape room ""R.E.S.C.U.E."" by Bakersfield Esca...",Description:A young girl has gone missing and ...,"3616 Coffee Rd, Suite C Bakersfield, CA 93308 ..."
1,california,/bakersfield,/bakersfield/quests/Bakersfield-Escape-Room-RE...,1,My friends and I (4 total) went for one of our...,"Escape room ""R.E.S.C.U.E."" by Bakersfield Esca...",Description:A young girl has gone missing and ...,"3616 Coffee Rd, Suite C Bakersfield, CA 93308 ..."
2,california,/bakersfield,/bakersfield/quests/Bakersfield-Escape-Room-RE...,2,The challenge was awesome. Looking forward to ...,"Escape room ""R.E.S.C.U.E."" by Bakersfield Esca...",Description:A young girl has gone missing and ...,"3616 Coffee Rd, Suite C Bakersfield, CA 93308 ..."
3,california,/bakersfield,/bakersfield/quests/Bakersfield-Escape-Room-RE...,3,We had a wonderful night. So close 5 more minu...,"Escape room ""R.E.S.C.U.E."" by Bakersfield Esca...",Description:A young girl has gone missing and ...,"3616 Coffee Rd, Suite C Bakersfield, CA 93308 ..."
4,california,/bakersfield,/bakersfield/quests/Bakersfield-Escape-Room-RE...,4,We had an absolute blast. We did RESCUE which ...,"Escape room ""R.E.S.C.U.E."" by Bakersfield Esca...",Description:A young girl has gone missing and ...,"3616 Coffee Rd, Suite C Bakersfield, CA 93308 ..."
...,...,...,...,...,...,...,...,...
4134,colorado,/steamboat-springs,/steamboat-springs/quests/the-crooked-key-flas...,2,We did the flashover! game and it was the best...,,,
4135,colorado,/steamboat-springs,/steamboat-springs/quests/the-crooked-key-flas...,3,Had a great time in the fire rescue themed roo...,,,
4136,colorado,/steamboat-springs,/steamboat-springs/quests/the-crooked-key-flas...,4,Our group did the Flashover game - for most of...,,,
4137,colorado,/steamboat-springs,/steamboat-springs/quests/the-crooked-key-flas...,5,We had the absolute best tome! We booked quite...,,,


# Review processing

## Extract reviews

## Process corpus of reviews

### Example process

In [335]:
# Put raw and processed review text side by side, then peek at 5 rows drawn without replacement
escape_room_reviews[["review"]].assign(
    review_processed = lambda frame: process_corpus(frame["review"])
).sample(n = 5, replace = False)

Unnamed: 0,review,review_processed
2847,"WAY different from Pandora's Box, non-linear, ...",way different pandoras box nonlinear follows s...
604,So happy to have a great escape room right her...,happy great escape room right scv group played...
2939,Some of the puzzles were fairly interesting an...,puzzles fairly interesting fun concepts like c...
304,We have done about a dozen escape rooms (from ...,done dozen escape rooms 60out well couple comp...
2749,We had our 10-year-old’s birthday party there ...,10yearolds birthday party blast harder thought...


### Full process

In [336]:
# clean every review; the result is an array of processed strings, one per row of escape_room_reviews
reviews_processed = process_corpus(escape_room_reviews["review"])

## Bag of Words Matrix
Vectorize documents

In [345]:
# min_df / max_df: when building the vocabulary, ignore terms whose document frequency is
# strictly lower than min_df or strictly higher than max_df (corpus-specific stop words).
# A float is a proportion of documents, an integer is an absolute count.
# Both parameters are ignored if an explicit vocabulary is passed.
# min_df = 0.0 and max_df = 1.0 therefore keep every term.
# The fitted vectorizer is kept in its own variable so the matrix columns can be mapped back to terms.
bag_of_words_vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df = 0.0, max_df = 1.0)
bag_of_words_matrix = bag_of_words_vectorizer.fit_transform(reviews_processed)

In [349]:
# convert the document-term matrix to a dense array for inspection
# NOTE(review): this materializes every cell (documents x vocabulary); slice rows first if memory is tight
bag_of_words_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Bi-grams

In [143]:
# Fit a bigram phrase model on the tokenized documents
# NOTE(review): Phrases and docs are not defined in the visible cells - presumably
# gensim.models.Phrases and a list of token lists; confirm before Restart & Run All
bigram = Phrases(docs, min_count = 10)

# container for every bigram token that was found
tokens = []

for doc in docs:
    # bigram tokens are the ones joined with "_"
    bigram_tokens = [token for token in bigram[doc] if "_" in token]

    # append them to the document itself ...
    doc.extend(bigram_tokens)

    # ... and to the overall container
    tokens.extend(bigram_tokens)

# See the distinct bigrams
set(tokens)

{'escape_room',
 'find_way',
 'go_missing',
 'good_luck',
 'serial_killer',
 'solve_mystery',
 'solve_puzzle',
 'time_run',
 'year_ago'}

## Dictionary representations

In [144]:
# Map every unique token in docs to an integer id
# NOTE(review): Dictionary is not imported in the visible cells - presumably gensim.corpora.Dictionary; confirm
dictionary = Dictionary(docs)
print("Number of unique words in original documents:", len(dictionary))

# Prune the vocabulary in place: no_below = 3 and no_above = 0.25 are meant to drop
# tokens that are too rare or too common (TODO confirm exact thresholds against the gensim docs)
dictionary.filter_extremes(no_below = 3, no_above = 0.25)
print("Number of unique words after removing rare and common words:", len(dictionary))

# random document
# (no seed is set, so a different document is picked on every run)
doc_random = np.random.randint(0, len(docs))
print("Example representation of document {}:".format(doc_random), dictionary.doc2bow(docs[doc_random]))

Number of unique words in original documents: 4778
Number of unique words after removing rare and common words: 1767
Example representation of document 1999: [(141, 1), (895, 1), (1207, 1), (1658, 1), (1659, 1)]


## Bag of Words representations

In [145]:
# Convert every document to its bag-of-words form using the dictionary
corpus = list(map(dictionary.doc2bow, docs))

## LDA

In [149]:
# Fit a 5-topic LDA model; random_state is fixed so the fit is repeatable
LdaModel_fit1 = LdaModel(
    corpus = corpus,
    id2word = dictionary,
    num_topics = 5,
    chunksize = 500,
    passes = 3,
    random_state = 34685,
)

In [None]:
# Inspect the fitted topics (the original cell ended in a dangling "." and raised SyntaxError)
LdaModel_fit1.print_topics(num_topics = 5, num_words = 10)

## LDA plot

In [150]:
# render pyLDAvis output inline in the notebook
# NOTE(review): pyLDAvis is not imported in the visible cells - confirm it is imported before this cell runs
pyLDAvis.enable_notebook()
# silence DeprecationWarning messages from here on
warnings.filterwarnings("ignore", category = DeprecationWarning) 

# plot
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models - confirm the installed version
pyLDAvis.gensim.prepare(LdaModel_fit1, corpus, dictionary, sort_topics = False)