# Import libraries

In [117]:
# uncomment to install libraries
# ! pip install numpy pandas matplotlib seaborn nltk spacy regex gensim pyLDAvis
# ! python -m spacy download en_core_web_sm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import warnings

# english model from spacy that needs explaining ...
nlp = spacy.load("en_core_web_sm")

# "punkt" tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicholasmichalak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Review data

In [118]:
escape_room_reviews = pd.read_csv("data/escape_room_reviews.csv", index_col = 0)

# See it (random 5 rows)
escape_room_reviews.sample(n = 5, replace = False)

Unnamed: 0,state,city_href,room_href,review_number,review,room_title,room_description,room_address
475,california,/los-angeles,/los-angeles/quests/the-prism-escape-rooms-gro...,1,We did Ground Zero and it was awesome! Our gro...,"Escape room ""Ground Zero"" by The Prism Escape ...",Description:An unknown virus has thrown the wo...,"17151 Newhope St #108 Fountain Valley, CA 9270..."
611,california,/los-angeles,/los-angeles/quests/esc-escape-rooms-nuclear-o...,2,SO FUN!!!! Its was kinda sketchy at first then...,"Escape room ""Nuclear Option"" by ESC Escape Roo...","Description:Escape rooms, in addition to being...","11246 W Magnolia Blvd North Hollywood, CA 9160..."
2364,new york,/buffalo,/buffalo/quests/omega-escape-room-billys-night...,0,Staff was great! Great set up! We had fun!,,,
1936,california,/san-jose,/san-jose/quests/Omescape-Room-Escape-SF-KINGD...,5,So much fun! Great team building! Can't wait t...,"Escape room ""Kingdom of Cats"" by Omescape in S...",Description:Austin Schrodinger rules the Kingd...,"625 Wool Creek Drive, Suite E, San Jose, CA 95..."
3958,colorado,/denver,/denver/quests/puzzah-the-curse,0,Puzzah at Flatirons is AWFUL. Puzzles aren’t ...,,,


In [119]:
set(escape_room_reviews.room_title)

{'Escape room " Captive" by Twisted Exit in Lancaster (CA)',
 'Escape room "13th Room" by The 13th Room in Los Angeles',
 'Escape room "221B Baker Street" by Beat the Room in Sacramento',
 'Escape room "303 Coulrophobia Ln." by Escape the Place in Orange County',
 'Escape room "A New Era" by Suite 201 Escape Room in Los Angeles',
 'Escape room "A Night in the Fashion Store" by That Escape Room in San Francisco',
 'Escape room "A Winter Fall" by Bakersfield Escape Room in Bakersfield',
 'Escape room "A Wrinkle of Wormhole" by That Escape Room in San Francisco',
 'Escape room "Abandoned Basement" by The Hour Room in San Diego',
 'Escape room "Aftermath" by Method of Escape in Fresno',
 'Escape room "AgXscape" by Hilmar Cheese Company Visitor Center in Modesto',
 'Escape room "Agent" by Escape Hotel in Los Angeles',
 'Escape room "Alcatraz" by EscapeIQ in Los Angeles',
 'Escape room "Alcatraz: The Breakout" by Merlin Entertainments in San Francisco',
 'Escape room "Alice & the White Rabbi

# Review processing

## Extract reviews

In [136]:
descriptions = escape_room_reviews["room_description"].str.replace("Description:", "").unique().tolist()

## Lower case text

In [139]:
descriptions_lower = [str(text).lower() for text in descriptions]

## Sentences

In [140]:
sentences = [nltk.tokenize.sent_tokenize(text) for text in descriptions_lower]

# unpack sentences
sentences_unlist = [sentence for sub_sentences in sentences for sentence in sub_sentences]

## spacy docs/generator

In [141]:
spacy_docs = list(nlp.pipe(sentences_unlist))

## Remove stop words and words shorter than 2 characters

In [142]:
docs = [[text.lemma_ for text in doc if len(text.orth_) > 2 and not text.is_stop] for doc in spacy_docs]

# See some
for i in np.random.randint(low = 0, high = len(docs), size = 5):
    print(docs[i])
    print("\n")

['friend', 'minute', 'find', 'clue', 'solve', 'puzzle', 'escape', 'room', 'ark', 'killer', 'return']


['passing', 'legendary', 'knight', 'melkor', 'disappearance', 'sword', 'great', 'curse', 'fall', 'town']


['work', 'team', 'pass', 'series', 'magical', 'puzzle', 'order', 'escape', 'spell', 'time']


['year', '1969', 'midst', 'cold', 'war', 'united', 'states', 'soviet', 'union', 'race', 'man', 'moon', 'government', 'fund', 'covert', 'study', 'time', 'travel', 'kronos', 'initiative']


['friend', 'look', 'forward', 'year', 'unfortunately', 'stick', 'detention']




## Bi-grams

In [143]:
# bigrams
bigram = Phrases(docs, min_count = 10)

# conatiner for tokens
tokens = []

for i in range(len(docs)):
    for token in bigram[docs[i]]:
        if "_" in token:
            # add token to docs
            docs[i].append(token)
            
            # add token to cotntainer
            tokens.append(token)

# See some
set(tokens)

{'escape_room',
 'find_way',
 'go_missing',
 'good_luck',
 'serial_killer',
 'solve_mystery',
 'solve_puzzle',
 'time_run',
 'year_ago'}

## Dictionary representations

In [144]:
dictionary = Dictionary(docs)
print("Number of unique words in original documents:", len(dictionary))

dictionary.filter_extremes(no_below = 3, no_above = 0.25)
print("Number of unique words after removing rare and common words:", len(dictionary))

# random document
doc_random = np.random.randint(0, len(docs))
print("Example representation of document {}:".format(doc_random), dictionary.doc2bow(docs[doc_random]))

Number of unique words in original documents: 4778
Number of unique words after removing rare and common words: 1767
Example representation of document 1999: [(141, 1), (895, 1), (1207, 1), (1658, 1), (1659, 1)]


## Bag of Words representations

In [145]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

## LDA

In [146]:
LdaModel_fit1 = LdaModel(corpus = corpus, id2word = dictionary, num_topics = 3, chunksize = 500, passes = 3, random_state = 34685)

## LDA plot

In [147]:
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category = DeprecationWarning) 

# plot
pyLDAvis.gensim.prepare(LdaModel_fit1, corpus, dictionary, sort_topics = False)