# Import libraries

In [106]:
# uncomment to install libraries
# ! pip install numpy pandas matplotlib seaborn nltk spacy regex gensim pyLDAvis
# ! python -m spacy download en_core_web_sm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import warnings

# english model from spacy that needs explaining ...
nlp = spacy.load("en_core_web_sm")

# "punkt" tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicholasmichalak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Review data

In [86]:
escape_room_reviews = pd.read_csv("data/escape_room_reviews.csv", index_col = 0)

# See it (random 5 rows)
escape_room_reviews.sample(n = 5, replace = False)

Unnamed: 0,state,city_href,room_href,review_number,review,room_title,room_description,room_address
495,arizona,/phoenix,/phoenix/quests/escape-house-mesa-flood-the-city,2,Thank you Ryan for making our experience a gre...,"Escape room ""Flood The City"" by Escape House M...",Description:Terrorist calling himself Marcin i...,"3460 E Southern Ave #110, Mesa, AZ 85204 (Show..."
427,arizona,/phoenix,/phoenix/quests/epic-escape-game-rogue-agent,0,It was a lot of fun!!! There were definitely s...,"Escape room ""Rogue Agent"" by Epic Escape Game ...",Description:The President and other world lead...,"106 N. Central Avenue, Phoenix, AZ 85004 (Show..."
19,alabama,/auburn,/auburn/quests/auburn-escape-zones-black-beard...,3,This was such a fun surprise for my husband's ...,"Escape room ""Black Beard's Brig"" by Auburn Esc...",Description:Ahoy Matey! Your crew has been cap...,"1234 Commerce Dr Auburn, AL 36830 (Show on map)"
86,alabama,/gadsden,/gadsden/quests/beat60-the-darkness,4,"We had a great time ""beating 60"" in the serial...","Escape room ""The Darkness"" by beat60 in Gadsden",Description:You and your team awake in a dimly...,"227 Broad Street Gadsden, AL 35901 (Show on map)"
274,arizona,/phoenix,/phoenix/quests/escape-games-az-blaines-basement,1,I had a BLAST! For my first escape room - Blai...,"Escape room ""Blaine's Basement"" by Escape Game...",Description:The FBI was able to apprehend a su...,"12 N. Center Street, Suite 200 Mesa, AZ 85201 ..."


In [116]:
set(escape_room_reviews.room_title)

{'Escape room " Bates Motel" by Gulf Coast Escape Room in Orange Beach',
 'Escape room "1959 - Jack\'s Place" by The Escapeopolis Project in Fayetteville (AR)',
 'Escape room "1969 - Insurgent Attack" by The Escapeopolis Project in Fayetteville (AR)',
 'Escape room "20,000 Leagues Under the Sea" by Mystery Escape Room Tucson in Tucson',
 'Escape room "51" by Escape Rooms Mesa in Phoenix',
 'Escape room "A Clinical Trial" by A Narrow Escape in Hot Springs',
 'Escape room "A Diamond Dilemma" by Escape Room 104 in Pine Bluff',
 'Escape room "A Tale of Two Forts" by XIT Escape Room in Orange Beach',
 'Escape room "Abducted" by Escape Tuscaloosa in Tuscaloosa',
 'Escape room "Abstrusus Museum" by The Experience Escape Rooms in Centerton',
 'Escape room "Al Capone" by The Experience Escape Rooms in Centerton',
 'Escape room "Alcatraz" by Hourglass Escape Rooms in Russellville',
 'Escape room "Alcatraz" by Hourglass Escape Rooms in Searcy',
 'Escape room "Alien" by Hourglass Escape Rooms in R

# Review processing

## Extract reviews

In [87]:
reviews = escape_room_reviews["review"].tolist()

## Lower case text

In [88]:
reviews_lower = [text.lower() for text in reviews]

## Sentences

In [89]:
sentences = [nltk.tokenize.sent_tokenize(text) for text in reviews_lower]

# unpack sentences
sentences_unlist = [sentence for sub_sentences in sentences for sentence in sub_sentences]

## spacy docs/generator

In [93]:
spacy_docs = list(nlp.pipe(sentences_unlist))

## Remove stop words and words shorter than 2 characters

In [101]:
docs = [[text.lemma_ for text in doc if len(text.orth_) > 2 and not text.is_stop] for doc in spacy_docs]

# See some
for i in np.random.randint(low = 0, high = len(docs), size = 5):
    print(docs[i])
    print("\n")

['escape', 'room', 'challenge', 'friend', 'join', 'previous', 'experience']


['blast', 'control', 'master', 'tony', 'awesome']


['explain']


['game', 'master', 'janey', 'fun', 'professional']


['staff', 'try', 'enjoyable']




## Bi-grams

In [110]:
# bigrams
bigram = Phrases(docs, min_count = 10)

# conatiner for tokens
tokens = []

for i in range(len(docs)):
    for token in bigram[docs[i]]:
        if "_" in token:
            # add token to docs
            docs[i].append(token)
            
            # add token to cotntainer
            tokens.append(token)

# See some
set(tokens)

{'absolute_blast',
 'absolute_blast_absolute_blast',
 'birthday_party',
 'birthday_party_birthday_party',
 'customer_service',
 'customer_service_customer_service',
 'date_night',
 'date_night_date_night',
 'feel_like',
 'feel_like_feel_like',
 'friendly_helpful',
 'friendly_helpful_friendly_helpful',
 'game_master',
 'game_master_game_master',
 'highly_recommend',
 'highly_recommend_highly_recommend',
 'look_forward',
 'look_forward_look_forward',
 'mad_hatter',
 'mad_hatter_mad_hatter',
 'minute_spare',
 'minute_spare_minute_spare',
 'second_leave',
 'second_leave_second_leave',
 'second_spare',
 'second_spare_second_spare',
 'staff_friendly',
 'staff_friendly_staff_friendly',
 'team_building',
 'team_building_team_building',
 'wait_try',
 'wait_try_wait_try',
 'year_old',
 'year_old_year_old'}

## Dictionary representations

In [112]:
dictionary = Dictionary(docs)
print("Number of unique words in original documents:", len(dictionary))

dictionary.filter_extremes(no_below = 3, no_above = 0.25)
print("Number of unique words after removing rare and common words:", len(dictionary))

# random document
doc_random = np.random.randint(0, len(docs))
print("Example representation of document {}:".format(doc_random), dictionary.doc2bow(docs[doc_random]))

Number of unique words in original documents: 2666
Number of unique words after removing rare and common words: 1016
Example representation of document 4256: [(138, 1), (242, 1), (249, 1)]


## Bag of Words representations

In [113]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

## LDA

In [114]:
LdaModel_fit1 = LdaModel(corpus = corpus, id2word = dictionary, num_topics = 3, chunksize = 500, passes = 3, random_state = 34685)

## LDA plot

In [115]:
pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category = DeprecationWarning) 

# plot
pyLDAvis.gensim.prepare(LdaModel_fit1, corpus, dictionary, sort_topics = False)