In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from modeling import *

# Notebook-wide pandas display settings: cap printed rows at 50, but never
# truncate the number of columns or the width of a column's contents.
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", None),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Read the cleaned review dataset (path is relative to the working directory).
df = pd.read_csv(
    filepath_or_buffer="Reviews_cleaned_for_NLP.csv",
)

In [3]:
# List the available columns; later cells read 'review_text' and 'review_lemma'.
df.columns

Index(['attraction_name', 'attraction_id', 'user_name', 'user_profile_link',
       'review_date', 'helpful_votes', 'rating', 'review_link', 'review_text',
       'review_title', 'experience_date', 'review_clean', 'review_temp',
       'review_lemma'],
      dtype='object')

## Vectorizer & Topic Modeling

The cleaned corpus is used to create a bag-of-words representation weighted with TF-IDF.
This matrix is then used for topic modeling.

In [5]:
# TODO: extend the built-in English stop words with corpus-specific ones:
# stop_words = ENGLISH_STOP_WORDS.union(['yosemite'])

# Build the TF-IDF document-term matrix from the lemmatized reviews.
# A float min_df is a document-frequency proportion, so terms that occur in
# fewer than 0.01% of the reviews are excluded from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english', min_df = 0.0001)
review_word_matrix = tfidf.fit_transform(df['review_lemma'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when available and convert its ndarray to a
# plain list so `review_vocab` keeps the type the old method returned.
if hasattr(tfidf, "get_feature_names_out"):
    review_vocab = tfidf.get_feature_names_out().tolist()
else:
    review_vocab = tfidf.get_feature_names()


In [20]:
# Example of a single NMF run with 10 topics (superseded by the loop below):
# nmf_model, err, _, _ = nmf_topic_modeling(review_word_matrix, review_vocab, 10)

A loop is used to fit models with 1 to 30 topics, using both NMF and LSA (Truncated SVD).

In [8]:
# Sweep NMF over 1..30 topics and keep everything nmf_topic_modeling returns.
# Runs are appended in order, so the k-topic run is stored at list index k - 1.
nmf_dict = dict(nmf=[], error=[], topic_matrix=[], word_matrix=[])

for i in range(30):
    n_components = i + 1
    nmf, err, topic_matrix, word_matrix = nmf_topic_modeling(
        review_word_matrix, review_vocab, n_components
    )

    # Carry the raw and lemmatized review text alongside the topic weights
    # (presumably one row per review -- confirm against modeling.py).
    topic_matrix[['raw_review','review_cleanned']] = df[['review_text','review_lemma']]

    for result_key, result_value in (
        ('nmf', nmf),
        ('error', err),
        ('topic_matrix', topic_matrix),
        ('word_matrix', word_matrix),
    ):
        nmf_dict[result_key].append(result_value)

    print(f"Progress Update On NMF Components: {i+1} currently.")

In [9]:
# Sweep LSA over 1..30 topics and keep everything lsa_topic_modeling returns.
# Runs are appended in order, so the k-topic run is stored at list index k - 1.
lsa_dict = dict(lsa=[], error=[], topic_matrix=[], word_matrix=[])

for i in range(30):
    n_components = i + 1
    lsa, err, topic_matrix, word_matrix = lsa_topic_modeling(
        review_word_matrix, review_vocab, n_components
    )

    # Carry the raw and lemmatized review text alongside the topic weights
    # (presumably one row per review -- confirm against modeling.py).
    topic_matrix[['raw_review','review_cleanned']] = df[['review_text','review_lemma']]

    for result_key, result_value in (
        ('lsa', lsa),
        ('error', err),
        ('topic_matrix', topic_matrix),
        ('word_matrix', word_matrix),
    ):
        lsa_dict[result_key].append(result_value)

    print(f"Progress Update On LSA Components:  {i+1} currently.")

We can now explore the topics and see whether anything meaningful shows up.
To start with, we will look at just the top 12 words under each topic when the corpus is reduced to a 10-topic space, for NMF & LSA.

In [15]:
# Choose which fitted NMF run to inspect and how many words to list per topic.
n_topics, n_words = 10, 12

# Runs were stored in order starting from 1 topic, so the run with
# `n_topics` topics sits at list position n_topics - 1.
model_idx = n_topics - 1

top_words_for_all_topics(nmf_dict['word_matrix'][model_idx], n_topics, n_words)

top_reviews(nmf_dict['topic_matrix'][model_idx], 'topic_1', 5)

Topic 0
trail, mist, vernal, step, nevada, muir, john, fall, wet, way, 

Topic 1
point, glacier, bus, parking, drive, road, tour, view, mile, sentinel, 

Topic 2
fall, low, upper, water, yosemite, spring, flow, base, vernal, dry, 

Topic 3
drive, pass, road, tioga, lake, stop, open, snow, meadow, scenery, 

Topic 4
hike, strenuous, mile, easy, worth, steep, great, short, hour, falls, 

Topic 5
park, yosemite, visit, national, place, day, waterfall, good, beautiful, valley, 

Topic 6
dome, half, valley, el, capitan, floor, view, yosemite, climber, look, 

Topic 7
walk, easy, short, low, path, waterfall, area, base, falls, nice, 

Topic 8
water, lot, time, make, people, early, just, sure, beautiful, climb, 

Topic 9
view, worth, amazing, drive, great, spectacular, place, long, valley, good, 



array(["we saw this fall when we went to the Glacier point, it's already very beautiful when we saw in the in Glacier point, you can also see it at the Washburn point (which is a spot on the Glacier point road), there is a trail leading to the falls, you may do it if you have time.",
       "We spent the day hiking around Glacier Point.  The Point itself is the one place up the Glacier Point road that you don't have to do a long hike to get to.  It is a relatively short walk to the Point and to the reward of a fabulous view.   Amazingly it was not crowded at around 10 am on a Monday.  There are a number of vantage points to enjoy the view.  Definitely take the time to get a different view of Yosemite and the valley.  There are bathrooms in the parking lot.You can hike to Sentinel Dome from Glacier point.  We chose to drive down to the Taft Point / Sentinel Dome Trail head to hike to Sentinel Point and then Taft Point.  It is a but longer of a hike to Sentinel Dome from the Taft Point t

**TODO: Remove 'yosemite' as a stop word, and perhaps group all proper nouns together.
Or should the proper nouns be dropped entirely?**

Look at the different parts of speech (POS) and consider keeping only some of them when lemmatizing.