# **Importing Packages**

In [None]:
pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |▏                               | 10kB 22.7MB/s eta 0:00:01[K     |▍                               | 20kB 22.0MB/s eta 0:00:01[K     |▋                               | 30kB 17.4MB/s eta 0:00:01[K     |▉                               | 40kB 15.6MB/s eta 0:00:01[K     |█                               | 51kB 12.4MB/s eta 0:00:01[K     |█▏                              | 61kB 12.6MB/s eta 0:00:01[K     |█▍                              | 71kB 12.3MB/s eta 0:00:01[K     |█▋                              | 81kB 12.4MB/s eta 0:00:01[K     |█▉                              | 92kB 11.9MB/s eta 0:00:01[K     |██                              | 102kB 12.9MB/s eta 0:00:01[K     |██▎                             | 112kB 12.9MB/s eta 0:00:01[K     |██▍                             | 122kB 12.9MB/s eta

In [None]:
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
from gensim.utils import simple_preprocess

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
#import pyLDAvis
#import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# **Load the Reviews Corpus**

In [None]:
# Load the reviews corpus, normalise the column names and keep one row per location
data = (
    pd.read_csv(r"/content/drive/MyDrive/Capstone/locations_reviews_summary#3.csv", encoding='iso-8859-1')
    .rename(columns={'Place Name': 'location_name', 'summary': 'reviews'})
    .drop_duplicates('location_name')
)
data.head()

Unnamed: 0,location_name,reviews
0,Visvesvaraya Industrial and Technological Museum,You can expect great experience for adults and...
1,Madhya Pradesh Tribal Museum,A must visit when you are in Bhopal Extremely ...
2,Jantar Mantar - Jaipur,The local guide was very fluent in English and...
3,Tea Gardens,They provide dress also on rent for photos Mun...
4,Indira Gandhi Tulip Garden,Indira Gandhi tulip garden is located in Srina...


In [None]:
# Spot-check a run of rows; `.loc` slices by index label and includes both endpoints
data.loc[345:355]

Unnamed: 0,location_name,reviews
345,Second Hooghly Bridge,One can take a better view of this place from ...
346,Church of our Lady of Velankanni,The Velankanni Church was originally built by ...
347,Mount Mary Basilica,"With reagrds , Youngest_traveller_ Â The Bas..."
348,Vallarpadam Church,Nice to view from outside and you may respectf...
349,St. Aloysius Chapel,St. Aloysius Chapel Mangalore is very beautifu...
350,St. Alex Church,located in north goa in arpora calangute built...
351,Church of St. Francis of Assissi,This majestic Church was built by the Portugue...
352,St. Mary's Basilica,But when i visited the church in India i felt ...
353,Elefantastic,I researched many elephant experiences in Indi...
354,Elefanjoy,We met Ankit and learned about Elefanjoy on ou...


In [None]:
# Look up a single location by its exact name
data[data.location_name == 'EleSafari']

Unnamed: 0,location_name,reviews
358,EleSafari,A fun filled day with the elephants at Elesafa...


# **Text cleaning and Tokenization**

In [None]:
def default_clean(text):
    '''
    Replaces punctuation/symbol characters with spaces, strips digits and
    lower-cases the text.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Anything `pd.isnull` reports as null (None, NaN)
        is accepted.

    Returns
    -------
    str
        The cleaned, lower-cased text; an empty string for missing input.
    '''
    # Missing reviews used to reach the replacement loop with `bad_chars`
    # never assigned (UnboundLocalError); treat them as empty text instead.
    if pd.isnull(text):
        return ""

    # The literal two-character sequence backslash + "n" is replaced first so
    # the result no longer depends on set iteration order (a lone backslash is
    # also in the list below and could otherwise win, leaving a stray "n").
    text = text.replace('\\n', " ")

    bad_chars = ["@", "+", '/', "'", '"', '\\', '(', ')', '?', '#', ',', '.', '[', ']',
                 '%', '$', '&', ';', '!', ':', "*", "_", "=", "}", "{"]
    text = text.translate(str.maketrans({char: " " for char in bad_chars}))

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'\d+', "", text)
    return text.lower()

In [None]:
# Derive a cleaned copy; the raw `data` frame is left untouched
df = data.assign(reviews=data['reviews'].apply(default_clean))

In [None]:
# Cleaned review text of the row labelled 0
df.loc[0, 'reviews']

'you can expect great experience for adults and kids in equal measure at this place  so many things to learn and understand good to start a science spark for kids  good for one full day if kids in school are going  this place can be consider has knowledge oriented place which gives more information on the science and this place is good for kids and grownups  from the very basic of physical objects to more complex versions  this museum lodges some of the most significant scientific discoveries explained in simple easy to understand ways  very good museum for school going students with nice introduction to science school going students can learn the science effectively especially physics and astro physics multiple science related objects and to understand the complete details mentioned there  it also helps to improve our knowledge on other things the museum is truly designed for all age groups with different education backgrounds  its specially recommended for students of all age groups 

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of word tokens.

    Every item of ``sentences`` is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``; ``deacc=True`` strips accent marks
    from the tokens.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(document), deacc=True) for document in sentences)

# Tokenize every cleaned review: `data_words` holds one token list per document
data_words = list(sent_to_words(df['reviews']))

# Preview the tokens of the first document only
print(data_words[:1])

[['you', 'can', 'expect', 'great', 'experience', 'for', 'adults', 'and', 'kids', 'in', 'equal', 'measure', 'at', 'this', 'place', 'so', 'many', 'things', 'to', 'learn', 'and', 'understand', 'good', 'to', 'start', 'science', 'spark', 'for', 'kids', 'good', 'for', 'one', 'full', 'day', 'if', 'kids', 'in', 'school', 'are', 'going', 'this', 'place', 'can', 'be', 'consider', 'has', 'knowledge', 'oriented', 'place', 'which', 'gives', 'more', 'information', 'on', 'the', 'science', 'and', 'this', 'place', 'is', 'good', 'for', 'kids', 'and', 'grownups', 'from', 'the', 'very', 'basic', 'of', 'physical', 'objects', 'to', 'more', 'complex', 'versions', 'this', 'museum', 'lodges', 'some', 'of', 'the', 'most', 'significant', 'scientific', 'discoveries', 'explained', 'in', 'simple', 'easy', 'to', 'understand', 'ways', 'very', 'good', 'museum', 'for', 'school', 'going', 'students', 'with', 'nice', 'introduction', 'to', 'science', 'school', 'going', 'students', 'can', 'learn', 'the', 'science', 'effect

In [None]:
# NLTK Stop words
import nltk
# Downloads NLTK's whole 'popular' collection (several corpora), although only
# the English stopword list is read in this cell.
nltk.download('popular')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Extra words treated as stopwords for this review corpus
new_stopwords = ['place','visit','good','must','also','know','walk','nice','time','great','take','still','would','like','stay','view','start','make','look','work']
stop_words.extend(new_stopwords)
# De-duplicate; the result is still a list, so `in` checks against it scan linearly
new_stopwords_list = list(set(stop_words))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [None]:
# Build the bigram and trigram models
# NOTE(review): both models are fitted on `data_words`, i.e. on tokens that still
# contain stopwords, while later cells apply them to stopword-filtered tokens.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is fitted on the output of the bigram model
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example (first document, bigram pass followed by trigram pass)
print('\n',trigram_mod[bigram_mod[data_words[0]]])




 ['you', 'can', 'expect', 'great', 'experience', 'for', 'adults', 'and', 'kids', 'in', 'equal', 'measure', 'at', 'this', 'place', 'so', 'many', 'things', 'to', 'learn', 'and', 'understand', 'good', 'to', 'start', 'science', 'spark', 'for', 'kids', 'good', 'for', 'one', 'full', 'day', 'if', 'kids', 'in', 'school', 'are', 'going', 'this', 'place', 'can', 'be', 'consider', 'has', 'knowledge', 'oriented', 'place', 'which', 'gives', 'more', 'information', 'on', 'the', 'science', 'and', 'this', 'place', 'is', 'good', 'for', 'kids', 'and', 'grownups', 'from', 'the', 'very', 'basic', 'of', 'physical', 'objects', 'to', 'more', 'complex', 'versions', 'this', 'museum', 'lodges', 'some', 'of', 'the', 'most', 'significant', 'scientific', 'discoveries', 'explained', 'in', 'simple', 'easy', 'to', 'understand', 'ways', 'very', 'good', 'museum', 'for', 'school', 'going', 'students', 'with', 'nice', 'introduction', 'to', 'science', 'school', 'going', 'students', 'can', 'learn', 'the', 'science', 'effec

In [None]:
# Debug check; always reports `list` because the value is wrapped in list() first
type(list(new_stopwords_list))

list

In [None]:
# Function for stopwords
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in the stopword list.

    Parameters
    ----------
    texts : iterable
        Documents; each one is converted with ``str()`` and tokenized again
        with ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One filtered token list per input document.
    """
    # Build the lookup once: a set gives constant-time membership tests instead
    # of scanning the `new_stopwords_list` list for every single token.
    stopword_lookup = set(new_stopwords_list)
    return [[word for word in simple_preprocess(str(doc)) if word not in stopword_lookup] for doc in texts]

# Function for bigrams
def make_bigrams(texts):
    """Apply the fitted bigram phraser `bigram_mod` to every document in `texts`."""
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

# Function for trigrams
def make_trigrams(texts):
    """Run every document through `bigram_mod` and then through `trigram_mod`."""
    def _phrase(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_phrase, texts))

# **Lemmatization**

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize token lists with the module-level spaCy pipeline `nlp`.

    Each token list is joined into one string and parsed. Tokens are kept when
    their coarse POS tag is in `allowed_postags` and their text contains no
    underscore; a '-PRON-' lemma is replaced by an empty string. Lemmas found
    in `new_stopwords_list` are dropped, and the rest are joined with spaces.

    Returns a list with one space-joined string per input document.
    POS tag reference: https://spacy.io/api/annotation
    """
    def _lemmas(parsed):
        for token in parsed:
            if token.pos_ not in allowed_postags or '_' in str(token):
                continue
            yield '' if token.lemma_ == '-PRON-' else token.lemma_

    texts_out = []
    for sent in texts:
        parsed = nlp(" ".join(sent))
        kept = [lemma for lemma in _lemmas(parsed) if lemma not in new_stopwords_list]
        texts_out.append(" ".join(kept))
    return texts_out


In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatization(text, lemmer = WordNetLemmatizer(), allowed_postags=None):
    '''
    Lemmatizes tokens with WordNet and drops short tokens.

    NOTE(review): this definition reuses the name of the spaCy-based
    `lemmatization` defined earlier in this notebook and replaces it when the
    cells run top to bottom. The notebook's call sites pass `allowed_postags`,
    so that keyword is accepted here to keep them from raising TypeError.
    Confirm which of the two implementations the pipeline is meant to use.

    Parameters
    ----------
    text : iterable of list of str
        One token list per document.
    lemmer : object with a `lemmatize(word)` method
        Defaults to a WordNetLemmatizer created once at definition time.
    allowed_postags : ignored
        Accepted only for call compatibility; no POS filtering happens here.

    Returns
    -------
    list of list of str
        Per document: tokens containing '_' (n-grams) unchanged, tokens longer
        than 3 characters lemmatized, all other tokens dropped. No stopword
        removal is performed.
    '''
    text_out = []
    for word_list in text:
        text_lemmatized = []
        for word in word_list:
            if '_' in word:
                # n-gram produced by the phrase models: keep verbatim
                text_lemmatized.append(word)
            elif len(word) > 3:
                text_lemmatized.append(lemmer.lemmatize(word))
        text_out.append(text_lemmatized)

    return text_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams (make_trigrams applies bigram_mod again before trigram_mod)
data_words_trigrams = make_trigrams(data_words_bigrams)

# Initialize spacy 'en' model with the parser and NER components disabled (for efficiency)
# Run in terminal: python3 -m spacy download en
# NOTE(review): the 'en' shortcut name is presumably unavailable in spaCy v3+,
# where models are loaded by full name (e.g. 'en_core_web_sm') - confirm the spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
# NOTE(review): `lemmatization` is defined twice above (spaCy-based, then
# WordNet-based); make sure the intended one is the live definition before running this.
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
#data_lemmatized = lemmatization(data_words_trigrams)

# Preview the first two lemmatized documents
print(data_lemmatized[:2])

['expect experience adult kid equal measure many thing learn understand science spark kid full day kid school go consider knowledge orient give information science kid grownup basic physical object complex version museum lodge significant scientific discovery explain simple easy school go student introduction science school go student learn science effectively especially physics multiple science relate object understand complete detail mention help improve knowledge thing museum truly design different education background specially recommend student trigger scientific temperament rolling machine contraption installation spread floor big attraction technological museum vitm open use museum explain science several everyday use thing pulley lock ball pen exhibit button help kid high especially kid beneficial student people interested science technology museum well maintain exhibit condition school child inculcate interest science young age old family kid amaze child bring curiosity fun sc

# **Document-Word Matrix**

In [None]:
# Keep the lemmatizer output as returned, before it is flattened to strings
data_lemmatized_copy = data_lemmatized.copy()

# CountVectorizer needs one string per document. Token lists are joined, but
# documents that are already strings are left alone: " ".join("abc") would put
# a space between every character. The guard also makes re-running this cell harmless.
data_lemmatized = [doc if isinstance(doc, str) else " ".join(doc) for doc in data_lemmatized]
data_lemmatized[:5]

In [None]:
# Number of documents that will be vectorized
len(data_lemmatized)

In [None]:
# Bag-of-words document-term matrix over the lemmatized reviews
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # keep only terms that occur in at least 10 documents
                             stop_words='english',             # remove scikit-learn's built-in English stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{4,}'  # tokens are runs of 4+ alphanumeric characters
                             # max_features=50000,             # max number of uniq words
                            )

# Learn the vocabulary and build the sparse count matrix (documents x terms)
data_vectorized = vectorizer.fit_transform(data_lemmatized)
data_vectorized.shape

(435, 5181)

# Checking the Data Sparsity

In [None]:
# Convert the sparse document-term matrix to a dense matrix
data_dense = data_vectorized.todense()

# "Sparsicity" as reported here is the percentage of cells holding a non-zero count
nonzero_cells = (data_dense > 0).sum()
print("Sparsicity: ", (nonzero_cells/data_dense.size)*100, "%")

Sparsicity:  14.049389125163339 %


# **LDA Model**

In [None]:
# Build LDA Model
# Nine topics, matching the nine hand-written topic labels used further down.
lda_model = LatentDirichletAllocation(n_components=9,               # Number of topics
                                      max_iter=500,                  # Max learning iterations
                                      learning_method='online',      # mini-batch (online) updates
                                      learning_decay=0.7,   
                                      random_state=100,              # Random state
                                      batch_size=20,                # n docs in each learning iter
                                      evaluate_every = -1,           # compute perplexity every n iters, default: Don't
                                      n_jobs = -1                    # Use all available CPUs
                                     )
# Fit on the count matrix and keep the per-document topic distribution
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=20, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=500,
                          mean_change_tol=0.001, n_components=9, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [None]:
# Log Likelihood: higher is better (evaluated on the training matrix itself)
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -9768135.240343763
Perplexity:  791.3076361586329
{'batch_size': 20, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 500, 'mean_change_tol': 0.001, 'n_components': 9, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 100, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [None]:
import pickle

# Persist the fitted LDA model and the count vectorizer side by side
for pkl_path, fitted_obj in (
    ('/content/drive/MyDrive/Colab Notebooks/optimal_LDA_Model.pkl', lda_model),
    ('/content/drive/MyDrive/Colab Notebooks/optimal_vectorizer.pkl', vectorizer),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

In [None]:
# Define Search Param
# 6 topic counts x 2 decay values x 2 iteration caps = 24 parameter combinations
search_params = {'n_components': [5, 6, 7, 8, 9, 10], 'learning_decay': [.7, .9],
                 'max_iter':[50,100]}

# Init the Model
lda = LatentDirichletAllocation(learning_method='online',
                                  random_state=100,              # Random state
                                  batch_size=128,                # n docs in each learning iter
                                  evaluate_every = -1,           # compute perplexity every n iters, default: Don't
                                  n_jobs = -1                    # Use all available CPUs
                                  )

# Init Grid Search Class
# No `scoring` is passed, so candidates are ranked by the estimator's own score().
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

GridSearchCV(cv=None, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=-1,
                                                 perp_tol=0.1, random_state=100,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                              

In [None]:
# Best Model (the estimator GridSearchCV kept for the best parameter combination)
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score (mean cross-validated score of the best candidate)
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity of the best estimator, evaluated on the full count matrix
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'max_iter': 100, 'n_components': 5}
Best Log Likelihood Score:  -2250285.846024356
Model Perplexity:  867.3036619136184


In [None]:
import pickle

# NOTE(review): this rebinds `lda_model` and `vectorizer` to objects read from
# different files than the 'optimal_*.pkl' ones written earlier, while
# `data_vectorized` still comes from the vectorizer fitted in this session -
# confirm the loaded vocabulary matches before reusing that matrix.
# pickle.load can execute arbitrary code: only load files you created yourself.

# Load LDA Model
with open('/content/drive/MyDrive/Colab Notebooks/LDA_Model#1.pkl', 'rb') as f:
  lda_model = pickle.load(f)

# Load Vectorizer
with open('/content/drive/MyDrive/Colab Notebooks/Vectorizer#1.pkl', 'rb') as f:
  vectorizer = pickle.load(f)

In [None]:
# Rebinds `best_lda_model` to `lda_model`, replacing the grid-search estimator
# assigned to that name in an earlier cell
best_lda_model=lda_model

In [None]:
# NOTE: plain assignment - this is a second name for the same object, not an independent copy
best_lda_model_copy = best_lda_model

In [None]:
import pickle

# Save LDA Model
# NOTE(review): if the cell that rebinds `best_lda_model` to `lda_model` has been
# run, this writes that model rather than the grid-search estimator the file
# name suggests - confirm which one is intended.
with open('/content/drive/MyDrive/Colab Notebooks/LDA_Model#2_GridSearchCV.pkl', 'wb') as f:
  pickle.dump(best_lda_model, f)

In [None]:

# Create Document - Topic Matrix (one row per document, one column per topic)
lda_output = best_lda_model.transform(data_vectorized)

# column names
# Hand-written labels, one per topic column in model order; there are nine, so
# the DataFrame below can only be built if the model has nine components.
#topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
topicnames = ['Monuments/Historical Architectures/Spiritual Attractions', 'Mountains/Landscapes/Waterfalls', 'Beach/Seashores', 'Temples/Church/Worship', 
              'Wildlife/Forests/National Parks', 'Gardens/City Parks', 'Palace/Forts/Ancient Buildings', 'Museums/Indian Culture & History', 'Riverbank/Pilgrimage']
# index names
# NOTE(review): assumes the rows of `data` are in the same order (and number) as
# the documents in `data_vectorized` - verify.
#docnames = ["Doc" + str(i) for i in range(len(data))]
docnames = data['location_name'].values.tolist()

# Make the pandas dataframe (weights rounded to 2 decimals)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames)


# Get dominant topic for each document
# argmax runs on the rounded values, so topics that tie after rounding resolve
# to the first (left-most) one.
dominant_topic_index = np.argmax(df_document_topic.values, axis=1)
dominant_topic = [topicnames[idx] for idx in dominant_topic_index]
dominant_topic_prob = df_document_topic.max(axis=1).values.tolist()
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic['probability'] = dominant_topic_prob
df_document_topic['Place Name'] = docnames

# Styling
def color_green(val):
    """Return a CSS colour rule for one cell value.

    Values >= 1 give red, values strictly between 0.1 and 1 give green, and
    everything else gives black.
    """
    if val >= 1:
        return 'color: red'
    if .1 < val < 1:
        return 'color: green'
    return 'color: black'

def make_bold(val):
    """Return a CSS font-weight rule: 700 for values above 0.1, otherwise 400."""
    if val > .1:
        return 'font-weight: 700'
    return 'font-weight: 400'

# Apply Style
# Restrict the styling callbacks to the numeric columns: they compare each cell
# with a number, which raises TypeError on the string columns
# ('dominant_topic', 'Place Name') as soon as the Styler is rendered.
numeric_columns = topicnames + ['probability']
df_document_topics = (
    df_document_topic.style
    .applymap(color_green, subset=numeric_columns)
    .applymap(make_bold, subset=numeric_columns)
)
# Reorder the plain frame: place name first, topic weights, then the summary columns
df_document_topic = df_document_topic[['Place Name'] + topicnames[:] + ['dominant_topic', 'probability']]
df_document_topic

Unnamed: 0,Place Name,Monuments/Historical Architectures/Spiritual Attractions,Mountains/Landscapes/Waterfalls,Beach/Seashores,Temples/Church/Worship,Wildlife/Forests/National Parks,Gardens/City Parks,Palace/Forts/Ancient Buildings,Museums/Indian Culture & History,Riverbank/Pilgrimage,dominant_topic,probability
0,Visvesvaraya Industrial and Technological Museum,0.00,0.00,0.00,0.00,0.15,0.10,0.00,0.75,0.00,Museums/Indian Culture & History,0.75
1,Madhya Pradesh Tribal Museum,0.07,0.00,0.00,0.01,0.00,0.07,0.00,0.83,0.01,Museums/Indian Culture & History,0.83
2,Jantar Mantar - Jaipur,0.62,0.00,0.00,0.00,0.06,0.00,0.07,0.24,0.00,Monuments/Historical Architectures/Spiritual A...,0.62
3,Tea Gardens,0.00,0.66,0.00,0.00,0.03,0.26,0.01,0.03,0.00,Mountains/Landscapes/Waterfalls,0.66
4,Indira Gandhi Tulip Garden,0.00,0.25,0.00,0.00,0.01,0.74,0.00,0.00,0.00,Gardens/City Parks,0.74
...,...,...,...,...,...,...,...,...,...,...,...,...
430,KidZania Delhi NCR,0.00,0.03,0.04,0.15,0.61,0.15,0.00,0.03,0.00,Wildlife/Forests/National Parks,0.61
431,Kedarnath Mandir,0.01,0.57,0.00,0.41,0.00,0.00,0.00,0.00,0.01,Mountains/Landscapes/Waterfalls,0.57
432,Vijaya Vittala Temple,0.94,0.02,0.00,0.00,0.02,0.00,0.00,0.02,0.01,Monuments/Historical Architectures/Spiritual A...,0.94
433,Shatrunjaya hill temple,0.35,0.35,0.00,0.30,0.00,0.01,0.00,0.00,0.00,Monuments/Historical Architectures/Spiritual A...,0.35


In [None]:
# Persist the per-location topic distribution table
df_document_topic.to_pickle("/content/drive/MyDrive/Capstone/topic_distribution#1.pkl")

# Updating the metadata

In [None]:
# Load the per-location metadata table that the topics are merged into below
metadata = pd.read_csv("/content/drive/MyDrive/Capstone/metadata_final#2.csv", encoding='latin-1')
metadata

Unnamed: 0,Place Name,State,District,City,img_source,Link
0,Galgibaga Beach,Goa,South Goa District,Agonda,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
1,Agra Fort,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
2,Tomb of Itimad-ud-Daulah,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
3,Sheesh Mahal,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
4,Imperial Wax Museum,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
...,...,...,...,...,...,...
430,TU 142 Air Craft Museum,Andhra Pradesh,Visakhapatnam District,Visakhapatnam (Vizag),https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
431,Chembra Peak,Kerala,Wayanad District,Kalpetta,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
432,Dawki River,Meghalaya,West Jaintia Hills District,Dawki,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...
433,Krang Suri Falls,Meghalaya,West Jaintia Hills District,Jowai,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...


In [None]:
# One topic row per place, so the left join cannot multiply metadata rows
df_document_topic = df_document_topic.drop_duplicates(['Place Name'])

# Attach the dominant topic and its probability to each metadata row
topic_columns = df_document_topic[['Place Name', 'dominant_topic', 'probability']]
metadata_final = metadata.merge(topic_columns, on="Place Name", how="left").reset_index(drop = True)
metadata_final.head()

Unnamed: 0,Place Name,State,District,City,img_source,Link,dominant_topic,probability
0,Galgibaga Beach,Goa,South Goa District,Agonda,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...,Beach/Seashores,0.83
1,Agra Fort,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...,Palace/Forts/Ancient Buildings,0.57
2,Tomb of Itimad-ud-Daulah,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...,Palace/Forts/Ancient Buildings,0.34
3,Sheesh Mahal,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...,Palace/Forts/Ancient Buildings,0.67
4,Imperial Wax Museum,Uttar Pradesh,Agra District,Agra,https://media-cdn.tripadvisor.com/media/photo-...,https://www.tripadvisor.in/Attraction_Review-g...,Gardens/City Parks,0.6


In [None]:
# Write the merged metadata; the row index is written too (to_csv default),
# so it comes back as an unnamed first column when the file is re-read.
metadata_final.to_csv("/content/drive/MyDrive/Capstone/metadata_final_topics.csv")

In [None]:
# Spot-check the assigned topic for a handful of places
df_document_topic.loc[df_document_topic['Place Name'].isin(['Agaya Gangai Waterfalls','Sanchi Stupas','Dilwara Jain Temples','Pench Tiger Reserve', 'Sela Pass']),
                      ['Place Name','dominant_topic', 'probability']]


Unnamed: 0,Place Name,dominant_topic,probability
124,Sanchi Stupas,Monuments/Historical Architectures/Spiritual A...,0.73
231,Agaya Gangai Waterfalls,Mountains/Landscapes/Waterfalls,0.96
309,Dilwara Jain Temples,Monuments/Historical Architectures/Spiritual A...,0.83
365,Pench Tiger Reserve,Wildlife/Forests/National Parks,0.55
370,Sela Pass,Mountains/Landscapes/Waterfalls,0.98


# **Topics distribution across locations**

In [None]:
# Count how many locations fall under each dominant topic (most frequent first)
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = topic_counts.rename_axis('Topic').reset_index(name="No. of Documents")
df_topic_distribution

Unnamed: 0,Topic,No. of Documents
0,Mountains/Landscapes/Waterfalls,107
1,Museums/Indian Culture & History,61
2,Monuments/Historical Architectures/Spiritual A...,55
3,Temples/Church/Worship,50
4,Beach/Seashores,42
5,Palace/Forts/Ancient Buildings,34
6,Wildlife/Forests/National Parks,33
7,Gardens/City Parks,29
8,Riverbank/Pilgrimage,24


# **Topic-Keyword Matrix**

In [None]:
def topic_word_matrix(vectorizer, lda_model, n_words=20):
    """Return the top `n_words` keywords of every topic with their normalized weights.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older ``get_feature_names()``.
    lda_model : fitted topic model
        Must expose ``components_``, a 2-D array with one row per topic.
    n_words : int, default 20
        Number of highest-weighted keywords to keep per topic.

    Returns
    -------
    tuple (topic_keywords, keywords_weights)
        Two lists with one array per topic: the keywords in descending weight
        order, and the matching weights (each topic row scaled to sum to 1).
    """
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.array(get_names())

    # Scale every topic row so its weights sum to 1
    components = lda_model.components_
    normalized_weights = components / components.sum(axis=1, keepdims=True)

    topic_keywords = []
    keywords_weights = []
    for topic_weights in normalized_weights:
        # Negating the weights turns the ascending argsort into a descending one
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
        keywords_weights.append(topic_weights.take(top_keyword_locs))
    return topic_keywords,keywords_weights

# Top 20 keywords (and their weights) for every topic of the selected model
topic_keywords, topic_keywords_weights = topic_word_matrix(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20)        

# Topic - Keywords Dataframe (one row per topic, one column per keyword rank)
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
#df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
# Label the rows with the hand-written topic names instead of topic numbers
df_topic_keywords.index = topicnames
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Monuments/Historical Architectures/Spiritual Attractions,temple,cave,guide,beautiful,architecture,history,build,monument,carving,worth,sculpture,carve,site,amazing,ancient,stone,hour,rock,complex,structure
Mountains/Landscapes/Waterfalls,reach,beautiful,road,water,drive,experience,point,snow,mountain,enjoy,fall,hour,beauty,trip,travel,climb,lake,small,nature,amazing
Beach/Seashores,beach,clean,evening,food,enjoy,water,beautiful,restaurant,shop,crowd,people,spend,shack,area,relax,sunset,hotel,morning,road,shopping
Temples/Church/Worship,temple,people,feel,church,experience,crowd,come,hour,peaceful,queue,beautiful,peace,allow,morning,main,locate,free,famous,year,evening
Wildlife/Forests/National Parks,elephant,experience,amazing,tour,ride,agra,guide,activity,spend,really,animal,staff,love,enjoy,tiger,trip,wonderful,learn,help,care
Gardens/City Parks,garden,family,beautiful,enjoy,park,city,friend,maintain,spend,tomb,flower,evening,area,food,morning,tree,open,huge,watch,attraction
Palace/Forts/Ancient Buildings,beautiful,history,palace,fort,build,guide,building,bridge,light,worth,architecture,city,amazing,monument,people,evening,structure,historical,tour,tourist
Museums/Indian Culture & History,museum,history,life,collection,indian,interesting,display,culture,worth,live,thing,hour,people,maintain,learn,painting,different,really,house,spend
Riverbank/Pilgrimage,boat,ride,river,water,lake,evening,enjoy,experience,ghat,people,ganga,gange,watch,beautiful,morning,activity,flow,hour,trip,night


In [None]:
#topicnames = ['Waterfalls/Landscapes', 'Wildlife/Forests', 'Riverbank/Pilgrimage', 'Museum/Historical Attractions', 'Traditional/Culture/Tribal Arts',
#              'Trekking/Scenic Location', 'Temples/Worship/Spiritual', 'Beach/Seashores', 'Hills/Mountains/Valley', 'Church/Ancient Buildings']


In [None]:
def show_topics(vectorizer, lda_model, n_words=20):
    """Describe every topic as a "weight*keyword + weight*keyword + ..." string.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older ``get_feature_names()``.
    lda_model : fitted topic model
        Must expose ``components_``, a 2-D array with one row per topic.
    n_words : int, default 20
        Number of highest-weighted keywords listed per topic.

    Returns
    -------
    list of (int, str)
        One ``(topic_number, description)`` pair per topic; weights are
        normalized so each topic sums to 1 and rounded to 4 decimals.
    """
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.array(get_names())

    # Scale every topic row so its weights sum to 1
    components = lda_model.components_
    normalized_weights = components / components.sum(axis=1, keepdims=True)

    topic_list = []
    for topic_num, topic_weights in enumerate(normalized_weights):
        # Negating the weights turns the ascending argsort into a descending one
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords = keywords.take(top_keyword_locs)
        keywords_weights = topic_weights.take(top_keyword_locs)

        terms = [str(round(weight, 4)) + '*' + keyword
                 for weight, keyword in zip(keywords_weights, topic_keywords)]
        topic_list.append((topic_num, ' + '.join(terms)))
    return topic_list

# Print the weighted top-20 keyword description of every topic
show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20)

[(0,
  '0.0343*temple + 0.0341*cave + 0.0185*guide + 0.0149*beautiful + 0.0131*architecture + 0.0124*history + 0.011*build + 0.0108*monument + 0.0101*carving + 0.01*worth + 0.0096*sculpture + 0.0091*carve + 0.0089*site + 0.0088*amazing + 0.0083*ancient + 0.0083*stone + 0.0077*hour + 0.0075*rock + 0.007*complex + 0.007*structure'),
 (1,
  '0.0214*reach + 0.0179*beautiful + 0.0175*road + 0.0136*water + 0.0118*drive + 0.0117*experience + 0.0116*point + 0.0113*snow + 0.0111*mountain + 0.0102*enjoy + 0.01*fall + 0.009*hour + 0.0088*beauty + 0.0077*trip + 0.0069*travel + 0.0067*climb + 0.0067*lake + 0.0066*small + 0.0065*nature + 0.0064*amazing'),
 (2,
  '0.0562*beach + 0.0239*clean + 0.0185*evening + 0.0177*food + 0.0172*enjoy + 0.0172*water + 0.0158*beautiful + 0.0154*restaurant + 0.0144*shop + 0.0126*crowd + 0.0124*people + 0.0097*spend + 0.009*shack + 0.0089*area + 0.0086*relax + 0.0076*sunset + 0.0071*hotel + 0.0068*morning + 0.0064*road + 0.0064*shopping'),
 (3,
  '0.0285*temple + 0.01

# **Topics Visualization**

In [None]:
# The pyLDAvis imports are commented out in the notebook's import cell, so
# bring the package into scope here; otherwise this cell raises NameError.
import pyLDAvis
import pyLDAvis.sklearn

# Render the interactive topic map inline; topic coordinates are laid out with t-SNE
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [None]:
# Inspect the normalized keyword weights (one array per topic)
topic_keywords_weights

In [None]:
pip show gensim

In [None]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    """Predict the dominant LDA topic for raw text.

    Runs `text` through the notebook's preprocessing helpers
    (`sent_to_words`, `remove_stopwords`, `make_bigrams`, `make_trigrams`,
    `lemmatization`), vectorizes the result with the fitted `vectorizer`
    and scores it with `best_lda_model`.

    Parameters
    ----------
    text : list of str
        Documents to score, e.g. ``["some review text"]``.
    nlp : object, optional
        Kept for backward compatibility; this function does not use it
        directly.

    Returns
    -------
    topic
        Label taken from ``df_topic_keywords.index`` for the winning topic.
    topic_words : list
        The row of ``df_topic_keywords`` for the winning topic.
    topic_probability_scores
        The full output of ``best_lda_model.transform`` (one row per
        document).
    """
    # Step 1: Tokenize / clean
    tokens = list(sent_to_words(text))

    # Remove stop words
    tokens_nostop = remove_stopwords(tokens)

    # Form bigrams
    tokens_bigrams = make_bigrams(tokens_nostop)

    # Form trigrams
    tokens_trigrams = make_trigrams(tokens_bigrams)

    # Step 2: Lemmatize
    lemmatized = lemmatization(tokens_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    doc_term_matrix = vectorizer.transform(lemmatized)

    # Step 4: LDA transform
    topic_probability_scores = best_lda_model.transform(doc_term_matrix)

    # argmax over the whole score matrix yields a *flat* index; convert it back
    # to (row, column) and keep the column so the topic index stays in range
    # even when more than one document is scored. For a single document this
    # is identical to the plain argmax.
    flat_best = np.argmax(topic_probability_scores)
    topic_idx = int(np.unravel_index(flat_best, np.shape(topic_probability_scores))[-1])

    topic_words = df_topic_keywords.iloc[topic_idx, :].values.tolist()
    topic = df_topic_keywords.index[topic_idx]
    return topic, topic_words, topic_probability_scores


In [None]:
# Predict the topic
mytext = ["The Shree Jagannath Temple is an important Hindu temple dedicated to Jagannath, a form of Vishnu, in Puri in the state of Odisha on the eastern coast of India. The present temple was rebuilt from the 10th century onwards, on the site of an earlier temple, and begun by King Anantavarman Chodaganga Deva, first of the Eastern Ganga dynasty.[1]The Puri temple is famous for its annual Ratha yatra, or chariot festival, in which the three principal deities are pulled on huge and elaborately decorated temple cars. These gave their name to the English term 'Juggernaut'. Unlike the stone and metal icons found in most Hindu temples, the image of Jagannath is made of wood and is ceremoniously replaced every twelve or nineteen years by an exact replica.[2] It is one of the Char Dham.The temple is sacred to all Hindus and especially in those of the Vaishnava traditions. Many great saints, such as Ramananda and Ramanuja, were closely associated with the temple. Ramanuja established the Emar Mutt near the temple and Adi Shankaracharya established the Govardhana Mutt, which is the seat of one of the four Shankaracharyas. It is also of particular significance to the followers of the Gaudiya Vaishnavism whose founder Chaitanya Mahaprabhu, was attracted to the deity, Jagannath, and lived in Puri for many years"]
mytext2 = ["Bannerghatta National Park, near Bangalore, Karnataka, was founded in 1970 and declared as a national park in 1974.[1] In 2002 a portion of the park, became a biological reserve, the Bannerghatta Biological Park. It is a popular tourist destination with a zoo, a pet corner, an animal rescue centre, a butterfly enclosure, an aquarium, a snake house and a safari park.[2] There are ancient temples in the park for worship and it is a destination for trekking and hiking. The Zoo Authority of Karnataka, the University of Agricultural Sciences, Bangalore, and the Ashoka Trust for Research in Ecology and Environment (ATREE), Bangalore, are collaborating agencies. Within the national park area are six rural villages enclosed within three large enclosures for sheep and cattle farming.[3] This park offers a wide range of diverse wildlife to the exploradoras. Coming from the finest of Bengaluru, Karnataka, this park offers a guided bus tour all along the 6 km safari roads, which is specially made for safarists and foreign tourist gatherers."]
# Score the second sample text and show the full topic-probability output
# returned by predict_topic, followed by the winning topic's keywords.
topic, topic_words, prob_scores = predict_topic(text = mytext2)
print(topic, prob_scores)
print('\n',topic_words)

In [None]:
# Predict the topic
mytext = ["The Shree Jagannath Temple is an important Hindu temple dedicated to Jagannath, a form of Vishnu, in Puri in the state of Odisha on the eastern coast of India. The present temple was rebuilt from the 10th century onwards, on the site of an earlier temple, and begun by King Anantavarman Chodaganga Deva, first of the Eastern Ganga dynasty.[1]The Puri temple is famous for its annual Ratha yatra, or chariot festival, in which the three principal deities are pulled on huge and elaborately decorated temple cars. These gave their name to the English term 'Juggernaut'. Unlike the stone and metal icons found in most Hindu temples, the image of Jagannath is made of wood and is ceremoniously replaced every twelve or nineteen years by an exact replica.[2] It is one of the Char Dham.The temple is sacred to all Hindus and especially in those of the Vaishnava traditions. Many great saints, such as Ramananda and Ramanuja, were closely associated with the temple. Ramanuja established the Emar Mutt near the temple and Adi Shankaracharya established the Govardhana Mutt, which is the seat of one of the four Shankaracharyas. It is also of particular significance to the followers of the Gaudiya Vaishnavism whose founder Chaitanya Mahaprabhu, was attracted to the deity, Jagannath, and lived in Puri for many years"]

mytext2 = ["Bannerghatta National Park, near Bangalore, Karnataka, was founded in 1970 and declared as a national park in 1974.[1] In 2002 a portion of the park, became a biological reserve, the Bannerghatta Biological Park. It is a popular tourist destination with a zoo, a pet corner, an animal rescue centre, a butterfly enclosure, an aquarium, a snake house and a safari park.[2] There are ancient temples in the park for worship and it is a destination for trekking and hiking. The Zoo Authority of Karnataka, the University of Agricultural Sciences, Bangalore, and the Ashoka Trust for Research in Ecology and Environment (ATREE), Bangalore, are collaborating agencies. Within the national park area are six rural villages enclosed within three large enclosures for sheep and cattle farming.[3] This park offers a wide range of diverse wildlife to the exploradoras. Coming from the finest of Bengaluru, Karnataka, this park offers a guided bus tour all along the 6 km safari roads, which is specially made for safarists and foreign tourist gatherers."]

mytext3 = ["RK Beach also known as Ramakrishna Beach is situated on the East coast of Bay of Bengal in Visakhapatnam, Andhra Pradesh.[1] It is located near Dolphin's Nose.[2] One of the most popular beaches of Vizag is Ramakrishna beach, more commonly known as RK beach. RK Beach gets its name from the Ramakrishna Mission ashram situated near the beach. It is one of the very pleasant and most crowded beach in the country With its serene waters and cool atmosphere, the beach offers a wonderful view of the countryside."]

# Score the third sample text; only the highest probability is printed here.
# `prob_scores` is left in the namespace and reused by later cells.
topic, topic_words, prob_scores = predict_topic(text = mytext3)
print(topic, np.max(prob_scores))
print('\n',topic_words)

In [None]:
import pickle

# Save LDA Model
# Serializes the in-memory `best_lda_model` to a fixed path on Google Drive;
# an existing file at that path is overwritten ('wb').
with open('/content/drive/MyDrive/Colab Notebooks/LDA_Model#1.pkl', 'wb') as f:
  pickle.dump(best_lda_model, f)


In [None]:
import pickle

# Load LDA Model
# Rebinds `best_lda_model` to the object stored in the pickle written above.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('/content/drive/MyDrive/Colab Notebooks/LDA_Model#1.pkl', 'rb') as f:
  best_lda_model = pickle.load(f)

# **Word2vec Model**

In [None]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

In [None]:
class MySentences(object):
  """Corpus wrapper that yields every document as a list of whitespace-split tokens.

  Each call to ``iter()`` starts a new pass over ``doc_list``, so the same
  object can be handed to consumers that iterate the corpus more than once.
  """

  def __init__(self, docs):
    # Store a reference to the caller's collection (no copy is made).
    self.doc_list = docs

  def __iter__(self):
    # Lazily tokenize one document at a time.
    yield from (document.split() for document in self.doc_list)

# Train the Model

In [None]:
corpus_sentences = MySentences(data_lemmatized)

In [None]:
# Configure an 80-dimensional Word2Vec model; no corpus is passed here, the
# vocabulary is built and the model trained in the following cell.
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names — confirm the installed gensim version.
# NOTE(review): alpha == min_alpha, so the learning rate would stay constant instead of decaying — confirm this is intended.
# NOTE(review): workers=11 is hard-coded even though cpu_count is imported above — confirm.
model_w2v = Word2Vec(size=80, window=10, min_count=5, workers=11, sample=1e-4, negative=10, alpha=0.02, min_alpha=0.02, iter=50)

In [None]:
# First pass over the corpus: collect the vocabulary.
model_w2v.build_vocab(corpus_sentences)

# Train on the same corpus, reusing the document count recorded by build_vocab
# and the epoch count stored on the model (`iter=` at construction).
model_w2v.train(corpus_sentences, total_examples=model_w2v.corpus_count, epochs=model_w2v.iter)

In [None]:
# The save call is disabled; this cell only loads.
#model_w2v.save('/content/drive/MyDrive/Colab Notebooks/model#1_W2V.word2vec')
# Rebinds `model_w2v` to the model stored on Drive, replacing whatever model
# object was in memory from the cells above.
model_w2v = gensim.models.Word2Vec.load('/content/drive/MyDrive/Colab Notebooks/model#1_W2V.word2vec')

In [None]:
model_w2v['temple']

In [None]:
model_w2v.most_similar('goa')

In [None]:
model = Word2Vec.load("word2vec.model")

# **Topic Vectors**

In [None]:
# Build one vector per topic: the sum of its keywords' word vectors, each
# scaled by the keyword's weight divided by the topic's total keyword weight.
n_topic=10
vec_size=80  # must match the dimensionality of the word vectors being summed
topic_vectors = {}
for topic in range(n_topic):
    # Integer zeros; the first addition of a scaled word vector below yields a new array.
    topic_vec = np.repeat(0, vec_size)
    # NOTE(review): `top_keywords` is not defined in the visible cells — confirm where it comes from.
    ky_word = top_keywords[topic]
    #print(ky_word)
    topic_no = 'Topic'+str(topic)
    # Total weight of this topic's keywords, used as the normalizer.
    total = df_topic_keywords.loc[topic_no, ky_word].sum()
    #print(total)
    for word in ky_word:
        word_wt = df_topic_keywords.loc[topic_no, word]/total
        # NOTE(review): `w2v_model` is not defined at notebook level in the visible
        # cells (the Word2Vec model above is named `model_w2v`) — confirm.
        word_vec = w2v_model[word]*word_wt
        topic_vec = topic_vec + word_vec
  
    topic_vectors[topic_no] = topic_vec

In [None]:
np.zeros(80)

In [None]:
# Ad-hoc check for the topic at index 7: weighted sum of its keywords' word vectors.
topic_vector = np.zeros(80)
for weight_word in zip(topic_keywords_weights[7],topic_keywords[7]):
  # weight_word is a (weight, keyword) pair.
  topic_vector += weight_word[0]*model_w2v[weight_word[1]]

# NOTE(review): 0.46 is an unexplained scaling constant — confirm its purpose.
topic_vector = 0.46*topic_vector
topic_vector

In [None]:
#topic_keywords_weights
#test_vec = 0.45*(model_w2v['temple'] + model_w2v['peace'])
model_w2v.most_similar(positive=[topic_vector],topn=10)

In [None]:
def get_topic_vectors(w2v_model,topics,term_weights):
  """Build one embedding per topic as the weighted sum of its keywords' word vectors.

  Parameters
  ----------
  w2v_model : gensim Word2Vec model, KeyedVectors, or mapping of word -> vector
      If the object exposes a ``wv`` attribute, lookups go through it
      (indexing the model directly is deprecated in gensim in favour of
      ``model.wv[word]``); otherwise the object itself is indexed.
  topics : sequence of sequences of str
      ``topics[i]`` holds the keywords of topic ``i``.
  term_weights : sequence of sequences of float
      ``term_weights[i][j]`` is the weight applied to ``topics[i][j]``.

  Returns
  -------
  list of (str, numpy.ndarray)
      ``('Topic-<i>', vector)`` pairs in topic order; vectors are float64.

  Raises
  ------
  KeyError
      If a keyword has no vector in the embedding.
  IndexError
      If ``term_weights`` has fewer entries than ``topics``.
  """
  # Prefer the KeyedVectors view when present; plain mappings are used as-is.
  vectors = getattr(w2v_model, 'wv', w2v_model)
  # Only used for a topic without keywords; 80 was the previously hard-coded size.
  fallback_dim = getattr(vectors, 'vector_size', 80)

  topic_vector_list = []
  for i, topic_words in enumerate(topics):
    temp_vector = None
    for weight, word in zip(term_weights[i], topic_words):
      contribution = weight * np.asarray(vectors[word], dtype=float)
      # The accumulator takes its shape from the first word vector, so any
      # embedding dimensionality works.
      temp_vector = contribution if temp_vector is None else temp_vector + contribution
    if temp_vector is None:
      temp_vector = np.zeros(fallback_dim)
    topic_vector_list.append(('Topic-' + str(i), temp_vector))

  return topic_vector_list

In [None]:
topic_vector_list = get_topic_vectors(model_w2v,topic_keywords,topic_keywords_weights)
topic_vector_list

[('Topic-0',
  array([-0.27819448, -0.04272031, -0.18766874, -0.23621891, -0.05253636,
         -0.25335565,  0.15144888,  0.03226262, -0.35189508,  0.02734476,
         -0.23444504, -0.01439063,  0.20103021,  0.0301706 , -0.51749533,
          0.21039478,  0.18009804,  0.13746514,  0.3525527 ,  0.34784858,
          0.07699154, -0.04785294,  0.13587986,  0.12157999,  0.13432395,
         -0.00870262, -0.24728987,  0.25421404,  0.37835333,  0.13323695,
          0.21991089, -0.25463987,  0.06458555, -0.3224823 , -0.52903655,
         -0.28913909,  0.25917541,  0.24307502, -0.20268   , -0.32933946,
         -0.33260485,  0.07449881,  0.21194794, -0.12554846,  0.23432018,
         -0.08908046,  0.1415873 ,  0.34728431,  0.16614274,  0.2890216 ,
         -0.00158323,  0.19393955, -0.32553519, -0.07081958, -0.03242647,
          0.07288117,  0.26593235, -0.12833374,  0.16355714, -0.15081034,
          0.02108227, -0.26776929,  0.0840691 , -0.17808715, -0.12717598,
         -0.07585774, -0.

In [None]:
import pickle

# Load Topic Vectors
# Bind the result to `topic_vector_list` (the name the cells below consume)
# instead of `best_lda_model`, so the fitted LDA model that predict_topic
# relies on is not overwritten by the topic vectors.
# NOTE(review): assumes the pickle holds the (label, vector) list produced by
# get_topic_vectors — confirm against the cell that wrote the file.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('/content/drive/MyDrive/Colab Notebooks/topic_vectors#1.pkl', 'rb') as f:
  topic_vector_list = pickle.load(f)

In [None]:
#prob_scores

def infer_topic_vector(topics_vec_list, topic_prob):
  """Blend topic embeddings into one vector using a document's topic probabilities.

  Parameters
  ----------
  topics_vec_list : sequence of (label, vector)
      Topic embeddings such as the list returned by ``get_topic_vectors``;
      only the vector (index 1 of each pair) is read.
  topic_prob : 2-D array-like
      Topic probability rows; only the first row (``topic_prob[0]``) is used.

  Returns
  -------
  numpy.ndarray
      ``sum_i topic_prob[0][i] * topics_vec_list[i][1]``, with the same shape
      as the topic vectors.

  Raises
  ------
  IndexError
      If the first row has more probabilities than there are topic vectors.
  """
  doc_probs = topic_prob[0]

  # Size the accumulator from the topic vectors themselves; 80 (the previously
  # hard-coded size) is kept only for the degenerate case of no topic vectors.
  if len(topics_vec_list) > 0:
    topic_vector = np.zeros(np.shape(topics_vec_list[0][1]))
  else:
    topic_vector = np.zeros(80)

  for i, prob in enumerate(doc_probs):
    topic_vector += prob * np.asarray(topics_vec_list[i][1], dtype=float)

  return topic_vector

In [None]:
from gensim.models import Doc2Vec
# Load a saved Doc2Vec model from Drive; its document vectors are queried in a later cell.
model_d2v = Doc2Vec.load('/content/drive/MyDrive/Colab Notebooks/model#1_D2V.doc2vec')

In [None]:
prob_scores

In [None]:
vector = infer_topic_vector(topic_vector_list,prob_scores)

In [None]:
# Query with `vector`, the embedding inferred from `prob_scores` in the previous
# cell. `topic_vector` is the hand-built scratch vector for topic index 7 from
# earlier in the notebook and does not reflect the predicted document.
model_d2v.docvecs.most_similar(positive=[vector],topn=10)