In [1]:
import pandas as pd
import numpy as np
import pickle

In [22]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp37-cp37m-win_amd64.whl (23.9 MB)
Collecting Cython==0.29.21
  Downloading Cython-0.29.21-cp37-cp37m-win_amd64.whl (1.6 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
Installing collected packages: Cython, smart-open, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.15
    Uninstalling Cython-0.29.15:
      Successfully uninstalled Cython-0.29.15
Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0


In [2]:
import gensim
from gensim import matutils , models , corpora
import scipy.sparse



In [3]:
df_english = pd.read_pickle('english_reviews.pkl')

In [4]:
df_english

Unnamed: 0,reviews,ratings,sentiment
0,really liked kids loved big large space kept r...,5,1.0
1,beautiful place familys enjoy little bit natur...,5,1.0
2,nice park smack middle city,5,1.0
3,nice things park varying outdoor areas useful ...,3,0.0
4,absolutely beautiful grounds well maintained m...,5,1.0
...,...,...,...
956,seem exist,1,0.0
957,expensive,1,0.0
958,,5,1.0
959,,3,0.0


### Topic Modeling - Attempt #1

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
cv = CountVectorizer()

In [7]:
data_cv = cv.fit_transform(df_english['reviews']).toarray()
features = cv.get_feature_names()

In [8]:
df = pd.DataFrame(data_cv , columns = features)

In [10]:
df

Unnamed: 0,013,04,10,100,11,11pm,12,13,15,165,...,youngest,youre,zen,zero,zip,zipline,zoo,zoology,zs,époques
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# One of the major input for gensim is Term Document Matrix
term_document_matrix = df.transpose()
term_document_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,951,952,953,954,955,956,957,958,959,960
013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zipline,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zs,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# We need to convert our Term Document Matrix into a new format which is used by gensim
# i.e. Term Document Matrix -> Sparse Matrix 
# and then Sparse Matrix -> Gensim Corpus

sparse_matrix = scipy.sparse.csr_matrix(term_document_matrix)
corpus = matutils.Sparse2Corpus(sparse_matrix)  # Converts a matrix in scipy.sparse format into a streaming gensim corpus.


In [13]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = corpora.Dictionary([feature.split() for feature in features])

In [14]:
# Number of Topics = 2
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=40)
lda.print_topics()

[(0,
  '0.019*"park" + 0.014*"kids" + 0.008*"play" + 0.007*"large" + 0.006*"space" + 0.005*"well" + 0.005*"good" + 0.005*"water" + 0.005*"big" + 0.005*"field"'),
 (1,
  '0.037*"park" + 0.027*"place" + 0.023*"nice" + 0.015*"great" + 0.012*"beautiful" + 0.009*"small" + 0.009*"walk" + 0.008*"montreal" + 0.007*"kids" + 0.007*"good"')]

In [15]:
# Number of Topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=40)
lda.print_topics()

[(0,
  '0.018*"park" + 0.016*"place" + 0.011*"kids" + 0.011*"great" + 0.009*"water" + 0.008*"beautiful" + 0.007*"area" + 0.007*"people" + 0.007*"good" + 0.007*"well"'),
 (1,
  '0.046*"park" + 0.015*"nice" + 0.010*"great" + 0.009*"place" + 0.008*"dog" + 0.008*"city" + 0.008*"one" + 0.007*"kids" + 0.006*"like" + 0.006*"picnic"'),
 (2,
  '0.027*"place" + 0.026*"nice" + 0.025*"park" + 0.014*"walk" + 0.013*"beautiful" + 0.011*"kids" + 0.010*"great" + 0.010*"small" + 0.008*"go" + 0.008*"good"')]

In [16]:
# Number of Topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=40)
lda.print_topics()

[(0,
  '0.049*"park" + 0.015*"nice" + 0.011*"great" + 0.011*"kids" + 0.011*"small" + 0.010*"place" + 0.009*"dog" + 0.009*"space" + 0.009*"play" + 0.008*"area"'),
 (1,
  '0.036*"park" + 0.027*"place" + 0.027*"nice" + 0.020*"great" + 0.012*"kids" + 0.011*"beautiful" + 0.011*"walk" + 0.011*"go" + 0.010*"montreal" + 0.009*"family"'),
 (2,
  '0.013*"kids" + 0.012*"good" + 0.011*"place" + 0.009*"many" + 0.009*"well" + 0.009*"park" + 0.008*"nice" + 0.007*"day" + 0.007*"cool" + 0.006*"enjoy"'),
 (3,
  '0.022*"place" + 0.014*"walk" + 0.013*"city" + 0.013*"beautiful" + 0.011*"park" + 0.009*"love" + 0.009*"nice" + 0.009*"go" + 0.008*"really" + 0.008*"large"')]

### Topic Modeling - Attempt #2 (Nouns)

In [26]:
# Let's create a function to pull out nouns from a string of text
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

[nltk_data] Downloading package punkt to C:\Users\Abhay
[nltk_data]     Mahajan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Abhay Mahajan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [19]:
df_english2 = df_english

In [28]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(df_english2['reviews'].apply(nouns))
data_nouns.head()

Unnamed: 0,reviews
0,kids space playground slides kids games weathe...
1,place familys city enjoy hike trails kids jung...
2,park smack city
3,things park areas dog enclosure tables kids vi...
4,grounds tables wait trees bloom


In [47]:
from sklearn.feature_extraction import text
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [48]:
cv_noun = CountVectorizer(stop_words = stop_words)
data_cv_noun = cv_noun.fit_transform(data_nouns['reviews']).toarray()
features_noun = cv_noun.get_feature_names()

In [49]:
df2 = pd.DataFrame(data_cv_noun , columns = features_noun)

In [50]:
df2

Unnamed: 0,absolute,access,accessibility,accessories,account,activities,activity,addicts,admire,adolescents,...,xylophone,xztra,year,years,ymca,yo,york,zoo,zoology,époques
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
term_document_matrix_noun = df2.transpose()
term_document_matrix_noun

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,951,952,953,954,955,956,957,958,959,960
absolute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
access,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessibility,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
account,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
york,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
sparse_matrix_noun = scipy.sparse.csr_matrix(term_document_matrix_noun)
corpus_noun = matutils.Sparse2Corpus(sparse_matrix_noun)

In [53]:
id2word_noun = corpora.Dictionary([feature.split() for feature in features_noun])

In [54]:
len(id2word_noun)

1385

In [78]:
# Number of Topics = 2
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=2, passes=80)
lda_noun.print_topics()

[(0,
  '0.064*"place" + 0.012*"dogs" + 0.012*"day" + 0.011*"city" + 0.011*"montreal" + 0.010*"summer" + 0.010*"area" + 0.009*"walk" + 0.009*"water" + 0.008*"parc"'),
 (1,
  '0.095*"park" + 0.032*"kids" + 0.018*"place" + 0.013*"space" + 0.012*"water" + 0.012*"family" + 0.011*"dog" + 0.011*"area" + 0.010*"fun" + 0.009*"city"')]

In [61]:
# Number of Topics = 3
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=3, passes=80 , iterations = 400)
lda_noun.print_topics()

[(0,
  '0.084*"park" + 0.024*"place" + 0.021*"space" + 0.020*"kids" + 0.018*"fun" + 0.013*"water" + 0.010*"dog" + 0.010*"trees" + 0.009*"area" + 0.008*"spot"'),
 (1,
  '0.048*"park" + 0.036*"place" + 0.032*"kids" + 0.015*"water" + 0.013*"lots" + 0.012*"field" + 0.010*"children" + 0.008*"soccer" + 0.008*"area" + 0.008*"views"'),
 (2,
  '0.048*"place" + 0.043*"park" + 0.021*"family" + 0.020*"city" + 0.019*"walk" + 0.015*"area" + 0.015*"montreal" + 0.012*"day" + 0.011*"friends" + 0.010*"view"')]

In [62]:
# Number of Topics = 4
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=4, passes=80 , iterations = 400)
lda_noun.print_topics()

[(0,
  '0.073*"place" + 0.036*"kids" + 0.025*"park" + 0.015*"parc" + 0.012*"summer" + 0.011*"montreal" + 0.011*"field" + 0.011*"city" + 0.010*"fun" + 0.009*"soccer"'),
 (1,
  '0.095*"park" + 0.024*"kids" + 0.019*"water" + 0.018*"place" + 0.018*"space" + 0.017*"area" + 0.014*"city" + 0.013*"lots" + 0.010*"benches" + 0.009*"trees"'),
 (2,
  '0.031*"place" + 0.022*"walk" + 0.020*"view" + 0.017*"area" + 0.017*"beach" + 0.013*"park" + 0.012*"dogs" + 0.011*"picnic" + 0.008*"kind" + 0.008*"theres"'),
 (3,
  '0.079*"park" + 0.022*"dog" + 0.018*"place" + 0.017*"family" + 0.014*"friends" + 0.012*"water" + 0.011*"lake" + 0.011*"kids" + 0.009*"fun" + 0.008*"games"')]

### Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [89]:
# Let's create a function to pull out nouns and adjectives from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [91]:
df_english3 = df_english

In [113]:
data_nouns_adj = pd.DataFrame(df_english3['reviews'].apply(nouns_adj))
data_nouns_adj.head()

Unnamed: 0,reviews
0,liked kids big large space playground slides k...
1,beautiful place familys little bit nature city...
2,nice park smack middle city
3,nice things park outdoor areas useful offleash...
4,beautiful grounds many picnic tables cant wait...


In [144]:
cv_noun_adj = CountVectorizer(max_df=.01) # max_df=.8 means "It ignores terms that appear in more than 80% of the documents".
data_cv_noun_adj = cv_noun_adj.fit_transform(data_nouns_adj['reviews']).toarray()
features_noun_adj = cv_noun_adj.get_feature_names()

In [145]:
df3 = pd.DataFrame(data_cv_noun_adj , columns = features_noun_adj)

In [147]:
term_document_matrix_noun_adj = df3.transpose()
term_document_matrix_noun_adj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,951,952,953,954,955,956,957,958,959,960
able,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
absolute,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessibility,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessible,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zipline,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoology,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zs,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
sparse_matrix_noun_adj = scipy.sparse.csr_matrix(term_document_matrix_noun_adj)
corpus_noun_adj = matutils.Sparse2Corpus(sparse_matrix_noun_adj)

In [226]:
id2word_noun_adj = corpora.Dictionary([features_noun_adj])

In [238]:
# Number of Topics = 2
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=2, passes=80)
lda_noun_adj.print_topics()

[(0,
  '0.004*"cream" + 0.004*"feel" + 0.003*"watch" + 0.003*"traffic" + 0.003*"history" + 0.003*"early" + 0.003*"greenery" + 0.003*"town" + 0.003*"adults" + 0.003*"bigger"'),
 (1,
  '0.005*"real" + 0.003*"wan" + 0.003*"hidden" + 0.003*"sport" + 0.003*"observation" + 0.003*"bus" + 0.003*"respect" + 0.003*"feels" + 0.003*"mount" + 0.003*"ages"')]

In [239]:
# Number of Topics = 3
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=3, passes=20)
lda_noun_adj.print_topics()

[(0,
  '0.005*"wan" + 0.005*"see" + 0.004*"cute" + 0.004*"hidden" + 0.004*"complete" + 0.004*"visitors" + 0.004*"variety" + 0.004*"cream" + 0.004*"construction" + 0.004*"feel"'),
 (1,
  '0.005*"smaller" + 0.004*"traffic" + 0.004*"mean" + 0.004*"bus" + 0.004*"mount" + 0.004*"respect" + 0.004*"greenery" + 0.004*"sport" + 0.004*"youre" + 0.004*"evenings"'),
 (2,
  '0.006*"real" + 0.005*"rest" + 0.005*"super" + 0.004*"favourite" + 0.004*"town" + 0.004*"enough" + 0.004*"google" + 0.004*"suitable" + 0.004*"pad" + 0.004*"recommend"')]

In [240]:
# Number of Topics = 4
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=4, passes=20)
lda_noun_adj.print_topics()

[(0,
  '0.006*"natural" + 0.006*"mount" + 0.006*"bus" + 0.006*"birds" + 0.006*"respect" + 0.006*"canal" + 0.005*"heart" + 0.005*"rules" + 0.005*"youre" + 0.005*"short"'),
 (1,
  '0.006*"wan" + 0.005*"evenings" + 0.005*"weekday" + 0.004*"artificial" + 0.004*"steps" + 0.004*"location" + 0.004*"observation" + 0.004*"sights" + 0.004*"couples" + 0.004*"deck"'),
 (2,
  '0.006*"restaurants" + 0.006*"traffic" + 0.005*"bigger" + 0.005*"mean" + 0.005*"round" + 0.005*"eat" + 0.005*"available" + 0.004*"wish" + 0.004*"coffee" + 0.004*"history"'),
 (3,
  '0.010*"real" + 0.006*"dollars" + 0.005*"hidden" + 0.005*"google" + 0.005*"ways" + 0.005*"foot" + 0.005*"map" + 0.005*"pick" + 0.005*"consult" + 0.005*"version"')]

### Applying on all the reviews dataset

In [242]:
df_all = pd.read_csv(r"C:\Users\Abhay Mahajan\Downloads\ParkReviews.csv")

In [347]:
df_all = df_all.dropna()

In [348]:
df_all['review_text']

0        One of the nicest entry points to this invitin...
1        Waterfront to fish or just relax, great place ...
2              Everything except the parking is good here.
3        Defenely the best park in Montreal East, Tetre...
4          It's so peaceful and happy place near the water
                               ...                        
64715                           Great spot in old Montreal
64716                                            Beautiful
64717                                              History
64718                                        Picturesque 😎
64719                             Beautiful place to visit
Name: review_text, Length: 29731, dtype: object

In [244]:
df_all_reviews = df_all['review_text']
df_all_reviews = pd.DataFrame(df_all_reviews)

In [253]:
df_all_reviews = df_all_reviews.dropna()

In [349]:
df_all_reviews

Unnamed: 0,review_text
0,entry points waterfront promenade St-Lawrence ...
1,Waterfront place hangout sunset place
2,Everything parking
3,park Montreal East Tetreauville area park plen...
4,place water
...,...
64715,Great spot Montreal
64716,
64717,History
64718,Picturesque 😎


In [258]:
import googletrans
from googletrans import Translator 
translator = Translator()

### Applying nouns function Attempt#4

In [268]:
df_all_reviews = pd.DataFrame(df_all_reviews['review_text'].apply(nouns))

In [321]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Abhay
[nltk_data]     Mahajan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [322]:
add_stop_words = ['Translated' , 'by' , 'Google']
stop_words_french = (stopwords.words('french'))
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
stop_words = stop_words.union(stop_words_french)

In [323]:
cv_noun = CountVectorizer(stop_words = stop_words)
data_cv_noun = cv_noun.fit_transform(df_all_reviews['review_text']).toarray()
features_noun = cv_noun.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [324]:
df5 = pd.DataFrame(data_cv_noun , columns = features_noun)

In [325]:
term_document_matrix_noun = df5.transpose()
term_document_matrix_noun
sparse_matrix_noun = scipy.sparse.csr_matrix(term_document_matrix_noun)
corpus_noun = matutils.Sparse2Corpus(sparse_matrix_noun)

In [326]:
id2word_noun = corpora.Dictionary([features_noun])

In [327]:
len(id2word_noun)

17600

In [328]:
# Number of Topics = 2
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=2, passes=10)
lda_noun.print_topics()

[(0,
  '0.098*"google" + 0.097*"original" + 0.037*"parc" + 0.033*"park" + 0.020*"très" + 0.018*"beau" + 0.015*"beautiful" + 0.009*"bien" + 0.009*"children" + 0.009*"place"'),
 (1,
  '0.056*"park" + 0.053*"place" + 0.022*"nice" + 0.015*"great" + 0.014*"montreal" + 0.013*"kids" + 0.010*"summer" + 0.010*"nature" + 0.009*"city" + 0.009*"lots"')]

In [329]:
# Number of Topics = 3
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=3, passes=10)
lda_noun.print_topics()

[(0,
  '0.085*"park" + 0.020*"kids" + 0.018*"nice" + 0.016*"soccer" + 0.015*"pool" + 0.013*"water" + 0.013*"space" + 0.013*"area" + 0.012*"lots" + 0.011*"children"'),
 (1,
  '0.098*"place" + 0.020*"montreal" + 0.014*"nature" + 0.014*"city" + 0.013*"great" + 0.012*"walk" + 0.012*"park" + 0.011*"view" + 0.011*"time" + 0.011*"summer"'),
 (2,
  '0.127*"google" + 0.125*"original" + 0.048*"parc" + 0.035*"park" + 0.026*"très" + 0.023*"beau" + 0.023*"beautiful" + 0.012*"bien" + 0.010*"enfants" + 0.010*"super"')]

In [330]:
# Number of Topics = 4
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=4, passes=10)
lda_noun.print_topics()

[(0,
  '0.137*"place" + 0.028*"montreal" + 0.020*"nature" + 0.019*"city" + 0.017*"walk" + 0.015*"view" + 0.012*"montréal" + 0.011*"nice" + 0.011*"time" + 0.010*"family"'),
 (1,
  '0.045*"google" + 0.044*"original" + 0.011*"para" + 0.006*"calm" + 0.006*"muy" + 0.006*"lugar" + 0.005*"parque" + 0.004*"peaceful" + 0.004*"ok" + 0.004*"ronde"'),
 (2,
  '0.106*"park" + 0.023*"kids" + 0.023*"nice" + 0.019*"great" + 0.018*"soccer" + 0.016*"area" + 0.016*"lots" + 0.014*"water" + 0.013*"pool" + 0.012*"winter"'),
 (3,
  '0.115*"google" + 0.114*"original" + 0.048*"parc" + 0.038*"park" + 0.025*"très" + 0.023*"beau" + 0.017*"beautiful" + 0.012*"bien" + 0.010*"enfants" + 0.010*"super"')]

### Applying Nouns and Adjectives Function

In [306]:
data_nouns_adj = pd.DataFrame(df_all_reviews['review_text'].apply(nouns_adj))
data_nouns_adj.head()

Unnamed: 0,review_text
0,entry points waterfront promenade St-Lawrence ...
1,Waterfront place hangout sunset place
2,Everything parking
3,park Montreal East Tetreauville area park plen...
4,place water


In [307]:
# add_stop_words = ['translated' , 'by' , 'google']
# stop_words_french = (stopwords.words('french'))
# stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
# stop_words = stop_words.union(stop_words_french)

In [331]:
cv_noun_adj = CountVectorizer(stop_words = stop_words) # max_df=.8 means "It ignores terms that appear in more than 80% of the documents".
data_cv_noun_adj = cv_noun_adj.fit_transform(data_nouns_adj['review_text']).toarray()
features_noun_adj = cv_noun_adj.get_feature_names()

In [332]:
df6 = pd.DataFrame(data_cv_noun_adj , columns = features_noun_adj)

In [333]:
term_document_matrix_noun_adj = df6.transpose()
term_document_matrix_noun_adj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29721,29722,29723,29724,29725,29726,29727,29728,29729,29730
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
하기,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
하여,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
현대사박물관,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
휴식하기,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [334]:
sparse_matrix_noun_adj = scipy.sparse.csr_matrix(term_document_matrix_noun_adj)
corpus_noun_adj = matutils.Sparse2Corpus(sparse_matrix_noun_adj)

In [335]:
id2word_noun_adj = corpora.Dictionary([features_noun_adj])

In [336]:
len(id2word_noun_adj)

16224

In [337]:
# Number of Topics = 2
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=2, passes=10)
lda_noun_adj.print_topics()

[(0,
  '0.085*"park" + 0.055*"original" + 0.055*"google" + 0.044*"parc" + 0.022*"beautiful" + 0.022*"beau" + 0.019*"nice" + 0.016*"très" + 0.013*"children" + 0.013*"great"'),
 (1,
  '0.071*"original" + 0.071*"google" + 0.058*"place" + 0.013*"montreal" + 0.009*"city" + 0.009*"nature" + 0.008*"très" + 0.007*"bien" + 0.006*"view" + 0.006*"people"')]

In [338]:
# Number of Topics = 3
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=3, passes=10)
lda_noun_adj.print_topics()

[(0,
  '0.069*"park" + 0.065*"place" + 0.032*"nice" + 0.022*"great" + 0.019*"kids" + 0.013*"summer" + 0.012*"family" + 0.011*"area" + 0.011*"time" + 0.011*"lots"'),
 (1,
  '0.106*"original" + 0.105*"google" + 0.043*"beautiful" + 0.033*"park" + 0.033*"place" + 0.027*"parc" + 0.026*"beau" + 0.021*"très" + 0.012*"montreal" + 0.012*"belle"'),
 (2,
  '0.080*"original" + 0.080*"google" + 0.036*"parc" + 0.034*"park" + 0.016*"très" + 0.014*"children" + 0.013*"bien" + 0.012*"super" + 0.012*"enfants" + 0.010*"beau"')]

In [339]:
# Number of Topics = 4
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=4, passes=10)
lda_noun_adj.print_topics()

[(0,
  '0.016*"fun" + 0.012*"country" + 0.011*"trails" + 0.010*"cross" + 0.010*"para" + 0.009*"path" + 0.006*"island" + 0.006*"fond" + 0.006*"muy" + 0.006*"bike"'),
 (1,
  '0.177*"place" + 0.038*"montreal" + 0.025*"city" + 0.025*"nature" + 0.018*"view" + 0.017*"montréal" + 0.016*"time" + 0.016*"family" + 0.014*"day" + 0.013*"summer"'),
 (2,
  '0.119*"park" + 0.031*"nice" + 0.026*"kids" + 0.022*"great" + 0.019*"soccer" + 0.018*"area" + 0.017*"lots" + 0.015*"water" + 0.014*"baseball" + 0.013*"pool"'),
 (3,
  '0.130*"original" + 0.129*"google" + 0.047*"parc" + 0.037*"park" + 0.026*"très" + 0.023*"beau" + 0.017*"beautiful" + 0.012*"bien" + 0.010*"super" + 0.010*"enfants"')]

In [351]:
# Number of Topics = 10
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=10, passes=10)
lda_noun_adj.print_topics()

[(0,
  '0.095*"great" + 0.059*"soccer" + 0.039*"summer" + 0.037*"tennis" + 0.033*"winter" + 0.033*"field" + 0.024*"basketball" + 0.023*"family" + 0.023*"country" + 0.021*"time"'),
 (1,
  '0.098*"original" + 0.096*"google" + 0.041*"children" + 0.034*"super" + 0.033*"enfants" + 0.019*"games" + 0.017*"water" + 0.014*"eau" + 0.012*"good" + 0.011*"jeux"'),
 (2,
  '0.079*"montreal" + 0.054*"city" + 0.053*"nature" + 0.038*"view" + 0.034*"montréal" + 0.025*"river" + 0.020*"path" + 0.016*"spot" + 0.015*"bike" + 0.013*"st"'),
 (3,
  '0.244*"park" + 0.096*"nice" + 0.040*"baseball" + 0.038*"beautiful" + 0.038*"dog" + 0.019*"beach" + 0.017*"dogs" + 0.012*"family" + 0.008*"spaces" + 0.008*"field"'),
 (4,
  '0.027*"year" + 0.026*"benches" + 0.024*"music" + 0.020*"food" + 0.018*"people" + 0.018*"ice" + 0.017*"times" + 0.016*"ski" + 0.016*"ground" + 0.016*"metro"'),
 (5,
  '0.153*"original" + 0.152*"google" + 0.079*"parc" + 0.067*"park" + 0.039*"beau" + 0.039*"très" + 0.028*"beautiful" + 0.014*"bien" +

In [197]:
model_vis = gensimvis.prepare(lda_noun_adj, corpus_noun_adj, id2word_noun_adj)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()