In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import gensim
from gensim import matutils , models , corpora
import scipy.sparse



In [3]:
df_english = pd.read_pickle('english_reviews.pkl')

In [4]:
df_neg = df_english.loc[df_english['sentiment'] == 0]
df_neg.shape

(99, 3)

### Topic Modeling - Attempt #1

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
data_cv = cv.fit_transform(df_neg['reviews']).toarray()
features = cv.get_feature_names()

In [6]:
df = pd.DataFrame(data_cv , columns = features)

In [7]:
# One of the major input for gensim is Term Document Matrix
term_document_matrix = df.transpose()
term_document_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
year,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
years,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yet,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
younger,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# We need to convert our Term Document Matrix into a new format which is used by gensim
# i.e. Term Document Matrix -> Sparse Matrix 
# and then Sparse Matrix -> Gensim Corpus

sparse_matrix = scipy.sparse.csr_matrix(term_document_matrix)
corpus = matutils.Sparse2Corpus(sparse_matrix)  # Converts a matrix in scipy.sparse format into a streaming gensim corpus.

In [9]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = corpora.Dictionary([features])

In [10]:
# Number of Topics = 2
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=40)
lda.print_topics()

[(0,
  '0.017*"park" + 0.016*"kids" + 0.011*"much" + 0.010*"nice" + 0.010*"good" + 0.010*"like" + 0.008*"really" + 0.007*"water" + 0.007*"little" + 0.007*"place"'),
 (1,
  '0.021*"park" + 0.011*"nice" + 0.011*"small" + 0.008*"water" + 0.006*"one" + 0.006*"good" + 0.005*"area" + 0.005*"get" + 0.005*"could" + 0.005*"entrance"')]

In [11]:
# Number of Topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=40)
lda.print_topics()

[(0,
  '0.017*"park" + 0.013*"nice" + 0.010*"good" + 0.010*"water" + 0.009*"also" + 0.008*"place" + 0.008*"could" + 0.007*"much" + 0.007*"like" + 0.007*"one"'),
 (1,
  '0.031*"park" + 0.014*"kids" + 0.013*"nice" + 0.010*"much" + 0.009*"small" + 0.008*"view" + 0.008*"dog" + 0.007*"around" + 0.006*"little" + 0.006*"next"'),
 (2,
  '0.011*"like" + 0.009*"go" + 0.009*"kids" + 0.009*"good" + 0.008*"bad" + 0.008*"fun" + 0.008*"people" + 0.008*"water" + 0.008*"small" + 0.007*"little"')]

In [12]:
# Number of Topics = 10
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=40)
lda.print_topics()

[(0,
  '0.032*"park" + 0.020*"nice" + 0.012*"small" + 0.012*"bad" + 0.008*"much" + 0.008*"go" + 0.008*"well" + 0.008*"walk" + 0.008*"take" + 0.008*"close"'),
 (1,
  '0.019*"like" + 0.015*"good" + 0.015*"park" + 0.011*"views" + 0.011*"place" + 0.011*"nice" + 0.008*"could" + 0.008*"use" + 0.008*"theres" + 0.008*"visitors"'),
 (2,
  '0.023*"park" + 0.015*"severe" + 0.015*"weekend" + 0.015*"spend" + 0.015*"friends" + 0.008*"well" + 0.008*"full" + 0.008*"clean" + 0.008*"also" + 0.008*"water"'),
 (3,
  '0.015*"small" + 0.015*"good" + 0.010*"park" + 0.010*"little" + 0.010*"looking" + 0.010*"water" + 0.010*"walk" + 0.010*"picnic" + 0.010*"area" + 0.010*"point"'),
 (4,
  '0.036*"view" + 0.036*"point" + 0.027*"park" + 0.018*"small" + 0.018*"much" + 0.018*"forest" + 0.018*"theres" + 0.018*"next" + 0.018*"dog" + 0.018*"mini"'),
 (5,
  '0.020*"nice" + 0.014*"also" + 0.014*"water" + 0.014*"park" + 0.012*"little" + 0.012*"tables" + 0.012*"kids" + 0.009*"picnic" + 0.009*"since" + 0.006*"like"'),
 (6,


### Topic Modeling - Attempt #2 (Nouns)

In [13]:
# Let's create a function to pull out nouns from a string of text
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

[nltk_data] Downloading package punkt to C:\Users\Abhay
[nltk_data]     Mahajan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Abhay Mahajan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
df_neg_2 = df_neg
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(df_neg_2['reviews'].apply(nouns))
data_nouns.head()

Unnamed: 0,reviews
3,things park areas dog enclosure tables kids vi...
26,weekend friends
47,blocks litter cans recondition mode attachment
57,potiential
76,park care grounds outremont dissapointing sign...


In [15]:
from sklearn.feature_extraction import text
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [16]:
cv_noun = CountVectorizer(stop_words = stop_words)
data_cv_noun = cv_noun.fit_transform(data_nouns['reviews']).toarray()
features_noun = cv_noun.get_feature_names()

In [17]:
df2 = pd.DataFrame(data_cv_noun , columns = features_noun)

In [18]:
term_document_matrix_noun = df2.transpose()
term_document_matrix_noun
sparse_matrix_noun = scipy.sparse.csr_matrix(term_document_matrix_noun)
corpus_noun = matutils.Sparse2Corpus(sparse_matrix_noun)

In [19]:
id2word_noun = corpora.Dictionary([features_noun])

In [20]:
# Number of Topics = 2
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=2, passes=80)
lda_noun.print_topics()

[(0,
  '0.054*"park" + 0.020*"water" + 0.013*"area" + 0.011*"kids" + 0.011*"walk" + 0.009*"entrance" + 0.009*"years" + 0.009*"parents" + 0.007*"day" + 0.007*"signs"'),
 (1,
  '0.031*"kids" + 0.023*"park" + 0.016*"place" + 0.016*"water" + 0.012*"views" + 0.012*"view" + 0.010*"theres" + 0.010*"tables" + 0.010*"point" + 0.007*"things"')]

In [21]:
# Number of Topics = 3
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=3, passes=80)
lda_noun.print_topics()

[(0,
  '0.035*"park" + 0.013*"place" + 0.013*"kids" + 0.010*"area" + 0.010*"day" + 0.010*"weekend" + 0.010*"water" + 0.007*"walk" + 0.007*"food" + 0.007*"information"'),
 (1,
  '0.051*"park" + 0.017*"place" + 0.017*"views" + 0.016*"view" + 0.013*"theres" + 0.013*"point" + 0.013*"water" + 0.010*"dog" + 0.010*"tables" + 0.010*"space"'),
 (2,
  '0.043*"kids" + 0.030*"water" + 0.030*"park" + 0.011*"tables" + 0.011*"bit" + 0.008*"signs" + 0.008*"walk" + 0.008*"area" + 0.008*"parents" + 0.008*"grounds"')]

In [22]:
# Number of Topics = 10
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=10, passes=80)
lda_noun.print_topics()

[(0,
  '0.048*"park" + 0.029*"place" + 0.020*"water" + 0.020*"lots" + 0.020*"homeless" + 0.020*"parents" + 0.010*"sit" + 0.010*"garbage" + 0.010*"years" + 0.010*"covid"'),
 (1,
  '0.037*"park" + 0.037*"views" + 0.037*"theres" + 0.025*"day" + 0.025*"bit" + 0.025*"visitors" + 0.025*"cool" + 0.013*"area" + 0.013*"play" + 0.013*"river"'),
 (2,
  '0.037*"recondition" + 0.037*"cans" + 0.037*"mode" + 0.037*"attachment" + 0.037*"blocks" + 0.037*"litter" + 0.019*"fills" + 0.019*"lot" + 0.019*"row" + 0.019*"parking"'),
 (3,
  '0.115*"park" + 0.025*"entrance" + 0.025*"kids" + 0.019*"area" + 0.019*"water" + 0.013*"walk" + 0.013*"years" + 0.013*"tables" + 0.013*"note" + 0.013*"boulevard"'),
 (4,
  '0.038*"space" + 0.038*"soccer" + 0.020*"picnic" + 0.020*"family" + 0.020*"city" + 0.020*"location" + 0.020*"spaces" + 0.020*"players" + 0.020*"race" + 0.020*"fields"'),
 (5,
  '0.041*"kids" + 0.041*"park" + 0.041*"water" + 0.027*"signs" + 0.027*"grounds" + 0.027*"parents" + 0.027*"care" + 0.027*"dissapoi

### Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [23]:
# Let's create a function to pull out nouns and adjectives from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [24]:
df_neg_3 = df_neg
data_nouns_adj = pd.DataFrame(df_neg_3['reviews'].apply(nouns_adj))
data_nouns_adj.head()

Unnamed: 0,reviews
3,nice things park outdoor areas useful offleash...
26,spend weekend friends
47,blocks much cleaner much litter trash cans ful...
57,much potiential
76,nice park care grounds outremont dissapointing...


In [25]:
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [26]:
cv_noun_adj = CountVectorizer(stop_words = stop_words) # max_df=.8 means "It ignores terms that appear in more than 80% of the documents".
data_cv_noun_adj = cv_noun_adj.fit_transform(data_nouns_adj['reviews']).toarray()
features_noun_adj = cv_noun_adj.get_feature_names()

In [27]:
df3 = pd.DataFrame(data_cv_noun_adj , columns = features_noun_adj)
term_document_matrix_noun_adj = df3.transpose()
term_document_matrix_noun_adj
sparse_matrix_noun_adj = scipy.sparse.csr_matrix(term_document_matrix_noun_adj)
corpus_noun_adj = matutils.Sparse2Corpus(sparse_matrix_noun_adj)

In [28]:
id2word_noun_adj = corpora.Dictionary([features_noun_adj])

In [29]:
# Number of Topics = 2
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=2, passes=80)
lda_noun_adj.print_topics()

[(0,
  '0.037*"park" + 0.027*"kids" + 0.023*"water" + 0.017*"nice" + 0.017*"little" + 0.012*"good" + 0.011*"small" + 0.010*"place" + 0.010*"great" + 0.010*"picnic"'),
 (1,
  '0.016*"park" + 0.014*"good" + 0.013*"nice" + 0.010*"small" + 0.010*"theres" + 0.008*"view" + 0.007*"visitors" + 0.007*"food" + 0.007*"middle" + 0.007*"forest"')]

In [30]:
# Number of Topics = 3
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=3, passes=80)
lda_noun_adj.print_topics()

[(0,
  '0.037*"park" + 0.031*"kids" + 0.022*"nice" + 0.020*"little" + 0.017*"good" + 0.012*"parents" + 0.009*"theres" + 0.009*"views" + 0.009*"space" + 0.009*"water"'),
 (1,
  '0.022*"park" + 0.022*"small" + 0.014*"water" + 0.012*"great" + 0.011*"view" + 0.011*"nice" + 0.010*"kids" + 0.009*"point" + 0.009*"good" + 0.008*"bad"'),
 (2,
  '0.024*"park" + 0.015*"nice" + 0.014*"good" + 0.013*"water" + 0.012*"picnic" + 0.012*"tables" + 0.012*"little" + 0.010*"area" + 0.010*"place" + 0.008*"entrance"')]

In [31]:
# Number of Topics = 10
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=10, passes=80)
lda_noun_adj.print_topics()

[(0,
  '0.024*"nice" + 0.024*"water" + 0.016*"kids" + 0.016*"place" + 0.016*"station" + 0.008*"middle" + 0.008*"montreal" + 0.008*"green" + 0.008*"train" + 0.008*"spot"'),
 (1,
  '0.039*"park" + 0.032*"good" + 0.024*"views" + 0.016*"little" + 0.016*"bit" + 0.016*"visitors" + 0.016*"way" + 0.016*"forest" + 0.016*"water" + 0.008*"bad"'),
 (2,
  '0.033*"view" + 0.033*"point" + 0.025*"small" + 0.017*"good" + 0.017*"person" + 0.017*"help" + 0.017*"use" + 0.017*"forest" + 0.017*"food" + 0.017*"beach"'),
 (3,
  '0.037*"park" + 0.022*"area" + 0.022*"entrance" + 0.016*"little" + 0.016*"good" + 0.016*"picnic" + 0.016*"tables" + 0.016*"small" + 0.011*"careful" + 0.011*"walk"'),
 (4,
  '0.018*"good" + 0.014*"nice" + 0.014*"day" + 0.014*"information" + 0.014*"bad" + 0.014*"kids" + 0.014*"park" + 0.009*"french" + 0.009*"description" + 0.009*"drones"'),
 (5,
  '0.038*"park" + 0.022*"nice" + 0.022*"water" + 0.016*"walk" + 0.016*"kids" + 0.011*"place" + 0.011*"look" + 0.011*"concrete" + 0.011*"visit" +

### Applying on all the reviews dataset

In [32]:
df_all = pd.read_csv(r"C:\Users\Abhay Mahajan\Downloads\ParkReviewsLang.csv")

In [33]:
df_all.head(10)

Unnamed: 0.1,Unnamed: 0,review_for,review_id,username,user_url,published,date_retrieved,num_stars,num_reviews,review_text,lang
0,0,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUNpeGF6TTNnRRAB,Claudia,https://www.google.com/maps/contrib/1001449741...,7 months ago,2021-06-20 22:04:09.211296,4.0,107.0,One of the nicest entry points to this invitin...,en
1,1,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSURDOGEyMGpnRRAB,Nate Neel,https://www.google.com/maps/contrib/1121030547...,8 months ago,2021-06-20 22:04:09.212245,5.0,121.0,"Waterfront to fish or just relax, great place ...",en
2,2,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUM4Nk9Ya3lnRRAB,Yucel Salimoglu,https://www.google.com/maps/contrib/1034180738...,11 months ago,2021-06-20 22:04:09.213178,4.0,79.0,Everything except the parking is good here.,en
3,3,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNVdWNUbE9REAE,COCO BEADZ,https://www.google.com/maps/contrib/1036060504...,a year ago,2021-06-20 22:04:09.214115,4.0,128.0,"Defenely the best park in Montreal East, Tetre...",en
4,4,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUMwdHJDTm1nRRAB,Anna Maria Fiore,https://www.google.com/maps/contrib/1016779009...,a year ago,2021-06-20 22:04:09.215069,5.0,39.0,It's so peaceful and happy place near the water,en
5,5,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNpOWJHZ0lREAE,John Ronald César,https://www.google.com/maps/contrib/1058840136...,7 months ago,2021-06-20 22:04:09.216203,5.0,33.0,🌄Nice🥰,en
6,6,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSURraTZQc1hREAE,sebastien geoffrion,https://www.google.com/maps/contrib/1028952244...,a year ago,2021-06-20 22:04:09.217132,4.0,24.0,"Great view, clean and young children friendly.",en
7,7,Parc de la Capture-d'Ethan-Allen,ChZDSUhNMG9nS0VJQ0FnSUNncE5iNER3EAE,СВАНГА Сид,https://www.google.com/maps/contrib/1009950037...,3 years ago,2021-06-20 22:04:09.218052,5.0,4.0,Super!!!,en
8,8,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSUMwc0lTUC1BRRAB,Nathalie,https://www.google.com/maps/contrib/1158598595...,a year ago,2021-06-20 22:04:09.218970,4.0,30.0,Great to relax alone or with family!,en
9,9,Parc de la Capture-d'Ethan-Allen,ChdDSUhNMG9nS0VJQ0FnSURBXzUtdXF3RRAB,Alexandre,https://www.google.com/maps/contrib/1032303720...,3 years ago,2021-06-20 22:04:09.220022,5.0,56.0,Great park. It's very quiet and peaceful.,en


In [34]:
df_all_eng = df_all[df_all['lang'] == 'en']
df_all_eng.shape

(23032, 11)

In [35]:
df_all_neg = df_all_eng[df_all_eng['num_stars'] < 4]
df_all_neg.shape

(2248, 11)

### Applying nouns function Attempt#4

In [36]:
df_all_neg = pd.DataFrame(df_all_neg['review_text'].apply(nouns))

In [37]:
cv_noun = CountVectorizer(stop_words = stop_words)
data_cv_noun = cv_noun.fit_transform(df_all_neg['review_text']).toarray()
features_noun = cv_noun.get_feature_names()

In [38]:
data_cv_noun.shape

(2248, 2506)

In [39]:
df5 = pd.DataFrame(data_cv_noun , columns = features_noun)

In [40]:
term_document_matrix_noun = df5.transpose()
term_document_matrix_noun
sparse_matrix_noun = scipy.sparse.csr_matrix(term_document_matrix_noun)
corpus_noun = matutils.Sparse2Corpus(sparse_matrix_noun)

In [41]:
id2word_noun = corpora.Dictionary([features_noun])

In [42]:
# Number of Topics = 2
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=2, passes=40)
lda_noun.print_topics()

[(0,
  '0.056*"park" + 0.042*"place" + 0.020*"parc" + 0.020*"nice" + 0.017*"water" + 0.014*"kids" + 0.012*"google" + 0.010*"fountain" + 0.009*"day" + 0.009*"original"'),
 (1,
  '0.085*"park" + 0.025*"kids" + 0.012*"area" + 0.011*"google" + 0.010*"nice" + 0.010*"dogs" + 0.009*"dog" + 0.009*"trees" + 0.009*"city" + 0.009*"playground"')]

In [43]:
# Number of Topics = 3
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=3, passes=40)
lda_noun.print_topics()

[(0,
  '0.097*"park" + 0.036*"place" + 0.032*"kids" + 0.021*"water" + 0.014*"lots" + 0.012*"summer" + 0.010*"parking" + 0.009*"dog" + 0.009*"pool" + 0.009*"trees"'),
 (1,
  '0.074*"park" + 0.038*"nice" + 0.023*"parc" + 0.022*"area" + 0.015*"space" + 0.014*"kids" + 0.012*"day" + 0.011*"bit" + 0.010*"fountain" + 0.010*"dogs"'),
 (2,
  '0.037*"google" + 0.034*"original" + 0.025*"place" + 0.022*"park" + 0.011*"garbage" + 0.011*"great" + 0.008*"parks" + 0.008*"fun" + 0.007*"spot" + 0.007*"baseball"')]

In [44]:
# Number of Topics = 10
lda_noun = models.LdaModel(corpus=corpus_noun, id2word=id2word_noun, num_topics=10, passes=40)
lda_noun.print_topics()

[(0,
  '0.113*"place" + 0.067*"park" + 0.038*"nice" + 0.029*"water" + 0.022*"kids" + 0.019*"bit" + 0.016*"montreal" + 0.016*"houses" + 0.015*"family" + 0.015*"lots"'),
 (1,
  '0.053*"park" + 0.027*"kids" + 0.021*"years" + 0.020*"fountain" + 0.017*"place" + 0.016*"parks" + 0.014*"pool" + 0.013*"great" + 0.011*"grass" + 0.011*"suit"'),
 (2,
  '0.068*"lots" + 0.054*"space" + 0.044*"garbage" + 0.020*"squirrels" + 0.020*"dirty" + 0.019*"pretty" + 0.018*"slide" + 0.017*"event" + 0.017*"walking" + 0.016*"square"'),
 (3,
  '0.089*"google" + 0.080*"original" + 0.062*"parc" + 0.036*"park" + 0.026*"cool" + 0.025*"great" + 0.017*"tennis" + 0.015*"super" + 0.014*"summer" + 0.013*"play"'),
 (4,
  '0.154*"park" + 0.034*"dog" + 0.026*"dogs" + 0.015*"area" + 0.015*"beautiful" + 0.014*"food" + 0.013*"small" + 0.013*"place" + 0.011*"path" + 0.010*"street"'),
 (5,
  '0.109*"park" + 0.034*"trees" + 0.029*"parking" + 0.028*"playground" + 0.026*"baseball" + 0.025*"water" + 0.024*"kids" + 0.023*"lot" + 0.019*

### Applying Nouns and Adjectives Function & Stemming

In [110]:
df_all_neg = df_all_eng[df_all_eng['num_stars'] < 4]
df_all_neg = pd.DataFrame(df_all_neg['review_text'].apply(nouns_adj))

In [111]:
df_all_neg.shape

(2248, 1)

In [112]:
ps = nltk.stem.PorterStemmer()

In [113]:
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [114]:
df_all_neg['review_text'] = df_all_neg['review_text'].apply(stem_sentences)

In [115]:
df_all_neg.head()

Unnamed: 0,review_text
13,beer st-laurenc ok
15,correct
18,trè beau me pouvoir descendr au bord pour baig...
23,park squar nice place bu cool shade few seat a...
30,field uneven risk player hous OK conveni users...


In [131]:
add_stop_words = ['translated' , 'googl']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [132]:
cv_noun_adj = CountVectorizer(stop_words = stop_words)
data_cv_noun_adj = cv_noun_adj.fit_transform(df_all_neg['review_text']).toarray()
features_noun_adj = cv_noun_adj.get_feature_names()

In [133]:
data_cv_noun_adj.shape

(2248, 2615)

In [134]:
df6 = pd.DataFrame(data_cv_noun_adj , columns = features_noun_adj)

In [135]:
term_document_matrix_noun_adj = df6.transpose()
term_document_matrix_noun_adj
sparse_matrix_noun_adj = scipy.sparse.csr_matrix(term_document_matrix_noun_adj)
corpus_noun_adj = matutils.Sparse2Corpus(sparse_matrix_noun_adj)

In [136]:
id2word_noun_adj = corpora.Dictionary([features_noun_adj])

In [137]:
# Number of Topics = 2
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=2, passes=10)
lda_noun_adj.print_topics()

[(0,
  '0.096*"park" + 0.041*"nice" + 0.028*"place" + 0.019*"kid" + 0.016*"peopl" + 0.016*"good" + 0.014*"water" + 0.013*"lot" + 0.013*"dog" + 0.012*"small"'),
 (1,
  '0.020*"origin" + 0.017*"parc" + 0.012*"great" + 0.011*"kid" + 0.009*"cool" + 0.009*"mani" + 0.008*"field" + 0.008*"basebal" + 0.007*"park" + 0.007*"fun"')]

In [138]:
gensim.models.coherencemodel.CoherenceModel(corpus = corpus_noun_adj , model=lda_noun_adj, coherence='u_mass').get_coherence()

-6.028279613643649

In [139]:
# Number of Topics = 3
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=3, passes=40)
lda_noun_adj.print_topics()

[(0,
  '0.025*"good" + 0.015*"cool" + 0.015*"lot" + 0.014*"place" + 0.012*"parc" + 0.011*"nice" + 0.011*"kid" + 0.010*"green" + 0.009*"trash" + 0.009*"care"'),
 (1,
  '0.095*"park" + 0.042*"nice" + 0.030*"place" + 0.019*"kid" + 0.018*"water" + 0.016*"small" + 0.015*"dog" + 0.014*"origin" + 0.012*"mani" + 0.012*"peopl"'),
 (2,
  '0.054*"park" + 0.018*"ok" + 0.017*"great" + 0.016*"lot" + 0.015*"peopl" + 0.015*"kid" + 0.011*"nice" + 0.009*"day" + 0.009*"fun" + 0.009*"garbag"')]

In [140]:
gensim.models.coherencemodel.CoherenceModel(corpus = corpus_noun_adj , model=lda_noun_adj, coherence='u_mass').get_coherence()

-5.762279960992049

In [141]:
# Number of Topics = 6
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=6, passes=40)
lda_noun_adj.print_topics()

[(0,
  '0.046*"dog" + 0.037*"park" + 0.037*"good" + 0.033*"ok" + 0.028*"parc" + 0.026*"place" + 0.015*"area" + 0.015*"great" + 0.013*"day" + 0.011*"kid"'),
 (1,
  '0.088*"park" + 0.048*"peopl" + 0.043*"kid" + 0.029*"nice" + 0.018*"mani" + 0.017*"water" + 0.015*"lot" + 0.013*"parc" + 0.013*"homeless" + 0.012*"place"'),
 (2,
  '0.032*"park" + 0.019*"time" + 0.017*"beauti" + 0.017*"better" + 0.014*"littl" + 0.013*"bench" + 0.012*"year" + 0.011*"tree" + 0.011*"horribl" + 0.010*"place"'),
 (3,
  '0.085*"nice" + 0.070*"park" + 0.043*"place" + 0.030*"small" + 0.016*"littl" + 0.013*"kid" + 0.012*"view" + 0.011*"good" + 0.011*"walk" + 0.010*"fountain"'),
 (4,
  '0.046*"origin" + 0.034*"park" + 0.028*"great" + 0.026*"field" + 0.022*"good" + 0.019*"cool" + 0.016*"place" + 0.015*"soccer" + 0.014*"basebal" + 0.012*"spot"'),
 (5,
  '0.087*"park" + 0.026*"nice" + 0.020*"lot" + 0.017*"bad" + 0.017*"kid" + 0.014*"clean" + 0.014*"pool" + 0.013*"water" + 0.013*"dirti" + 0.012*"garbag"')]

In [142]:
gensim.models.coherencemodel.CoherenceModel(corpus = corpus_noun_adj , model=lda_noun_adj, coherence='u_mass').get_coherence()

-6.810201413275126

In [143]:
# Number of Topics = 10
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, id2word=id2word_noun_adj, num_topics=10, passes=40)
lda_noun_adj.print_topics()

[(0,
  '0.089*"park" + 0.087*"nice" + 0.044*"origin" + 0.035*"place" + 0.028*"ok" + 0.028*"lot" + 0.022*"cool" + 0.021*"clean" + 0.016*"littl" + 0.015*"space"'),
 (1,
  '0.065*"park" + 0.043*"dog" + 0.029*"bad" + 0.025*"bit" + 0.023*"nice" + 0.023*"kid" + 0.017*"place" + 0.014*"littl" + 0.014*"time" + 0.014*"young"'),
 (2,
  '0.072*"park" + 0.034*"place" + 0.024*"time" + 0.021*"noth" + 0.018*"quiet" + 0.017*"walk" + 0.014*"water" + 0.013*"special" + 0.013*"calm" + 0.012*"care"'),
 (3,
  '0.056*"field" + 0.049*"park" + 0.032*"parc" + 0.032*"soccer" + 0.028*"beauti" + 0.024*"great" + 0.022*"basebal" + 0.020*"correct" + 0.019*"pool" + 0.019*"suit"'),
 (4,
  '0.026*"area" + 0.024*"trail" + 0.017*"access" + 0.016*"activ" + 0.016*"mai" + 0.014*"traffic" + 0.014*"parc" + 0.014*"car" + 0.013*"highway" + 0.012*"aussi"'),
 (5,
  '0.086*"park" + 0.028*"nice" + 0.022*"kid" + 0.017*"winter" + 0.017*"skate" + 0.015*"picnic" + 0.014*"bathroom" + 0.014*"great" + 0.013*"pool" + 0.013*"children"'),
 (6,

In [144]:
gensim.models.coherencemodel.CoherenceModel(corpus = corpus_noun_adj , model=lda_noun_adj, coherence='u_mass').get_coherence()

-8.127178567399499