In [1]:
import pandas as pd
import numpy as np
import re, string

import nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
  
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora

from collections import Counter
import string

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

import zipfile
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data pre-processing

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have fewer than 3 characters are removed.
- All stopwords are removed.
 - Words are __lemmatized__ — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are __stemmed__ — words are reduced to their root form.


In [2]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text.
    Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3.
    """
    
    text=text.lower() #Convert everything to lowercase
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words =  nltk.word_tokenize(nopunct) # tokenizing words
    words = [w for w in words if len(w) > 3]  # Drop words less than length 3
    words = [w for w in words if w not in ENGLISH_STOP_WORDS] # Removes stop words using SciKit-Learn's ENGLISH_STOP_WORDS set.
    words = [w.encode('ascii','ignore') for w in words]
    return words

def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    words = [w.decode('ascii','ignore') for w in words]
    stemmed = [stemmer.stem(w) for w in words]
    return stemmed

def lemmatize (tokens):
    '''
    Given a list of tokens/words, return a list of lemmatized words 
    i.e. words in third person are changed to first person and verbs in past,
    and future tenses are changed into present.
    '''
    text = " ".join(tokens)
    lemmatizer = WordNetLemmatizer()
    
    return lemmatizer.lemmatize(text).split()
    
    
def tokenizer(text):
    
    return stemwords(tokenize(text))


In [3]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

In [4]:
df_reviews = pd.read_csv('data/appstore_all_reviews.csv')

In [6]:
df_reviews.tail()

Unnamed: 0.1,Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date,review_id
4361,3768,Anthem Anywhere,589443627,Wallet is slimmer,nowherepeople,https://itunes.apple.com/us/reviews/id112855409,0,0,4,good app. let me take another item out of my w...,2016-10-03 04:16:11,https://itunes.apple.com/us/reviews/id11285540...
4362,3769,Anthem Anywhere,589443627,Great communication,Bill S - KC,https://itunes.apple.com/us/reviews/id32122191,0,0,5,Great having an app to keep up with insurance ...,2016-02-10 10:14:39,https://itunes.apple.com/us/reviews/id32122191...
4363,3770,Anthem Anywhere,589443627,Info at fingertips,Rviv'd,https://itunes.apple.com/us/reviews/id217661497,0,0,4,Provides the access and information about bene...,2015-09-02 08:00:39,https://itunes.apple.com/us/reviews/id21766149...
4364,3773,Anthem Anywhere,589443627,Super Handy,CocoBean8112,https://itunes.apple.com/us/reviews/id32458709,0,0,5,I'm loving this app so far. It's great having ...,2016-08-01 05:34:05,https://itunes.apple.com/us/reviews/id32458709...
4365,3774,Anthem Anywhere,589443627,Location aware severely limits app,max egami,https://itunes.apple.com/us/reviews/id12609792,0,0,1,"Want to find that urgent care facility nearby,...",2015-12-28 11:53:33,https://itunes.apple.com/us/reviews/id12609792...


In [7]:
reviews_text = df_reviews['text']

In [8]:
text = list(reviews_text)

In [9]:
text

['Lab Corp had my weekly blood work results on the last app Up two months ago earlier in 2018, since they forced everyone to open up the HealthVault app to get results I have not been able to get one result and I just found out that they do not supply results to iPhones five, six and seven. After spending hours checking I found out that you need an iPhone 8 or 10 and that is ridiculous, it had been working for three years just fine and I can guarantee you that regular hard-working people cannot afford the latest phones. This is a terrible new system which excludes The majority of regular people with regular phones. I am a transplant patient and I need to keep up with my blood work results frequently since many levels such as my potassium could cause me heart problems from one week to another. I hope someone gets back in touch with me or I will be switching lab companies who can supply a simple task as supplying blood work results which I pay for.',
 'This app used to be my favorite. It

In [10]:
# Prepare the corpus
clean_text = [tokenizer(t) for t in text]

In [11]:
text[0]

'Lab Corp had my weekly blood work results on the last app Up two months ago earlier in 2018, since they forced everyone to open up the HealthVault app to get results I have not been able to get one result and I just found out that they do not supply results to iPhones five, six and seven. After spending hours checking I found out that you need an iPhone 8 or 10 and that is ridiculous, it had been working for three years just fine and I can guarantee you that regular hard-working people cannot afford the latest phones. This is a terrible new system which excludes The majority of regular people with regular phones. I am a transplant patient and I need to keep up with my blood work results frequently since many levels such as my potassium could cause me heart problems from one week to another. I hope someone gets back in touch with me or I will be switching lab companies who can supply a simple task as supplying blood work results which I pay for.'

In [13]:
#clean_text[0]

In [30]:
#Creating the dictionary out of corpus
dictionary = corpora.Dictionary(clean_text)

In [31]:
len(dictionary)

3930

In [32]:
count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count +=1
    if count >10:
        break
    

0 abl
1 afford
2 blood
3 caus
4 check
5 compani
6 corp
7 earlier
8 exclud
9 fine
10 forc


In [33]:
# Filter extremes:
# Filter tokens that appear in less than 10 doc-s
# Filter out tokens that appear in more than 50% of docs
dictionary.filter_extremes(no_below=2, no_above=0.9)

In [34]:
len(dictionary)

2269

In [35]:
# Convert list of documents (corpus) to Doc Term Matrix
doc_term_mx = [dictionary.doc2bow(doc) for doc in clean_text]

In [36]:
# See how the 10th document looks like
print(doc_term_mx[10])

[(14, 1), (24, 1), (44, 1), (71, 1), (107, 1), (137, 1), (138, 1), (139, 1), (140, 1), (141, 1), (142, 1), (143, 1), (144, 1)]


In [37]:
#Build the LDA model

NUM_TOPICS = 5
lda = gensim.models.ldamodel.LdaModel(corpus=doc_term_mx, \
                                      id2word=dictionary, num_topics=NUM_TOPICS) \
                                      #, update_every=1, chunksize=100, passes=50)

In [27]:
lda.print_topics()

[(0,
  '0.036*"work" + 0.022*"time" + 0.019*"just" + 0.015*"websit" + 0.015*"cigna" + 0.013*"download" + 0.013*"use" + 0.013*"custom" + 0.011*"doesn" + 0.010*"tri"'),
 (1,
  '0.018*"card" + 0.017*"work" + 0.013*"just" + 0.012*"claim" + 0.011*"doctor" + 0.011*"make" + 0.011*"like" + 0.010*"need" + 0.010*"anthem" + 0.010*"messag"'),
 (2,
  '0.034*"card" + 0.026*"insur" + 0.023*"password" + 0.018*"need" + 0.012*"abl" + 0.011*"access" + 0.011*"use" + 0.010*"work" + 0.010*"like" + 0.010*"login"'),
 (3,
  '0.025*"doctor" + 0.022*"time" + 0.012*"need" + 0.011*"info" + 0.010*"search" + 0.010*"inform" + 0.010*"help" + 0.009*"check" + 0.009*"care" + 0.009*"appoint"'),
 (4,
  '0.025*"easi" + 0.024*"inform" + 0.020*"great" + 0.020*"love" + 0.018*"access" + 0.016*"medic" + 0.015*"work" + 0.013*"like" + 0.013*"use" + 0.012*"claim"')]

In [41]:
# Try modeling with sklearn

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 5
 
vectorizer = CountVectorizer(min_df = 3, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             tokenizer = tokenizer)
data_vectorized = vectorizer.fit_transform(text)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components = NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 

(4366, 5)
(4366, 5)


In [40]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('work', 672.4719072010878), ('password', 592.9665074870665), ('time', 449.61144151058636), ('login', 417.207743123129), ('just', 359.5437617609089), ('updat', 323.11602464313785), ('tri', 311.9434961497887), ('use', 270.6504073374748), ('past', 260.3578903446355), ('open', 233.5431735162286)]
Topic 1:
[('work', 288.4787137524516), ('time', 262.59416479116635), ('error', 239.21731245153515), ('websit', 224.47450894285768), ('just', 195.35436646432998), ('load', 184.24503777822886), ('page', 144.94574204157092), ('messag', 142.4460061262463), ('mobil', 125.4568116905868), ('aetna', 124.42100990209543)]
Topic 2:
[('doctor', 574.8117837902267), ('need', 510.3904064616173), ('medic', 435.51248284127814), ('easi', 427.54461571079395), ('great', 427.1485747136153), ('love', 418.6143207786398), ('like', 407.633332521176), ('inform', 395.12643788028663), ('appoint', 387.5851844431074), ('use', 311.9638970597561)]
Topic 3:
[('card', 659.5366368734651), ('claim', 395.0528401

In [43]:
# Plotting results
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()


In [46]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(text))

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
text_font_size="8pt", text_color="#555555",
source=source, text_align='center')
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True);



In [47]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True);

In [50]:
vectorizer = CountVectorizer(min_df = 3, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             tokenizer = tokenizer)
data_vectorized = vectorizer.fit_transform(text)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components = NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 

(4366, 10)


In [51]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Let’s interpret the topic visualization. Notice how topics are shown on the left while words are on the right. Here are the main things you should consider:

- Larger topics are more frequent in the corpus.
- Topics closer together are more similar, topics further apart are less similar.
- When you select a topic, you can see the most representative words for the selected topic. This measure can be a combination of how frequent or how discriminant the word is. You can adjust the weight of each property using the slider.
- Hovering over a word will adjust the topic sizes according to how representative the word is for the topic.

Source: <https://nlpforhackers.io/topic-modeling/>