In [12]:
import pandas as pd
import numpy as np
import re, string

import nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
  
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora

from collections import Counter
import string

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

import zipfile
import os

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Data pre-processing

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have fewer than 3 characters are removed.
- All stopwords are removed.
 - Words are __lemmatized__ — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are __stemmed__ — words are reduced to their root form.


In [125]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text.
    Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3.
    """
    
    text=text.lower() #Convert everything to lowercase
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words =  nltk.word_tokenize(nopunct) # tokenizing words
    words = [w for w in words if len(w) > 3]  # Drop words less than length 3
    words = [w for w in words if w not in ENGLISH_STOP_WORDS] # Removes stop words using SciKit-Learn's ENGLISH_STOP_WORDS set.
    words = [w.encode('ascii','ignore') for w in words]
    return words

def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    words = [w.decode('ascii','ignore') for w in words]
    stemmed = [stemmer.stem(w) for w in words]
    return stemmed

def lemmatize (tokens):
    '''
    Given a list of tokens/words, return a list of lemmatized words 
    i.e. words in third person are changed to first person and verbs in past,
    and future tenses are changed into present.
    '''
    text = " ".join(tokens)
    lemmatizer = WordNetLemmatizer()
    
    return lemmatizer.lemmatize(text).split()
    
    
def tokenizer(text):
    
    return stemwords(tokenize(text))


In [126]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

In [127]:
lemmatize(stemwords(tokenize(doc1)))

['sugar', 'consum', 'sister', 'like', 'sugar', 'father']

In [128]:
tokenizer(doc1)

['sugar', 'consum', 'sister', 'like', 'sugar', 'father']

In [129]:
df_reviews = pd.read_csv('data/appstore_all_reviews.csv')

In [62]:
df_reviews.head()

Unnamed: 0.1,Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date,review_id
0,1,Microsoft HealthVault,546835834,Lab Corp blood results,liver transplant patient,https://itunes.apple.com/us/reviews/id782157250,0,0,1,Lab Corp had my weekly blood work results on t...,2018-08-08 05:03:25,https://itunes.apple.com/us/reviews/id78215725...
1,2,Microsoft HealthVault,546835834,What happened,Bistline,https://itunes.apple.com/us/reviews/id335994415,0,0,3,This app used to be my favorite. It would sync...,2018-06-27 14:07:45,https://itunes.apple.com/us/reviews/id33599441...
2,3,Microsoft HealthVault,546835834,Great idea,Gdb&&@,https://itunes.apple.com/us/reviews/id216415940,0,0,5,I have many yrs worth of data now stored. It i...,2018-06-27 10:26:12,https://itunes.apple.com/us/reviews/id21641594...
3,5,Microsoft HealthVault,546835834,Bugs after 11/14/17 update,Agw54,https://itunes.apple.com/us/reviews/id115506088,0,0,1,Worked ok until 11/14/17 update. Now you can'...,2018-03-19 06:34:58,https://itunes.apple.com/us/reviews/id11550608...
4,6,Microsoft HealthVault,546835834,HeathVault,rlbarkley,https://itunes.apple.com/us/reviews/id202749603,0,0,2,I am disappointed with App because MS started ...,2018-02-05 23:30:49,https://itunes.apple.com/us/reviews/id20274960...


In [130]:
reviews_text = df_reviews['text']

In [131]:
text = list(reviews_text)

In [132]:
text

['Lab Corp had my weekly blood work results on the last app Up two months ago earlier in 2018, since they forced everyone to open up the HealthVault app to get results I have not been able to get one result and I just found out that they do not supply results to iPhones five, six and seven. After spending hours checking I found out that you need an iPhone 8 or 10 and that is ridiculous, it had been working for three years just fine and I can guarantee you that regular hard-working people cannot afford the latest phones. This is a terrible new system which excludes The majority of regular people with regular phones. I am a transplant patient and I need to keep up with my blood work results frequently since many levels such as my potassium could cause me heart problems from one week to another. I hope someone gets back in touch with me or I will be switching lab companies who can supply a simple task as supplying blood work results which I pay for.',
 'This app used to be my favorite. It

In [133]:
# Prepare the corpus
clean_text = [tokenizer(t) for t in text]

In [148]:
text[0]

'Lab Corp had my weekly blood work results on the last app Up two months ago earlier in 2018, since they forced everyone to open up the HealthVault app to get results I have not been able to get one result and I just found out that they do not supply results to iPhones five, six and seven. After spending hours checking I found out that you need an iPhone 8 or 10 and that is ridiculous, it had been working for three years just fine and I can guarantee you that regular hard-working people cannot afford the latest phones. This is a terrible new system which excludes The majority of regular people with regular phones. I am a transplant patient and I need to keep up with my blood work results frequently since many levels such as my potassium could cause me heart problems from one week to another. I hope someone gets back in touch with me or I will be switching lab companies who can supply a simple task as supplying blood work results which I pay for.'

In [147]:
clean_text[0]

['corp',
 'weekli',
 'blood',
 'work',
 'result',
 'month',
 'earlier',
 'forc',
 'open',
 'healthvault',
 'result',
 'abl',
 'result',
 'just',
 'suppli',
 'result',
 'iphon',
 'seven',
 'spend',
 'hour',
 'check',
 'need',
 'iphon',
 'ridicul',
 'work',
 'year',
 'just',
 'fine',
 'guarante',
 'regular',
 'hard',
 'work',
 'peopl',
 'afford',
 'latest',
 'phone',
 'terribl',
 'exclud',
 'major',
 'regular',
 'peopl',
 'regular',
 'phone',
 'transplant',
 'patient',
 'need',
 'blood',
 'work',
 'result',
 'frequent',
 'level',
 'potassium',
 'caus',
 'heart',
 'problem',
 'week',
 'hope',
 'get',
 'touch',
 'switch',
 'compani',
 'suppli',
 'simpl',
 'task',
 'suppli',
 'blood',
 'work',
 'result']

In [134]:
#Creating the dictionary out of corpus
dictionary = corpora.Dictionary(clean_text)

In [135]:
len(dictionary)

3930

In [136]:
count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count +=1
    if count >10:
        break
    

0 abl
1 afford
2 blood
3 caus
4 check
5 compani
6 corp
7 earlier
8 exclud
9 fine
10 forc


In [139]:
# Filter extremes:
# Filter tokens that appear in less than 10 doc-s
# Filter out tokens that appear in more than 50% of docs
dictionary.filter_extremes(no_below=3, no_above=0.5)

In [140]:
len(dictionary)

1346

In [141]:
# Convert list of documents (corpus) to Doc Term Matrix
doc_term_mx = [dictionary.doc2bow(doc) for doc in clean_text]

In [144]:
lda = gensim.models.ldamodel.LdaModel(corpus=doc_term_mx, \
                                      id2word=dictionary, num_topics=10, \
                                      update_every=1, chunksize=100, passes=50)

In [145]:
lda.print_topics()

[(0,
  '0.092*"claim" + 0.083*"work" + 0.049*"download" + 0.047*"updat" + 0.040*"doesn" + 0.039*"version" + 0.026*"issu" + 0.026*"review" + 0.023*"open" + 0.023*"time"'),
 (1,
  '0.049*"login" + 0.046*"time" + 0.035*"work" + 0.033*"just" + 0.030*"use" + 0.025*"screen" + 0.024*"updat" + 0.024*"useless" + 0.023*"load" + 0.017*"support"'),
 (2,
  '0.044*"love" + 0.044*"easi" + 0.039*"inform" + 0.037*"great" + 0.036*"have" + 0.030*"medic" + 0.030*"need" + 0.030*"doctor" + 0.025*"help" + 0.022*"access"'),
 (3,
  '0.068*"sign" + 0.054*"reason" + 0.050*"button" + 0.043*"navig" + 0.042*"comput" + 0.038*"long" + 0.035*"page" + 0.030*"quickli" + 0.024*"design" + 0.020*"set"'),
 (4,
  '0.077*"appoint" + 0.054*"prescript" + 0.053*"like" + 0.044*"good" + 0.031*"health" + 0.031*"featur" + 0.022*"basic" + 0.022*"schedul" + 0.021*"better" + 0.020*"make"'),
 (5,
  '0.060*"doctor" + 0.052*"view" + 0.044*"search" + 0.037*"care" + 0.032*"provid" + 0.026*"number" + 0.025*"function" + 0.020*"user" + 0.020*"