In [131]:
reset -fs

In [132]:
import pandas as pd
import numpy as np
import re, string

import nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
  
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora

from collections import Counter
import string

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

import zipfile
import os

import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mariavasilenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data pre-processing

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have fewer than 3 characters are removed.
- All stopwords are removed.
 - Words are __lemmatized__ — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are __stemmed__ — words are reduced to their root form.


In [269]:
# Create the list of stopwords and extend it 

stopwords = list(ENGLISH_STOP_WORDS)
# Adding "just" after running basic LDA model
newStopWords = ['just', ""]
stopwords.extend(newStopWords)


def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text.
    Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3.
    """
    
    text=text.lower() #Convert everything to lowercase
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words =  nltk.word_tokenize(nopunct) # tokenizing words
    words = [w for w in words if len(w) > 3]  # Drop words less than length 3
    words[:] = [w for w in words if w != '']
    words = [w for w in words if w not in stopwords] # Removes stop words using SciKit-Learn's ENGLISH_STOP_WORDS set.
    words = [w.encode('ascii','ignore') for w in words]
    return words

def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    words = [w.decode('ascii','ignore') for w in words]
    stemmed = [stemmer.stem(w) for w in words]
    return stemmed

def lemmatize_stemming(words):
    words = [w.decode('ascii','ignore') for w in words]
    text = " ".join(words)
    stemmer = PorterStemmer()
    stemmed = stemmer.stem(WordNetLemmatizer().lemmatize(text))
    return stemmed.split()

def lemmatize (tokens):
    '''
    Given a list of tokens/words, return a list of lemmatized words 
    i.e. words in third person are changed to first person and verbs in past,
    and future tenses are changed into present.
    '''
    text = " ".join(tokens)
    lemmatizer = WordNetLemmatizer()
    
    return lemmatizer.lemmatize(text).split()
    
    
def tokenizer(text):
    
    return stemwords(tokenize(text))


In [273]:
df_reviews = pd.read_csv('data/appstore_googleplay_all_reviews.csv')

In [274]:
df_reviews.tail()

Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date
6113,Google Fit: Health and Activity Tracking,com.google.android.apps.fitness,,Mohit G,107526919134556209809,,,1,This app does not work property on my one plus...,2018-11-18 15:03:20
6114,Google Fit: Health and Activity Tracking,com.google.android.apps.fitness,,sagar suryawanshi,101956979010176828886,,,1,I want to add leadboard How?,2018-11-18 14:54:32
6115,Medical Records App,com.cliniconline,,Abod Osama,115080537418668508998,,,4,A great useful application but how can I trans...,2018-11-19 06:49:38
6116,Medical Records App,com.cliniconline,,Agnes Novita,110614731832618616830,,,5,"Simple, easy, and useful",2018-11-18 03:48:23
6117,Medical Records App,com.cliniconline,,Alfredo Baruc,116943082422817884463,,,5,Very easy to use and is what I am looking for ...,2018-11-13 05:14:25


In [275]:
len(df_reviews)

6118

In [276]:
# Create a dataframe with negative reviews
df_neg_reviews = df_reviews[df_reviews['rating']<=3]

In [277]:
#Create dataframe with positive reviews
df_pos_reviews = df_reviews[df_reviews['rating']>3]

In [278]:
len(df_neg_reviews)

3735

In [279]:
len(df_pos_reviews)

2383

In [280]:
assert len(df_neg_reviews) + len(df_pos_reviews) == len(df_reviews)

In [281]:
# Extract text of negative reviews
neg_reviews_text = df_neg_reviews['text']
neg_text = list(neg_reviews_text)

In [282]:
# Extract the text of positive reviews
pos_reviews_text = df_pos_reviews['text']
pos_text = list(pos_reviews_text)

In [283]:
neg_clean_text = [tokenizer(n_t) for n_t in neg_text]

In [284]:
len(neg_clean_text)

3735

In [285]:
pos_clean_text = [tokenizer(p_t) for p_t in pos_text]

In [286]:
len(pos_clean_text)

2383

In [287]:
# Adding bigrams and trigrams to negative reviews

from gensim.models.phrases import Phrases, Phraser
# Add bigrams and trigrams to docs (only ones that appear 10 times or more).
bigram = Phraser(Phrases(neg_clean_text, min_count=10))
trigram = Phrases(bigram[neg_clean_text])

for idx in range(len(neg_clean_text)):
    for token in bigram[neg_clean_text[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            neg_clean_text[idx].append(token)
    for token in trigram[neg_clean_text[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            neg_clean_text[idx].append(token)



In [288]:
# Adding bigrams and trigrams to positive reviews

from gensim.models.phrases import Phrases, Phraser
# Add bigrams and trigrams to docs (only ones that appear 10 times or more).
bigram = Phraser(Phrases(pos_clean_text, min_count=10))
trigram = Phrases(bigram[pos_clean_text])

for idx in range(len(pos_clean_text)):
    for token in bigram[pos_clean_text[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            pos_clean_text[idx].append(token)
    for token in trigram[pos_clean_text[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            pos_clean_text[idx].append(token)

In [None]:
# Remove empty tokens

In [302]:
for item in neg_clean_text:
    item[:] = [elem for elem in item if elem != ""]


In [305]:
for item in pos_clean_text:
    item[:] = [elem for elem in item if elem != ""]


In [308]:
#Creating the dictionary out of corpus of negative reviews
neg_dictionary = corpora.Dictionary(neg_clean_text)

In [309]:
# Creating the dictionary out of corpus of positive reviews
pos_dictionary = corpora.Dictionary(pos_clean_text)

In [306]:
for idx, item in enumerate(pos_clean_text):
        if '' in item:
            print(idx, "YES")


In [310]:
len(neg_dictionary)

3867

In [311]:
len(pos_dictionary)

2845

In [312]:
# Hyperparameters

NUM_NEG_TOPICS =5 # Number of topics in negative reviews
NUM_POS_TOPICS =5 # Number of topics in positive reviews
# PArameters used in filtering the dictionaries
MIN_DF= 2 # remove tokens that appear in less than MIN_DF docs
MAX_DF = 0.5 # Remove tokens that appear in more than MAX_DF * 100% of docs 

In [313]:
# Filter extremes:
# Filter tokens that appear in less than 2 doc-s
# Filter out tokens that appear in more than 30% of docs

neg_dictionary.filter_extremes(no_below=MIN_DF, no_above= MAX_DF)
pos_dictionary.filter_extremes(no_below=MIN_DF, no_above= MAX_DF)


In [314]:
len(neg_dictionary)

2298

In [315]:
len(pos_dictionary)

1633

In [316]:
neg_doc_term_mx = [neg_dictionary.doc2bow(doc) for doc in neg_clean_text]

In [317]:
pos_doc_term_mx = [pos_dictionary.doc2bow(doc) for doc in pos_clean_text]

In [318]:
# LDA for negative reviews

neg_lda = gensim.models.ldamodel.LdaModel(corpus=neg_doc_term_mx, \
                                      id2word=neg_dictionary, num_topics=NUM_NEG_TOPICS, random_state = 17) \
                                      #, update_every=1, chunksize=100, passes=50)

In [319]:
# LDA for positive reviews

pos_lda = gensim.models.ldamodel.LdaModel(corpus=pos_doc_term_mx, \
                                      id2word=pos_dictionary, num_topics=NUM_POS_TOPICS, random_state = 17) \
                                      #, update_every=1, chunksize=100, passes=50)

In [320]:
# Topics based on negative reviews
neg_lda.print_topics(num_words=20)

[(0,
  '0.017*"card" + 0.016*"insur_card" + 0.014*"updat" + 0.013*"time" + 0.012*"work" + 0.010*"inform" + 0.009*"doe" + 0.009*"tri" + 0.009*"insur" + 0.009*"use" + 0.008*"crash" + 0.008*"version" + 0.007*"phone" + 0.007*"like" + 0.006*"custom_servic" + 0.006*"provid" + 0.006*"abl" + 0.006*"white_screen" + 0.006*"doctor" + 0.006*"iphon"'),
 (1,
  '0.022*"doesn_work" + 0.019*"work" + 0.014*"doesn" + 0.011*"need" + 0.010*"time" + 0.010*"messag" + 0.009*"like" + 0.009*"updat" + 0.008*"email" + 0.008*"make" + 0.007*"use" + 0.007*"card" + 0.007*"doctor" + 0.007*"version" + 0.007*"custom_servic" + 0.006*"data" + 0.006*"test_result" + 0.006*"user" + 0.006*"login" + 0.006*"want"'),
 (2,
  '0.029*"work" + 0.012*"urgent_care" + 0.012*"doctor" + 0.011*"doesn_work" + 0.011*"password" + 0.009*"doesn" + 0.009*"time" + 0.009*"search" + 0.009*"updat" + 0.009*"error_messag" + 0.009*"use" + 0.009*"doesnt_work" + 0.009*"health_insur" + 0.008*"login" + 0.007*"past_password" + 0.007*"useless" + 0.007*"wast

Negative topics:

- Topic 0: insurance card,  ==> TECHNICAL ISSUES: bad CUSTOMER SERVICE, ISSUES WITH PASSWORD
- Topic 1: Issues with messages, e-mails to doctor ==> MESSAGING TO DOCTOR
- Topic 2: URGENT CARE
- Topic 3: APPOINTMENT SCHEDULING
- Topic 4:  issues with LOGIN/PASSWORD

In [321]:
#Visualize negative topics
pyLDAvis.gensim.prepare(neg_lda, neg_doc_term_mx, neg_dictionary, R = 20)

- Topic 1 in pyLDA vis corresponds to Topic 4 in lda print results 
- Topic 2 in pyLDA vis corresponds to Topic 1 in lda print results 
- Topic 3 in pyLDA vis corresponds to Topic 3 in lda print results 
- Topic 4 in pyLDA vis corresponds to Topic 0 in lda print results 
- Topic 5 in pyLDA vis corresponds to Topic 2 in lda print results 

The left panel of the chart presents a global view of topics: how they are related to each other (far apart or inmtersect) and how preevalent they are (the larger the circle the more prevalent is the topic). 

THe right panel shows  the individual terms that are the most useful for interpreting the currently selected topic on the left.  A pair of overlaid bars represent both the corpus-wide frequency ofa given term as well as the topic-specific frequency of the term.

Parameter lambda regulates the _relevance_ of the term, which is used to rank terms within topics. It is regulated by parameter _lambda_.
- If _lambda_ = 1, we'll see familiar ranking of terms in decreasing order of their topic-specific probability
- If _lambda_ = 0, we'll rank words soleley by their _lift_, defined as the ratio of a term’s probability within a topic to its marginal probability across the corpus. In other words, it helps determine the words exclusivity to a particular topic. 

In [324]:
# Topics based on positive reviews
pos_lda.print_topics()

[(0,
  '0.030*"great" + 0.018*"use" + 0.018*"doctor" + 0.018*"need" + 0.017*"test_result" + 0.016*"like" + 0.014*"love" + 0.013*"medic" + 0.013*"inform" + 0.013*"info"'),
 (1,
  '0.020*"user_friendli" + 0.014*"doctor" + 0.013*"great" + 0.011*"like" + 0.011*"inform" + 0.011*"love" + 0.011*"track" + 0.011*"abl" + 0.010*"record" + 0.010*"use"'),
 (2,
  '0.034*"love" + 0.027*"good" + 0.017*"work" + 0.017*"need" + 0.015*"nice" + 0.013*"inform" + 0.013*"like" + 0.013*"great" + 0.013*"easi" + 0.012*"conveni"'),
 (3,
  '0.030*"test_result" + 0.018*"great" + 0.018*"appoint" + 0.018*"doctor" + 0.016*"use" + 0.015*"medic" + 0.015*"result" + 0.014*"make" + 0.014*"help" + 0.013*"time"'),
 (4,
  '0.051*"easi" + 0.027*"medic" + 0.022*"medic_record" + 0.019*"easi_navig" + 0.016*"great" + 0.014*"doctor" + 0.013*"medic_histori" + 0.013*"love" + 0.011*"appoint" + 0.011*"time"')]

__Suggested topics based on positive reviews__
- Topic 0: great, test results ==> FEATURE: access to  TEST RESULTS
- Topic 1: user friendly, great ==> good UX DESIGN
FEATURE: access to MEDICAL RECORDS
- Topic 2: Love, easy to use ==> 
- Topic 3: test results, doctor appointments ==> DOCTOR APPOINTMENTS 
- Topic 4: easy to navigate, medical records ==> access to MEDICAL RECORDS


In [323]:
#Visualize positive topics
pyLDAvis.gensim.prepare(pos_lda, pos_doc_term_mx, pos_dictionary, R = 20)

- Topic 1 in pyLDA vis corresponds to Topic 0 in lda print results 
- Topic 2 in pyLDA vis corresponds to Topic 2 in lda print results 
- Topic 3 in pyLDA vis corresponds to Topic 1 in lda print results 
- Topic 4 in pyLDA vis corresponds to Topic 4 in lda print results 
- Topic 5 in pyLDA vis corresponds to Topic 3 in lda print results 
