## Voice of Customer Analysis

In [45]:
# Importing Required Libraries
import pandas as pd
import re
import string
import nltk
import numpy as np

from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

import pyLDAvis
from pyLDAvis import gensim as gensim_vis
import warnings
# Ignore every warning category for the rest of the session so that library
# warnings do not clutter the cell outputs.
warnings.simplefilter('ignore')

# Put pyLDAvis into notebook mode.
pyLDAvis.enable_notebook()

# Seed NumPy's global random number generator. The LDA models further down
# additionally pass their own random_state.
np.random.seed(100)

#### 1. Read the .csv file using Pandas. Take a look at the top few records.


In [4]:
reviews_data = pd.read_csv("dataset/K8 Reviews v0.2.csv")

In [5]:
reviews_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14675 entries, 0 to 14674
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  14675 non-null  int64 
 1   review     14675 non-null  object
dtypes: int64(1), object(1)
memory usage: 229.4+ KB


In [6]:
reviews_data.head(10)

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...
5,0,Only I'm telling don't buyI'm totally disappoi...
6,1,"Phone is awesome. But while charging, it heats..."
7,0,The battery level has worn down
8,0,It's over hitting problems...and phone hanging...
9,0,A lot of glitches dont buy this thing better g...


#### 2. Normalize casings for the review text and extract the text into a list for easier manipulation.

In [8]:
# Let's do some text pre-processing
# Shared objects and handler functions for text preprocessing

# Tokenizer used by clean_punct() to split a review into tokens.
token = ToktokTokenizer()
# NOTE(review): `lemma` is not referenced by any cell visible in this notebook;
# the lemmatization step creates its own WordNetLemmatizer (`wnl`).
lemma = WordNetLemmatizer()
# nltk.download("wordnet")
# Characters deleted by clean_punct(): ASCII punctuation (hyphen and underscore
# are NOT in this list, so they survive) followed by the digits 0-9.
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~0123456789'
# NOTE(review): `stop_words` is not referenced by any cell visible in this
# notebook; the stop-word removal step builds its own list (`sw`).
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Lower-case a review and normalise contractions, markup and whitespace.

    The text is lower-cased, a fixed list of regex substitutions is applied in
    order (English contractions are expanded, carriage returns and the HTML
    table tags <td>, </td>, <tr>, </tr> are replaced by a space), every run of
    whitespace is collapsed to a single space, and leading/trailing spaces are
    stripped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # The rules are applied top to bottom and the order is significant:
    #   * "'scuse" must come before the generic "'s" rule, otherwise the
    #     apostrophe-s is consumed first and the "'scuse" rule can never match.
    #   * "can't" must come before the generic "n't" rule.
    # All patterns are raw strings so that regex escapes such as \s are not
    # interpreted (and warned about) as Python string escapes.
    substitutions = (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        # An apostrophe directly followed by a newline becomes a space.
        (r"'\n", " "),
        (r"\r", " "),
        (r"<td>", " "),
        (r"</td>", " "),
        (r"<tr>", " "),
        (r"</tr>", " "),
        # An apostrophe directly followed by a non-breaking space becomes a space.
        (r"'\xa0", " "),
        # Collapse every run of whitespace into one space.
        (r"\s+", " "),
    )

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip(' ')

def remove_emoji(text):
    """Delete every run of characters that falls in the emoji code-point ranges.

    Note that the last range (U+24C2 .. U+1F251) is very wide: besides enclosed
    characters it also matches every other code point between those two
    bounds, so such characters are removed as well.
    """
    code_point_ranges = (
        (u"\U0001F600", u"\U0001F64F"),  # emoticons
        (u"\U0001F300", u"\U0001F5FF"),  # symbols & pictographs
        (u"\U0001F680", u"\U0001F6FF"),  # transport & map symbols
        (u"\U0001F1E0", u"\U0001F1FF"),  # flags
        (u"\U00002702", u"\U000027B0"),
        (u"\U000024C2", u"\U0001F251"),
    )
    # Assemble a single character class "[a-bc-d...]+" from the ranges above.
    char_class = "".join(low + "-" + high for low, high in code_point_ranges)
    return re.sub("[" + char_class + "]+", r'', text, flags=re.UNICODE)

def strip_list_noempty(mylist):
    """Return the items of `mylist` with whitespace stripped and blanks dropped.

    Items that have a `strip` method are stripped; all other items are passed
    through unchanged. Any item equal to the empty string (after stripping) is
    left out of the returned list.
    """
    kept = []
    for entry in mylist:
        if hasattr(entry, 'strip'):
            entry = entry.strip()
        if entry != '':
            kept.append(entry)
    return kept

def clean_punct(text):
    """Remove the characters listed in `punct` from every token of `text`.

    The text is split with the module-level `token` tokenizer, every character
    contained in the module-level `punct` string is deleted from each token,
    tokens that end up empty are discarded, and the remaining tokens are joined
    with single spaces.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The space-joined tokens without the `punct` characters.
    """
    # re.escape makes sure characters such as "]" or "\" are treated literally
    # inside the character class.
    strip_chars = re.compile('[%s]' % re.escape(punct))
    stripped_tokens = [strip_chars.sub('', word) for word in token.tokenize(text)]
    return ' '.join(map(str, strip_list_noempty(stripped_tokens)))

In [9]:
# Lower-case every review and run the contraction / markup clean-up on it
reviews_data.loc[:, 'review'] = reviews_data['review'].apply(clean_text)

In [10]:
# Strip emoji characters from every review
reviews_data.loc[:, 'review'] = reviews_data['review'].apply(remove_emoji)

In [11]:
# Delete punctuation characters and digits from every review
reviews_data.loc[:, 'review'] = reviews_data['review'].apply(clean_punct)

In [12]:
# Store the number of whitespace-separated words of each review
reviews_data['number_of_words'] = reviews_data['review'].apply(
    lambda review: len(review.split())
)

In [13]:
reviews_data.head()

Unnamed: 0,sentiment,review,number_of_words
0,1,good but need updates and improvements,6
1,0,worst mobile i have bought ever battery is dra...,88
2,1,when i will get my cash back its already january,10
3,1,good,1
4,0,the worst phone everthey have changed the last...,27


In [14]:
# Keep only reviews with more than two words; one- and two-word reviews are dropped
has_enough_words = reviews_data['number_of_words'] > 2
required_dataset = reviews_data[has_enough_words]

In [15]:
required_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12118 entries, 0 to 14674
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sentiment        12118 non-null  int64 
 1   review           12118 non-null  object
 2   number_of_words  12118 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 378.7+ KB


In [16]:
# Extract the review text column as a plain Python list
reviews = required_dataset['review'].tolist()
type(reviews)

list

In [17]:
reviews[:5]

['good but need updates and improvements',
 'worst mobile i have bought ever battery is draining like hell backup is only to hours with internet uses even if i put mobile idle its getting dischargedthis is biggest lie from amazon amp lenove which is not at all expected they are making full by saying that battery is mah amp booster charger is fake it takes at least to hours to be fully chargeddo not know how lenovo will survive by making full of usplease don t go for this else you will regret like me',
 'when i will get my cash back its already january',
 'the worst phone everthey have changed the last phone but the problem is still same and the amazon is not returning the phone highly disappointing of amazon',
 'only i am telling do not buyi am totally disappointedpoor batterypoor camerawaste of money']

#### 3. Tokenize the reviews using NLTK's `word_tokenize` function.

In [18]:
# One token list per review, in the same order as `reviews`
review_tokens = list(map(word_tokenize, reviews))

In [19]:
type(review_tokens)

list

In [21]:
print(review_tokens[:5])

[['good', 'but', 'need', 'updates', 'and', 'improvements'], ['worst', 'mobile', 'i', 'have', 'bought', 'ever', 'battery', 'is', 'draining', 'like', 'hell', 'backup', 'is', 'only', 'to', 'hours', 'with', 'internet', 'uses', 'even', 'if', 'i', 'put', 'mobile', 'idle', 'its', 'getting', 'dischargedthis', 'is', 'biggest', 'lie', 'from', 'amazon', 'amp', 'lenove', 'which', 'is', 'not', 'at', 'all', 'expected', 'they', 'are', 'making', 'full', 'by', 'saying', 'that', 'battery', 'is', 'mah', 'amp', 'booster', 'charger', 'is', 'fake', 'it', 'takes', 'at', 'least', 'to', 'hours', 'to', 'be', 'fully', 'chargeddo', 'not', 'know', 'how', 'lenovo', 'will', 'survive', 'by', 'making', 'full', 'of', 'usplease', 'don', 't', 'go', 'for', 'this', 'else', 'you', 'will', 'regret', 'like', 'me'], ['when', 'i', 'will', 'get', 'my', 'cash', 'back', 'its', 'already', 'january'], ['the', 'worst', 'phone', 'everthey', 'have', 'changed', 'the', 'last', 'phone', 'but', 'the', 'problem', 'is', 'still', 'same', 'and

#### 4. Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

In [27]:
# Tag all reviews with one call. NLTK recommends pos_tag_sents() over calling
# pos_tag() per sentence when tagging more than one sentence; the result is the
# same list of [(token, tag), ...] lists, one per review.
review_postags = nltk.pos_tag_sents(review_tokens)
print(review_postags[:2])

[[('good', 'JJ'), ('but', 'CC'), ('need', 'VBP'), ('updates', 'NNS'), ('and', 'CC'), ('improvements', 'NNS')], [('worst', 'JJS'), ('mobile', 'NN'), ('i', 'NN'), ('have', 'VBP'), ('bought', 'VBN'), ('ever', 'RB'), ('battery', 'NN'), ('is', 'VBZ'), ('draining', 'VBG'), ('like', 'IN'), ('hell', 'NN'), ('backup', 'NN'), ('is', 'VBZ'), ('only', 'RB'), ('to', 'TO'), ('hours', 'NNS'), ('with', 'IN'), ('internet', 'NN'), ('uses', 'NNS'), ('even', 'RB'), ('if', 'IN'), ('i', 'JJ'), ('put', 'VBP'), ('mobile', 'JJ'), ('idle', 'NN'), ('its', 'PRP$'), ('getting', 'VBG'), ('dischargedthis', 'NN'), ('is', 'VBZ'), ('biggest', 'JJS'), ('lie', 'NN'), ('from', 'IN'), ('amazon', 'NN'), ('amp', 'NN'), ('lenove', 'NN'), ('which', 'WDT'), ('is', 'VBZ'), ('not', 'RB'), ('at', 'IN'), ('all', 'DT'), ('expected', 'VBN'), ('they', 'PRP'), ('are', 'VBP'), ('making', 'VBG'), ('full', 'JJ'), ('by', 'IN'), ('saying', 'VBG'), ('that', 'DT'), ('battery', 'NN'), ('is', 'VBZ'), ('mah', 'JJ'), ('amp', 'JJ'), ('booster', 'N

#### 5. For the topic model, we want to include only nouns.
    1. Find out all the POS tags that correspond to nouns.
    2. Limit the data to only terms with these tags.

In [28]:
# POS tags treated as nouns
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

# For every tagged review keep only the words whose tag is one of `noun_tags`
review_postags_nouns = [
    [word for word, tag in tagged_review if tag in noun_tags]
    for tagged_review in review_postags
]

print(review_postags_nouns[:5])

[['updates', 'improvements'], ['mobile', 'i', 'battery', 'hell', 'backup', 'hours', 'internet', 'uses', 'idle', 'dischargedthis', 'lie', 'amazon', 'amp', 'lenove', 'battery', 'booster', 'charger', 'hours', 'don', 't'], ['i', 'cash'], ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'], ['camerawaste', 'money']]


#### 6. Lemmatize. 
    1. Different forms of the terms need to be treated as one.
    2. No need to provide POS tag to lemmatizer for now.

In [29]:
wnl = WordNetLemmatizer()

# Lemmatize every retained word as a noun, review by review
review_postags_nouns_lemmed = [
    [wnl.lemmatize(noun, pos='n') for noun in nouns]
    for nouns in review_postags_nouns
]

print(review_postags_nouns_lemmed[:5])

[['update', 'improvement'], ['mobile', 'i', 'battery', 'hell', 'backup', 'hour', 'internet', 'us', 'idle', 'dischargedthis', 'lie', 'amazon', 'amp', 'lenove', 'battery', 'booster', 'charger', 'hour', 'don', 't'], ['i', 'cash'], ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'], ['camerawaste', 'money']]


#### 7. Remove stopwords and punctuation (if there are any). 

In [31]:
sw = stopwords.words("english")
punc = list(string.punctuation)

# A set gives constant-time membership checks; the filter below runs once per
# token of every review.
custom_sw = set(sw + punc)

# Drop stop words, punctuation marks and single-character tokens. A review
# with no tokens simply yields an empty list, so no special case is needed.
review_preprocessed = [
    [term for term in item if len(term) > 1 and term not in custom_sw]
    for item in review_postags_nouns_lemmed
]

print(review_preprocessed[:5])

[['update', 'improvement'], ['mobile', 'battery', 'hell', 'backup', 'hour', 'internet', 'us', 'idle', 'dischargedthis', 'lie', 'amazon', 'amp', 'lenove', 'battery', 'booster', 'charger', 'hour'], ['cash'], ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'], ['camerawaste', 'money']]


#### 8. Create a topic model using LDA on the cleaned up data with 12 topics.
    1. Print out the top terms for each topic.
    2. What is the coherence of the model with the c_v metric?

In [33]:
dictionary = Dictionary(review_preprocessed)
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [34]:
# Prune the vocabulary in place (thresholds as described in the gensim docs):
#   no_below=5  -> drop tokens that appear in fewer than 5 reviews
#   no_above=.8 -> drop tokens that appear in more than 80% of the reviews
#   keep_n=None -> keep all remaining tokens (no cap on vocabulary size)
dictionary.filter_extremes(no_below=5, no_above=.8 ,keep_n=None)
print(dictionary)

Dictionary<1118 unique tokens: ['improvement', 'update', 'amazon', 'amp', 'backup']...>


In [35]:
# Bag-of-words representation of every preprocessed review
bow_text = list(map(dictionary.doc2bow, review_preprocessed))

In [36]:
bow_text[:5]

[[(0, 1), (1, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 2),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(15, 1)],
 [(2, 2), (16, 3), (17, 1)],
 [(18, 1)]]

In [39]:
sample_bow = bow_text[20]

# Each entry is a (token id, count) pair; look the id up in the dictionary
for word_id, count in sample_bow:
    print("Word '{}' comes {} times in this sample review".format(dictionary[word_id], count))

Word 'battery' comes 1 times in this sample review
Word 'phone' comes 1 times in this sample review
Word 'lenovo' comes 1 times in this sample review
Word 'product' comes 2 times in this sample review
Word 'camera' comes 1 times in this sample review


In [40]:
# Fit a 12-topic LDA model on the bag-of-words corpus with gensim
lda_model = gensim.models.LdaMulticore(
    corpus=bow_text,
    num_topics=12,
    id2word=dictionary,
    random_state=1,
    passes=50,
)

In [41]:
# Show the top terms of every topic of the 12-topic model
for topic_id, top_terms in lda_model.print_topics(-1):
    print("\nTopic: {} \nWords: {}".format(topic_id, top_terms))


Topic: 0 
Words: 0.328*"mobile" + 0.054*"hai" + 0.039*"box" + 0.020*"earphone" + 0.018*"ho" + 0.016*"hi" + 0.016*"bill" + 0.014*"note" + 0.014*"lenovo" + 0.014*"headset"

Topic: 1 
Words: 0.311*"phone" + 0.052*"call" + 0.025*"lenovo" + 0.025*"note" + 0.025*"option" + 0.021*"issue" + 0.017*"budget" + 0.015*"app" + 0.015*"feature" + 0.013*"apps"

Topic: 2 
Words: 0.104*"issue" + 0.050*"network" + 0.046*"device" + 0.035*"update" + 0.034*"sim" + 0.030*"use" + 0.028*"time" + 0.025*"day" + 0.023*"phone" + 0.019*"lenovo"

Topic: 3 
Words: 0.188*"money" + 0.082*"waste" + 0.073*"value" + 0.067*"phone" + 0.055*"handset" + 0.048*"month" + 0.043*"product" + 0.029*"superb" + 0.027*"worth" + 0.024*"item"

Topic: 4 
Words: 0.081*"charger" + 0.060*"speaker" + 0.048*"phone" + 0.046*"note" + 0.033*"turbo" + 0.022*"screen" + 0.020*"time" + 0.020*"lenovo" + 0.019*"hour" + 0.019*"power"

Topic: 5 
Words: 0.228*"camera" + 0.080*"phone" + 0.041*"mode" + 0.033*"battery" + 0.029*"quality" + 0.024*"depth" + 0.

In [43]:
# Score the 12-topic model with the c_v coherence measure
coherence_score = CoherenceModel(
    model=lda_model,
    texts=review_preprocessed,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_score.get_coherence()
print('Coherence Score for this LDA model is: ', coherence_lda)

Coherence Score for this LDA model is:  0.5949711742750197


#### 9. Analyze the topics through the business lens.
    1. Determine which of the topics can be combined.

In [46]:
# Prepare the pyLDAvis view of the 12-topic model and save it as an HTML file
LDAvis_prepared = gensim_vis.prepare(lda_model, bow_text, dictionary)
pyLDAvis.save_html(LDAvis_prepared, 'LDA_model_vis.html')

> The LDA visualization shows a high overlap between several topics. Hence, the following topics can be combined:

1. Topics 3,4,7,9, 10
2. Topics 2,5,6,8
3. Topic 1
4. Topic 12
5. Topic 11

#### 10. Create topic model using LDA with what you think is the optimal number of topics
    1. What is the coherence of the model?

In [49]:
# Fit a 5-topic LDA model on the same corpus and show the top terms per topic
lda_model1 = gensim.models.LdaMulticore(
    corpus=bow_text,
    num_topics=5,
    id2word=dictionary,
    random_state=1,
    passes=50,
)

for topic_id, top_terms in lda_model1.print_topics(-1):
    print("\nTopic: {} \nWords: {}".format(topic_id, top_terms))


Topic: 0 
Words: 0.174*"camera" + 0.091*"quality" + 0.074*"mobile" + 0.024*"feature" + 0.024*"money" + 0.017*"hai" + 0.017*"value" + 0.017*"performance" + 0.016*"price" + 0.013*"display"

Topic: 1 
Words: 0.293*"phone" + 0.036*"price" + 0.030*"note" + 0.025*"feature" + 0.021*"lenovo" + 0.015*"range" + 0.013*"screen" + 0.012*"glass" + 0.011*"camera" + 0.011*"service"

Topic: 2 
Words: 0.064*"issue" + 0.032*"network" + 0.031*"time" + 0.030*"problem" + 0.028*"device" + 0.024*"mobile" + 0.023*"lenovo" + 0.021*"update" + 0.020*"call" + 0.019*"note"

Topic: 3 
Words: 0.181*"product" + 0.093*"problem" + 0.041*"amazon" + 0.038*"heating" + 0.032*"money" + 0.026*"service" + 0.023*"issue" + 0.023*"month" + 0.023*"waste" + 0.021*"battery"

Topic: 4 
Words: 0.141*"battery" + 0.057*"phone" + 0.056*"camera" + 0.036*"backup" + 0.027*"performance" + 0.025*"charger" + 0.024*"hour" + 0.020*"day" + 0.018*"life" + 0.017*"speaker"


In [50]:
# Score the 5-topic model with the c_v coherence measure
coherence_score1 = CoherenceModel(
    model=lda_model1,
    texts=review_preprocessed,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda1 = coherence_score1.get_coherence()
print('Coherence Score for new LDA model_1 is: ', coherence_lda1)

Coherence Score for new LDA model_1 is:  0.632227583634284


> The model with 5 topics gives a better coherence score (0.632) than the 12-topic model above (0.595).

#### 11. The business should be able to interpret the topics.
    1. Name each of the identified topics.
    2. Create a table with the topic name and the top 10 terms in each to present to the business.

> The following Topics could be inferred from the LDA model

* Topic 1: Camera Related
* Topic 2: Pricing Related
* Topic 3: Network and Call Related
* Topic 4: Product Issues like Service, Heating
* Topic 5: Battery Related

In [51]:
# Take the top-10 words of every topic straight from the model as
# (word, probability) pairs. Parsing the formatted print_topics() string and
# keeping only alphabetic characters would mangle any term that contains
# digits, hyphens or underscores.
topic_words = {
    'Topic_' + str(topic_id + 1): [word for word, _ in word_probs]
    for topic_id, word_probs in lda_model1.show_topics(
        num_topics=-1, num_words=10, formatted=False
    )
}

# One column per topic (numbered from 1), one row per rank (Word_1 .. Word_10)
topic_table = pd.DataFrame(topic_words)
topic_table.index = ['Word_' + str(i + 1) for i in range(topic_table.shape[0])]
topic_table

Unnamed: 0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
Word_1,camera,phone,issue,product,battery
Word_2,quality,price,network,problem,phone
Word_3,mobile,note,time,amazon,camera
Word_4,feature,feature,problem,heating,backup
Word_5,money,lenovo,device,money,performance
Word_6,hai,range,mobile,service,charger
Word_7,value,screen,lenovo,issue,hour
Word_8,performance,glass,update,month,day
Word_9,price,camera,call,waste,life
Word_10,display,service,note,battery,speaker


In [52]:
# Prepare the pyLDAvis view of the 5-topic model and save it as an HTML file
LDAvis_prepared1 = gensim_vis.prepare(lda_model1, bow_text, dictionary)
pyLDAvis.save_html(LDAvis_prepared1, 'LDA_model_topic_5.html')