In [2]:
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, log_loss

In [3]:
# Read the pickled object stored in alexa_reviews_clean.pkl into `df`.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('alexa_reviews_clean.pkl', mode='rb') as pickle_handle:
    df = pickle.load(pickle_handle)

In [4]:
# Show the first five rows of the loaded frame
df.head(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length,new_reviews,sentiment
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,13,love echo,positive
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,9,love,positive
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,195,sometimes play game answer question correctly ...,positive
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,172,lot fun thing 4 yr old learn dinosaur control ...,positive
4,5,31-Jul-18,Charcoal Fabric,Music,1,5,music,positive


## Variations

In [5]:
# Number of reviews per product variation
df.variation.value_counts()

Black  Dot                      516
Charcoal Fabric                 430
Configuration: Fire TV Stick    350
Black  Plus                     270
Black  Show                     265
Black                           261
Black  Spot                     241
White  Dot                      184
Heather Gray Fabric             157
White  Spot                     109
White                            91
Sandstone Fabric                 90
White  Show                      85
White  Plus                      78
Oak Finish                       14
Walnut Finish                     9
Name: variation, dtype: int64

In [6]:
# Drop the Fire TV Stick rows. Taking an explicit copy makes the result an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[df['variation'] != 'Configuration: Fire TV Stick'].copy()
df['variation'].value_counts()

Black  Dot              516
Charcoal Fabric         430
Black  Plus             270
Black  Show             265
Black                   261
Black  Spot             241
White  Dot              184
Heather Gray Fabric     157
White  Spot             109
White                    91
Sandstone Fabric         90
White  Show              85
White  Plus              78
Oak Finish               14
Walnut Finish             9
Name: variation, dtype: int64

In [7]:
# Show full cell contents (no truncation of long review text). `None` is the
# supported way to say "unlimited"; the old sentinel -1 is deprecated.
pd.set_option('display.max_colwidth', None)
# Cap the number of rows rendered for any displayed frame
pd.set_option('display.max_rows', 20)


  """Entry point for launching an IPython kernel.


In [8]:
# Peek at ten reviews whose variation is exactly 'White'; the fixed seed keeps
# the displayed rows the same across Restart & Run All.
df[df['variation'] == 'White'].sample(10, random_state=42)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length,new_reviews,sentiment
693,5,17-May-18,White,Works perfect! There’s no difference at all between this refurb one and my new ones that I have in my house. I’ll be buying the refurbs from here on out,1,152,work perfect there s difference refurb one new one house ill buying refurb,positive
541,5,16-Jun-18,White,Well I’m a big fan of echoes these ones went I’m my bathrooms. Works just great as expected 👍🏾,1,95,well -PRON- be big fan echo one go -PRON- be bathroom work great expect,positive
456,5,2-Jul-18,White,"Works great, never had any issues.",1,34,work great never issue,positive
675,5,20-May-18,White,Best price we could find and it is so useful to have her around to answer questions. It is so handy to not have to find your phone. I also now have an intercom system.,1,167,good price could find useful around answer question handy find phone also intercom system,positive
441,4,7-Jul-18,White,Love the product. Use to turn on my lights and answer some quick questions.,1,76,love product use turn light answer quick question,positive
517,5,21-Jun-18,White,"I think I now have 5 of these throughout the house. So convenient to set a timer or check the weather or whatever, wherever.",1,124,think 5 throughout house convenient set timer check weather whatever wherever,positive
397,5,19-Jul-18,White,Perfect works great no problems with it being refurbished.,1,58,perfect work great problem refurbish,positive
683,4,19-May-18,White,I had a brand new echo Dot and thought that the refurbished would be just as good. It was quite different from my expectations. Refurbished unit did not even turn on and after a while I realized that the power adapter was faulty. There is no way for me to have Amazon send me a power adapter by itself according to the return options. I love Amazon and buy even my toilet paper through them but this was truly a let down.Update: Amazon contacted me and credited me enough to buy a new power adapter. This is why I shop on Amazon. I can always trust them to deliver on their products.,1,583,brand new echo dot thought refurbish would good quite different expectation refurbish unit even turn realize power adapter faulty way amazon send power adapter accord return option love amazon buy even toilet paper truly let downupdate amazon contacted credit enough buy new power adapter shop amazon always trust deliver product,positive
390,5,23-Jul-18,White,"This was an add on for my echo system, the refurbished product works like new!",1,78,add echo system refurbish product work like new,positive
526,1,20-Jun-18,White,Not good at all!,0,16,good,negative


In [9]:
# MAP EACH VARIATION STRING TO THE NAME OF ITS ECHO MODEL
#
# Rules run top to bottom and a later rule overwrites an earlier one, so the
# broad 'Black' / 'White' match (echo dot) is refined by the Show / Plus / Spot
# rules listed after it.
model_rules = (
    # ECHO 2nd Gen - charcoal fabric, heather gray fabric,
    # sandstone fabric, oak finish, walnut finish
    ('echo', ('Charcoal Fabric ', 'Heather Gray Fabric ', 'Sandstone Fabric ',
              'Oak Finish ', 'Walnut Finish ')),
    # ECHO DOT - black dot, white dot, black, white
    ('echo dot', ('Black  Dot', 'White  Dot', 'Black', 'White')),
    # ECHO SHOW - black show, white show
    ('echo show', ('Black  Show', 'White  Show')),
    # ECHO PLUS - black plus, white plus
    ('echo plus', ('Black  Plus', 'White  Plus')),
    # ECHO SPOT - black spot, white spot
    ('echo spot', ('Black  Spot', 'White  Spot')),
)

# Rows that match no rule keep their original variation text
df['model'] = df['variation']
for model_name, patterns in model_rules:
    matches = df.variation.str.contains(patterns[0])
    for pattern in patterns[1:]:
        matches = matches | df.variation.str.contains(pattern)
    df['model'] = np.where(matches, model_name, df['model'])


In [10]:
# Spot-check the new `model` column on ten rows; the fixed seed keeps the
# displayed rows the same across Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,rating,date,variation,verified_reviews,feedback,review_length,new_reviews,sentiment,model
3026,4,30-Jul-18,White Dot,"We are still testing this one out and so far, we like it. I was very hesitant about putting a voice activated device in our home and this would be our first. Unfortunately, it sometimes gets our WiFi light bulbs mixed up and won't turn them on but we've been able to manage and fix it. I mean, it's convenient, especially for those unable to reach to turn off lights or electronics that you could hook up to the Dot. I do like that it plays music and has reminders, alarms, etc - I'm just still getting used to the fact that The Man is always listening now.",1,557,still test one far like hesitant putting voice activate device home would first unfortunately sometimes get wifi light bulb mixed will not turn -PRON- have able manage fix mean convenient especially unable reach turn light electronic could hook dot like play music reminder alarm etc -PRON- be still get use fact man always listen,positive,echo dot
2616,5,30-Jul-18,White Dot,I love it! I use it for everything. It’s a part of my daily routine,1,67,love use everything part daily routine,positive,echo dot
146,5,30-Jul-18,Charcoal Fabric,I have had Alexa since the beginning. I accidentally spilled water on her and have replaced it with the 2nd generation. I decided I couldn't be without an Echo. Setup was easy and had it working within minutes.,1,213,alexa since begin accidentally spill water replace 2nd generation decide could not without echo setup easy work within minute,positive,echo
75,5,30-Jul-18,Charcoal Fabric,It’s awesome,1,12,awesome,positive,echo
2073,3,5-Jul-18,Black,,1,1,,negative,echo dot
833,5,30-Jul-18,Heather Gray Fabric,"I love the sound quality of this unit, it is a very clear sound.",1,64,love sound quality unit clear sound,positive,echo
1708,4,28-Jul-18,Black Show,I bought this specifically so my wife could look at our Arlo security cameras from the bedroom when I’m at work. The system worked for three days and now it just gives me a error message. Very disappointed due to the fact that they advertise that these systems work together. I gave four stars because it does everything else very well.,1,336,buy specifically wife could look arlo security camera bedroom -PRON- be work system work three day give error message disappoint due fact advertise system work together give four star everything else well,positive,echo show
890,5,29-Jul-18,Charcoal Fabric,Works as advertised,1,19,work advertise,positive,echo
1672,5,28-Jul-18,Black Show,I like the variety of stuff that the echo show offers.,1,54,like variety stuff echo show offer,positive,echo show
417,5,13-Jul-18,Black,Thanks,1,6,thanks,positive,echo dot


# Corpus

In [None]:
# TALLY HOW OFTEN EACH WHITESPACE-SEPARATED TOKEN APPEARS IN THE CLEANED REVIEWS
count_dict_alex = {}

for review_text in df['new_reviews']:
    for token in review_text.split():
        count_dict_alex[token] = count_dict_alex.get(token, 0) + 1

# Print every token with its count, least frequent first
for key, value in sorted(count_dict_alex.items(), key=lambda pair: pair[1]):
    print("%s: %s" % (key, value))

In [None]:
# COLLECT WORDS THAT OCCUR FEWER THAN `low_value` TIMES
low_value = 10
bad_words = [token for token, n_seen in count_dict_alex.items() if n_seen < low_value]

In [None]:
# CREATE A LIST OF LISTS - EACH DOCUMENT IS A STRING BROKEN INTO A LIST OF WORDS
corpus = [doc.split() for doc in df['new_reviews']]

# Membership tests against a set are O(1); scanning the `bad_words` list for
# every token of every review is needlessly quadratic.
bad_word_set = set(bad_words)
clean_list = [
    [word for word in document if word not in bad_word_set]
    for document in corpus
]

In [None]:
# Preview only the first few tokenised documents rather than dumping them all
clean_list[:5]

In [None]:
# USE CLEAN_LIST TO CREATE CORPUS
# Re-join each filtered token list into one space-separated string per document
corpus = [' '.join(tokens) for tokens in clean_list]

# Preview only the first few documents rather than dumping the whole corpus
corpus[:5]

# LDA

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords

In [None]:
# First five tokens of the first document
clean_list[0][0:5]

In [None]:
# BUILD THE TWO LDA INPUTS: A TOKEN DICTIONARY AND A BAG-OF-WORDS CORPUS
corpora_dict = corpora.Dictionary(documents=clean_list)
# One doc2bow result per tokenised document, in document order
corpus = list(map(corpora_dict.doc2bow, clean_list))

In [None]:
# FIT A 3-TOPIC LDA MODEL ON THE BAG-OF-WORDS CORPUS
lda_model = LdaModel(
    corpus=corpus,
    id2word=corpora_dict,
    num_topics=3,
    passes=5,
    per_word_topics=True,
    random_state=100,  # fixed seed
)

# Display the topics (-1 asks for all of them)
lda_model.print_topics(num_topics=-1)

# Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

In [None]:
# Train a Word2Vec model on the tokenised reviews; min_count=0 keeps every token.
model = Word2Vec(clean_list, min_count=0, workers=cpu_count())

# Word vectors live on `model.wv`. Indexing the model object directly
# (`model['sound']`, `model.most_similar`) was deprecated and later removed.
sound_vector = model.wv['sound']

# Tokens whose vectors are closest to 'sound'
model.wv.most_similar('sound')

# Bigram

In [None]:
# Token dictionary and bag-of-words corpus built from the tokenised reviews
dct = corpora.Dictionary(documents=clean_list)
corpus = list(map(dct.doc2bow, clean_list))

# Fit the phrase (bigram) detector on the same token lists
bigram = gensim.models.phrases.Phrases(sentences=clean_list, min_count=3, threshold=10)

# Apply it to the second document and print the result
print(bigram[clean_list[1]])

# Vader

In [None]:
# Module-level VADER analyzer instance.
# NOTE(review): no visible cell reads `analyser`; sentimentScore creates its own
# analyzer internally - confirm whether this object is still needed.
analyser = SentimentIntensityAnalyzer()

In [None]:
def sentimentScore(sentences, verbose=True):
    """Run VADER's ``polarity_scores`` over every item of ``sentences``.

    Parameters
    ----------
    sentences : iterable
        Items to score; each one is passed unchanged to
        ``SentimentIntensityAnalyzer.polarity_scores``.
    verbose : bool, default True
        When True (the original behaviour) each result is printed as it is
        computed. Pass ``verbose=False`` to avoid one printed line per item
        when scoring a large column.

    Returns
    -------
    list
        One ``polarity_scores`` result per input item, in input order.
    """
    analyzer = SentimentIntensityAnalyzer()
    results = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        if verbose:
            print(str(vs))
        results.append(vs)
    return results

In [None]:
# Score every cleaned review; `sentiment` holds one result per row of df
sentiment = sentimentScore(sentences=df['new_reviews'])


In [None]:
# Turn the list of score results into a frame and preview it
sentiment_df = pd.DataFrame(data=sentiment)
sentiment_df.head(5)

In [None]:
# Give df the same index as sentiment_df so the rating copy and the
# column-wise concat below match rows up by position.
df.index = sentiment_df.index
sentiment_df['rating'] = df['rating']

# df already carries 'rating'; leave sentiment_df's copy out of the concat so
# echo_vader ends up with a single 'rating' column instead of two.
echo_vader = pd.concat([df, sentiment_df.drop(columns='rating')], axis=1)
echo_vader.head()

In [None]:
# Positive sentiment: mean `pos` score per variation, drawn as a bar chart
color = ['#63ace5']
fig, ax = plt.subplots(figsize=(9, 6))
echo_vader.groupby("variation").pos.mean().plot.bar(color=color, ax=ax)

plt.title('Positive Sentiment', fontsize=20, weight='bold')

# x-axis: tick labels only, no axis label. Rotation is given as a number;
# matplotlib documents a float or 'vertical'/'horizontal', not a numeric string.
plt.xticks(rotation=90, fontsize=14, weight='bold')
ax.xaxis.label.set_visible(False)

# y-axis
plt.ylabel('Sentiment Rating', fontsize=16, weight='bold')
ax.set_ylim([0, 0.5])
plt.yticks(fontsize=14)

# Keep only the left and bottom spines
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_visible(True)

# Save before showing; the file name is kept exactly as the notebook wrote it
fig.savefig('postive_sentiment.jpg')
plt.show()

In [None]:
# Negative sentiment: mean `neg` score per variation, drawn as a bar chart
color = ['#005b96']
fig, ax = plt.subplots(figsize=(9, 6))
echo_vader.groupby("variation").neg.mean().plot.bar(color=color, ax=ax)

plt.title('Negative Sentiment', fontsize=20, weight='bold')

# x-axis: tick labels only, no axis label. Rotation is given as a number;
# matplotlib documents a float or 'vertical'/'horizontal', not a numeric string.
plt.xticks(rotation=90, fontsize=14, weight='bold')
ax.xaxis.label.set_visible(False)

# y-axis
plt.ylabel('Sentiment Rating', fontsize=16, weight='bold')
ax.set_ylim([0, 0.5])
plt.yticks(fontsize=14)

# Keep only the left and bottom spines
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_visible(True)

# Save before showing
fig.savefig('negative_sentiment.jpg')
plt.show()

In [None]:
# Per-rating counts of non-null values in every column; the 'date' column's
# count is what gets plotted.
group = df.groupby('rating').count()

fig, ax = plt.subplots(figsize=(10, 6))
group['date'].plot.bar(color='#7c86ac', ax=ax)

plt.title('Echo Ratings', fontsize=20, weight='bold')
plt.xlabel('Ratings', fontsize=16, weight='bold')
plt.ylabel('Count', fontsize=16, weight='bold')

# Rotation is given as a number rather than the string '0'
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)

# Keep only the left and bottom spines
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_visible(True)

plt.show()

# TODO: change colour

# Sentiment Features

In [None]:
# Split the reviews by their `sentiment` label
neg_alexa = df.loc[df['sentiment'].eq('negative')]
pos_alexa = df.loc[df['sentiment'].eq('positive')]

## Negative

In [None]:
from sklearn.feature_selection import chi2

# Target: the star rating of each negative-labelled review
y_n = neg_alexa.rating

# Features: tf-idf weights over two-word phrases (bigrams only)
tfidf_n = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf_n = tfidf_n.fit_transform(neg_alexa.new_reviews)

# chi2 returns (statistics, p-values); keep the per-feature statistics
chi2score_n = chi2(X=X_tfidf_n, y=y_n)[0]

In [None]:
# Pair every bigram with its chi2 score. `get_feature_names()` no longer exists
# in recent scikit-learn releases, so prefer `get_feature_names_out()` and fall
# back to the old name on versions that predate it.
feature_names_n = (
    tfidf_n.get_feature_names_out()
    if hasattr(tfidf_n, 'get_feature_names_out')
    else tfidf_n.get_feature_names()
)
scores = list(zip(feature_names_n, chi2score_n))

# Sort ascending by score and keep the ten highest;
# topchi2_n[0] holds the bigrams, topchi2_n[1] their scores.
chi2_n = sorted(scores, key=lambda x: x[1])
topchi2_n = list(zip(*chi2_n[-10:]))
x_n = range(len(topchi2_n[1]))

fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(x_n, topchi2_n[1], align='center', alpha=1, color='salmon')

plt.title('Alexa Negative Feedback', fontsize=24, weight='bold')

# x-axis
plt.xlabel("Feature Score", fontsize=22, weight='bold')
plt.xticks(fontsize=18)

# y-axis: one tick per bar, labelled with its bigram
labels = topchi2_n[0]
plt.yticks(x_n, labels, fontsize=18)

# Keep only the left and bottom spines
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_visible(True)

# Save before showing
fig.savefig('alexa_neg.jpg')
plt.show()

## Positive

In [None]:
# Target: the star rating of each positive-labelled review
y = pos_alexa.rating

# Features: tf-idf weights over two-word phrases (bigrams only)
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf = tfidf.fit_transform(pos_alexa.new_reviews)

# chi2 returns (statistics, p-values); keep the per-feature statistics
chi2score = chi2(X=X_tfidf, y=y)[0]

In [None]:
# Pair every bigram with its chi2 score. `get_feature_names()` no longer exists
# in recent scikit-learn releases, so prefer `get_feature_names_out()` and fall
# back to the old name on versions that predate it.
feature_names_p = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
scores = list(zip(feature_names_p, chi2score))

# Sort ascending by score and keep the ten highest. The sorted list is named
# `chi2_p` (not `chi2`) so the imported sklearn `chi2` function is not
# overwritten and the scoring cell can be re-run.
chi2_p = sorted(scores, key=lambda x: x[1])
topchi2 = list(zip(*chi2_p[-10:]))
x = range(len(topchi2[1]))

fig, ax = plt.subplots(figsize=(16, 9))
ax.barh(x, topchi2[1], alpha=1, color='darkseagreen')

plt.title('Alexa Positive Feedback', fontsize=24, weight='bold')

# x-axis
plt.xlabel("Feature Score", fontsize=22, weight='bold')
plt.xticks(fontsize=18)

# y-axis: one tick per bar, labelled with its bigram
labels = topchi2[0]
plt.yticks(x, labels, fontsize=18)

# Keep only the left and bottom spines
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
for side in ('bottom', 'left'):
    ax.spines[side].set_visible(True)

# Save before showing
fig.savefig('alexa_positive.jpg')
plt.show()